#pragma once #include #include #include #include #include #include #include #include #include #include #include #include #include "cuda.h" #define cuda_err_chk(ans) { gpuAssert((ans), __FILE__, __LINE__); } inline void gpuAssert(cudaError_t code, const char *file, int line, bool abort=false) { if(code != cudaSuccess) { fprintf(stderr,"GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line); if (abort) exit(1); } } #define PRINT_ERROR \ do { \ fprintf(stderr, "Error at line %d, file %s (%d) [%s]\n", \ __LINE__, __FILE__, errno, strerror(errno)); exit(1); \ } while(0) static std::chrono::time_point now() { return std::chrono::high_resolution_clock::now(); } /*Device function that returns how many SMs are there in the device/arch - it can be more than the maximum readable SMs*/ __device__ __forceinline__ unsigned int getnsmid(){ unsigned int r; asm("mov.u32 %0, %%nsmid;" : "=r"(r)); return r; } __device__ __forceinline__ unsigned int my_lanemask32_lt() { unsigned int lanemask32_lt; asm volatile("mov.u32 %0, %%lanemask_lt;" : "=r"(lanemask32_lt)); return (lanemask32_lt); } /*Device function that returns the current SMID of for the block being run*/ __device__ __forceinline__ unsigned int getsmid(){ unsigned int r; asm("mov.u32 %0, %%smid;" : "=r"(r)); return r; } /*Device function that returns the current warpid of for the block being run*/ __device__ __forceinline__ unsigned int getwarpid(){ unsigned int r; asm("mov.u32 %0, %%warpid;" : "=r"(r)); return r; } __device__ __forceinline__ unsigned int getwarpsz() { unsigned int warpSize; asm volatile("mov.u32 %0, WARP_SZ;" : "=r"(warpSize)); return warpSize; } /*Device function that returns the current laneid of for the warp in the block being run*/ __device__ __forceinline__ unsigned int getlaneid(){ unsigned int r; asm("mov.u32 %0, %%laneid;" : "=r"(r)); return r; }