#ifndef MACROS_H
#define MACROS_H

// NOTE(review): the original include targets were lost (bare `#include` with no
// header name). These are reconstructed from what this header actually uses:
// std::ostringstream (<sstream>), std::exception (<exception>),
// uint32_t (<cstdint>) and printf (<cstdio>). TODO: confirm against upstream.
#include <cstdint>
#include <cstdio>
#include <exception>
#include <sstream>

// CUDACC: set only when compiling with nvcc AND the build's CUDA flag is on.
#if defined(CUDA) && defined(__CUDACC__)
#define CUDACC
#endif

// DEVICE: set only during the device-side compilation pass.
#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ > 0)
#define DEVICE
#endif

// Stub definitions so Doxygen documents the macros without needing nvcc.
#ifdef CUDATOOLS_DOXYGEN
/**
 * \def CUDACC
 * This macro is defined when this code is being compiled by nvcc and the CUDA compilation
 * flag is set. This should be used to enclose code where CUDA specific libraries and syntax are
 * being used.
 */
#define CUDACC

/**
 * \def DEVICE
 * This macro is defined when this code is being compiled for the device. The difference between
 * this and CUDACC is that this should exclusively be used to decide if code is being compiled
 * to execute on the device. CUDACC only determines what compiler is being used.
 */
#define DEVICE

/**
 * \def HD
 * Mark a function in front with this if it needs to be callable on both the
 * CPU and CUDA device.
 */
#define HD

/**
 * \def DEVICE_FUNC
 * Mark a function in front with this if it is only callable on the CUDA device.
 */
#define DEVICE_FUNC

/**
 * \def SHARED
 * Mark a variable as static shared memory.
 */
#define SHARED

/**
 * \def CUDATOOLS_USE_EIGEN
 * Compile the CudaTools library with Eigen support.
 */
#define CUDATOOLS_USE_EIGEN

/**
 * \def CUDATOOLS_USE_PYTHON
 * Compile the CudaTools library with Python support.
 */
#define CUDATOOLS_USE_PYTHON

/**
 * \def KERNEL(call, ...)
 * Used to declare a CUDA kernel (expands to a __global__ function under nvcc,
 * a plain host function otherwise).
 * \param call the name of the kernel
 * \param ... the parameters of the kernel
 */
#define KERNEL(call, ...)

/**
 * \def BASIC_LOOP(N)
 * Can be used in conjunction with CudaTools::Kernel::Basic, which is mainly used for embarrassingly
 * parallel situations. Exposes the loop/thread number as iThread.
 * \param N number of iterations
 */
#define BASIC_LOOP(N)

/**
 * \def DEVICE_COPY(name)
 * Can be used inside a class declaration (header) which generates boilerplate code to allow this
 * class to be used on the device.
 *
 * This macro creates a few functions:\n
 * name* that(): returns the pointer to this instance on the device.
 *
 * void allocateDevice(): allocates the memory on the device for this class instance.
 *
 * void freeDevice(): frees the memory on the device for this class instance.
 *
 * CudaTools::StreamID updateHost(const CudaTools::StreamID& stream): updates the host instance
 * of the class.
 *
 * CudaTools::StreamID updateDevice(const CudaTools::StreamID& stream): updates
 * the device instance of the class.
 * \param name the name of the class
 */
#define DEVICE_COPY(name)

/**
 * \def CT_ERROR_IF(a, op, b, msg)
 * Used for throwing runtime errors given a condition with an operator.
 */
#define CT_ERROR_IF(a, op, b, msg)

/**
 * \def CT_ERROR(a, msg)
 * Used for throwing runtime errors given a bool.
 */
#define CT_ERROR(a, msg)

/**
 * \def CUDA_CHECK(call)
 * Gets the error generated by a CUDA function call if there is one.
 * \param call CUDA function to check if there are errors when running.
 */
#define CUDA_CHECK(call)

/**
 * \def CUBLAS_CHECK(call)
 * Gets the error generated by a cuBLAS function call if there is one.
 * \param call cuBLAS function to check if there are errors when running.
 */
#define CUBLAS_CHECK(call)

/**
 * \def CUDA_MEM(call, name)
 * Gets the GPU memory used from function call if there is one.
 * \param call function to measure memory usage.
 * \param name an identifier to use as a variable and when printing. Must satisfy variable naming.
 */
#define CUDA_MEM(call, name)
#endif // CUDATOOLS_DOXYGEN

///////////////////
// KERNEL MACROS //
///////////////////

#ifdef CUDACC
// NOTE(review): these CUDA include targets were also lost in extraction and are
// reconstructed from the API used below (cudaError_t/cudaMemGetInfo,
// cublasStatus_t/cublasGetStatusName). TODO: confirm against upstream.
#include <cublas_v2.h>
#include <cuda.h>
#include <cuda_runtime.h>
#include <cuda_runtime_api.h>
//#include <cusparse.h>

#define DEVICE_FUNC __device__
#define HD __host__ __device__
#define SHARED __shared__

// Declares a __global__ kernel under nvcc.
#define KERNEL(call, ...) __global__ void call(__VA_ARGS__)
#else
// Host-only build: the qualifiers vanish and a "kernel" is a plain function.
#define DEVICE_FUNC
#define HD
#define SHARED

#define KERNEL(call, ...) void call(__VA_ARGS__)
#endif // CUDACC

///////////////////
// DEVICE MACROS //
///////////////////

#ifdef DEVICE
// Device pass: expose the flat thread index as iThread, guarded against the
// grid tail (threads with iThread >= N fall through the if and do nothing).
#define BASIC_LOOP(N)                                                                              \
    uint32_t iThread = blockIdx.x * blockDim.x + threadIdx.x;                                      \
    if (iThread < N)
#else
// Host pass: same loop variable, parallelized with OpenMP when enabled.
#define BASIC_LOOP(N) _Pragma("omp parallel for") for (uint32_t iThread = 0; iThread < N; ++iThread)
#endif

//////////////////
// CLASS MACROS //
//////////////////

// Generates updateHost()/updateDevice() that copy this instance between host
// and device via CudaTools::copy.
// NOTE(review): the 'stream' parameter is accepted but never forwarded to
// CudaTools::copy — confirm whether copy() should receive it.
#define UPDATE_FUNC(name)                                                                          \
    inline CudaTools::StreamID updateHost(const CudaTools::StreamID& stream =                      \
                                              CudaTools::DEF_MEM_STREAM) {                         \
        return CudaTools::copy(that(), this, sizeof(name));                                        \
    };                                                                                             \
    inline CudaTools::StreamID updateDevice(const CudaTools::StreamID& stream =                    \
                                                CudaTools::DEF_MEM_STREAM) {                       \
        return CudaTools::copy(this, that(), sizeof(name));                                        \
    }

#ifdef CUDACC
// CUDA build: maintain a separate device-side instance of the class.
#define DEVICE_COPY(name)                                                                          \
  private:                                                                                         \
    name* __deviceInstance__ = nullptr;                                                            \
                                                                                                   \
  public:                                                                                          \
    inline name* that() { return __deviceInstance__; };                                            \
    inline void allocateDevice() { __deviceInstance__ = (name*)CudaTools::malloc(sizeof(name)); }; \
    inline void freeDevice() { CudaTools::free(__deviceInstance__); };                             \
    UPDATE_FUNC(name)
#else
// Host-only build: "device" instance is just this object; alloc/free are no-ops.
#define DEVICE_COPY(name)                                                                          \
  public:                                                                                          \
    inline name* that() { return this; };                                                          \
    inline void allocateDevice(){};                                                                \
    inline void freeDevice(){};                                                                    \
    UPDATE_FUNC(name)
#endif

#ifndef CUDATOOLS_ARRAY_MAX_AXES
/**
 * \def CUDATOOLS_ARRAY_MAX_AXES
 * The maximum number of axes/dimensions a
 * CudaTools::Array can have. The default is set
 * to 4, but can be manually set to fit the
 * program's needs.
 */
#define CUDATOOLS_ARRAY_MAX_AXES 4
#endif

////////////////////
// Error Checking //
////////////////////

#ifndef NO_DIMENSION_CHECK
#ifdef DEVICE
// Device side: no exceptions available — report and continue.
#define CT_ERROR_IF(a, op, b, msg)                                                                 \
    if (a op b) {                                                                                  \
        printf("\033[1;31m[CudaTools]\033[0m %s:%d\n | %s: (" #a ") " #op " (" #b ").\n",          \
               __FILE__, __LINE__, msg);                                                           \
    }

#define CT_ERROR(a, msg)                                                                           \
    if (a) {                                                                                       \
        printf("\033[1;31m[CudaTools]\033[0m %s:%d\n | %s: " #a ".\n", __FILE__, __LINE__, msg);   \
    }
#else
// Host side: report with the offending values, then throw.
#define CT_ERROR_IF(a, op, b, msg)                                                                 \
    if (a op b) {                                                                                  \
        std::ostringstream os_a;                                                                   \
        std::ostringstream os_b;                                                                   \
        os_a << a;                                                                                 \
        os_b << b;                                                                                 \
        printf("\033[1;31m[CudaTools]\033[0m %s:%d\n | %s: (" #a ")%s " #op " (" #b ")%s.\n",      \
               __FILE__, __LINE__, msg, os_a.str().c_str(), os_b.str().c_str());                   \
        throw std::exception();                                                                    \
    }

#define CT_ERROR(a, msg)                                                                           \
    if (a) {                                                                                       \
        printf("\033[1;31m[CudaTools]\033[0m %s:%d\n | %s: " #a ".\n", __FILE__, __LINE__, msg);   \
        throw std::exception();                                                                    \
    }
#endif
#endif // NO_DIMENSION_CHECK

#if defined(CUDACC) && !defined(NO_CUDA_CHECK)
#define CUDA_CHECK(call)                                                                           \
    do {                                                                                           \
        cudaError_t err = (call);                                                                  \
        if (err != cudaSuccess) {                                                                  \
            printf("\033[1;31m[CUDA]\033[0m %s:%d\n | %s\n", __FILE__, __LINE__,                   \
                   cudaGetErrorString(err));                                                       \
            throw std::exception();                                                                \
        }                                                                                          \
    } while (0)

#define CUBLAS_CHECK(call)                                                                         \
    do {                                                                                           \
        cublasStatus_t err = (call);                                                               \
        if (err != CUBLAS_STATUS_SUCCESS) {                                                        \
            printf("\033[1;31m[cuBLAS]\033[0m %s:%d\n | %s\n", __FILE__, __LINE__,                 \
                   cublasGetStatusName(err));                                                      \
            throw std::exception();                                                                \
        }                                                                                          \
    } while (0)

/*
#define CUSPARSE_CHECK(call)                                                                       \
    do {                                                                                           \
        cusparseStatus_t err = (call);                                                             \
        if (err != CUSPARSE_STATUS_SUCCESS) {                                                      \
            printf("[cuSPARSE] %s:%d\n | %s\n", __FILE__, __LINE__, cusparseGetErrorName(err));    \
            throw std::exception();                                                                \
        }                                                                                          \
    } while (0)
*/

// Measures free-memory delta around `call`. Fixed: the size_t difference was
// printed with %i (undefined behavior); %zu is the correct specifier.
// NOTE(review): if `call` frees more than it allocates, the unsigned
// subtraction underflows — confirm intended usage is allocation-only.
#define CUDA_MEM(call, name)                                                                       \
    size_t free_bef_##name, free_aft_##name;                                                       \
    cudaMemGetInfo(&free_bef_##name, NULL);                                                        \
    call;                                                                                          \
    CudaTools::Manager::get()->sync();                                                             \
    cudaMemGetInfo(&free_aft_##name, NULL);                                                        \
    printf("[%s] GPU Memory Usage: %zuMiB\n", #name,                                               \
           (free_bef_##name - free_aft_##name) / (1024 * 1024));
#else
#define CUDA_CHECK(call) (call)
#define CUBLAS_CHECK(call) (call)
#define CUDA_MEM(call, name) (call)
#endif

#endif // MACROS_H