A library and framework for developing CPU-CUDA compatible applications under one unified code.
You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
 
 
 

311 lines
12 KiB

#ifndef MACROS_H
#define MACROS_H
#include <cstdint> // uint32_t, used by BASIC_LOOP
#include <cstdio>  // printf, used by the error-checking and CUDA_MEM macros
#include <exception>
#include <sstream>
#include <stdarg.h>
// CUDACC: defined only when compiling with nvcc (__CUDACC__) AND the build's
// CUDA flag is set. Guard CUDA-specific headers and syntax with this.
#if defined(CUDA) && defined(__CUDACC__)
#define CUDACC
#endif
// DEVICE: defined only during the device-side compilation pass
// (__CUDA_ARCH__ > 0). Unlike CUDACC, which says what compiler is running,
// DEVICE says the code will actually execute on the GPU.
#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ > 0)
#define DEVICE
#endif
// Documentation-only stubs: this whole section is seen exclusively by Doxygen
// (CUDATOOLS_DOXYGEN is never defined during a real build). The signatures
// here must mirror the real definitions below.
#ifdef CUDATOOLS_DOXYGEN
/**
 * \def CUDACC
 * This macro is defined when this code is being compiled by nvcc and the CUDA compilation
 * flag is set. This should be used to enclose code where CUDA specific libraries and syntax are
 * being used.
 */
#define CUDACC
/**
 * \def DEVICE
 * This macro is defined when this code is being compiled for the device. The difference between
 * this and CUDACC is that this should exclusively be used to decide if code is being compiled
 * to execute on the device. CUDACC only determines what compiler is being used.
 */
#define DEVICE
/**
 * \def HD
 * Mark a function in front with this if it needs to be callable on both the
 * CPU and CUDA device.
 */
#define HD
/**
 * \def DEVICE_FUNC
 * Mark a function in front with this if it is only callable on the CUDA device.
 */
#define DEVICE_FUNC
/**
 * \def SHARED
 * Mark a variable as static shared memory.
 */
#define SHARED
/**
 * \def CUDATOOLS_USE_EIGEN
 * Compile the CudaTools library with Eigen support.
 */
#define CUDATOOLS_USE_EIGEN
/**
 * \def CUDATOOLS_USE_PYTHON
 * Compile the CudaTools library with Python support.
 */
#define CUDATOOLS_USE_PYTHON
/**
 * \def KERNEL(call, ...)
 * Used to declare a CUDA kernel. On CPU-only builds the kernel compiles to a
 * plain host function with the same name and parameters.
 * \param call the name of the kernel
 * \param ... the parameters of the kernel
 */
#define KERNEL(call, ...)
/**
 * \def BASIC_LOOP(N)
 * Can be used in conjunction with CudaTools::Kernel::Basic, which is mainly used for embarrassingly
 * parallel situations. Exposes the loop/thread number as iThread.
 * \param N number of iterations
 */
#define BASIC_LOOP(N)
/**
 * \def DEVICE_COPY(name)
 * Can be used inside a class declaration (header) which generates boilerplate code to allow this
 * class to be used on the device.
 *
 * This macro creates a few functions:\n
 * name* that(): returns the pointer to this instance on the device.
 *
 * void allocateDevice(): allocates the memory on the device for this class instance.
 *
 * void freeDevice(): frees the memory on the device for this class instance.
 *
 * CudaTools::StreamID updateHost(const CudaTools::StreamID& stream): updates the host instance
 * of the class.
 *
 * CudaTools::StreamID updateDevice(const CudaTools::StreamID& stream): updates
 * the device instance of the class.
 * \param name the name of the class
 */
#define DEVICE_COPY(name)
/**
 * \def CT_ERROR_IF(a, op, b, msg)
 * Used for throwing runtime errors given a condition with an operator.
 */
#define CT_ERROR_IF(a, op, b, msg)
/**
 * \def CT_ERROR(a, msg)
 * Used for throwing runtime errors given a bool.
 */
#define CT_ERROR(a, msg)
/**
 * \def CUDA_CHECK(call)
 * Gets the error generated by a CUDA function call if there is one.
 * \param call CUDA function to check if there are errors when running.
 */
#define CUDA_CHECK(call)
/**
 * \def CUBLAS_CHECK(call)
 * Gets the error generated by a cuBLAS function call if there is one.
 * \param call cuBLAS function to check if there are errors when running.
 */
#define CUBLAS_CHECK(call)
/**
 * \def CUDA_MEM(call, name)
 * Gets the GPU memory used from function call if there is one.
 * \param call function to measure memory usage.
 * \param name an identifier to use as a variable and when printing. Must satisfy variable naming.
 */
#define CUDA_MEM(call, name)
#endif
///////////////////
// KERNEL MACROS //
///////////////////
#ifdef CUDACC
// nvcc + CUDA enabled: pull in the CUDA runtime/library headers and map the
// portability macros onto their CUDA qualifiers.
#include <cublas_v2.h>
#include <cuda_bf16.h>
#include <cuda_fp16.h>
#include <cuda_runtime.h>
//#include <cusparse.h>
#define DEVICE_FUNC __device__
#define HD __host__ __device__
#define SHARED __shared__
// Declares a kernel: __global__ under CUDA, a plain host function otherwise.
#define KERNEL(call, ...) __global__ void call(__VA_ARGS__)
#else
// CPU-only build: all qualifiers compile away to nothing, so the same source
// builds as ordinary host code.
#define DEVICE_FUNC
#define HD
#define SHARED
#define KERNEL(call, ...) void call(__VA_ARGS__)
#endif // CUDACC
///////////////////
// DEVICE MACROS //
///////////////////
#ifdef DEVICE
// Device: expose the flat global thread index as iThread and guard the grid
// tail (the grid rarely divides N evenly). NOTE(review): this declares
// iThread, so at most one BASIC_LOOP may appear per scope.
#define BASIC_LOOP(N) \
uint32_t iThread = blockIdx.x * blockDim.x + threadIdx.x; \
if (iThread < N)
#else
// Host: the same body runs as an OpenMP-parallel loop over iThread
// (the _Pragma is a no-op when OpenMP is not enabled).
#define BASIC_LOOP(N) _Pragma("omp parallel for") for (uint32_t iThread = 0; iThread < N; ++iThread)
#endif
//////////////////
// CLASS MACROS //
//////////////////
// UPDATE_FUNC(name): injects updateHost()/updateDevice() members that copy
// sizeof(name) bytes between the device-side instance (that()) and this host
// instance via CudaTools::copy; both return a CudaTools::StreamID.
// NOTE(review): the `stream` parameter is accepted but not forwarded to
// CudaTools::copy — confirm whether the copy is meant to use that stream.
#define UPDATE_FUNC(name) \
inline CudaTools::StreamID updateHost(const CudaTools::StreamID& stream = \
CudaTools::DEF_MEM_STREAM) { \
return CudaTools::copy(that(), this, sizeof(name)); \
}; \
inline CudaTools::StreamID updateDevice(const CudaTools::StreamID& stream = \
CudaTools::DEF_MEM_STREAM) { \
return CudaTools::copy(this, that(), sizeof(name)); \
}
#ifdef CUDACC
// CUDA build: DEVICE_COPY(name) gives the class a mirrored instance on the
// device. that() returns the device pointer; allocateDevice()/freeDevice()
// manage its memory through CudaTools::malloc/CudaTools::free.
#define DEVICE_COPY(name) \
private: \
name* __deviceInstance__ = nullptr; \
\
public: \
inline name* that() { return __deviceInstance__; }; \
inline void allocateDevice() { __deviceInstance__ = (name*)CudaTools::malloc(sizeof(name)); }; \
inline void freeDevice() { CudaTools::free(__deviceInstance__); }; \
UPDATE_FUNC(name)
#else
// CPU-only build: the "device" instance is this object itself, so that()
// returns this and the allocate/free calls are no-ops.
#define DEVICE_COPY(name) \
public: \
inline name* that() { return this; }; \
inline void allocateDevice(){}; \
inline void freeDevice(){}; \
UPDATE_FUNC(name)
#endif
#ifndef CUDATOOLS_ARRAY_MAX_AXES
/**
 * \def CUDATOOLS_ARRAY_MAX_AXES
 * The maximum number of axes/dimensions a
 * CudaTools::Array can have. The default is set
 * to 4, but can be manually set to fit the
 * program's needs.
 */
#define CUDATOOLS_ARRAY_MAX_AXES 4
#endif
////////////////////
// Error Checking //
////////////////////
// CT_ERROR_IF(a, op, b, msg) fails when `a op b` holds; CT_ERROR(a, msg) fails
// when `a` is true. Host builds print the failure (with both operand values for
// CT_ERROR_IF) and throw std::exception; device builds cannot throw, so they
// only print. All variants are wrapped in do { ... } while (0) so each macro
// behaves as a single statement (no dangling-else hazard). Defining
// NO_DIMENSION_CHECK compiles the checks out as no-ops instead of leaving the
// macros undefined, which would otherwise break every use site.
#ifndef NO_DIMENSION_CHECK
#ifdef DEVICE
#define CT_ERROR_IF(a, op, b, msg)                                                                 \
    do {                                                                                           \
        if (a op b) {                                                                              \
            printf("\033[1;31m[CudaTools]\033[0m %s:%d\n | %s: (" #a ") " #op " (" #b ").\n",      \
                   __FILE__, __LINE__, msg);                                                       \
        }                                                                                          \
    } while (0)
#define CT_ERROR(a, msg)                                                                           \
    do {                                                                                           \
        if (a) {                                                                                   \
            printf("\033[1;31m[CudaTools]\033[0m %s:%d\n | %s: " #a ".\n", __FILE__, __LINE__,     \
                   msg);                                                                           \
        }                                                                                          \
    } while (0)
#else
#define CT_ERROR_IF(a, op, b, msg)                                                                 \
    do {                                                                                           \
        if (a op b) {                                                                              \
            /* Stringify the runtime values so the message shows both operands. */                 \
            std::ostringstream os_a;                                                               \
            std::ostringstream os_b;                                                               \
            os_a << a;                                                                             \
            os_b << b;                                                                             \
            printf("\033[1;31m[CudaTools]\033[0m %s:%d\n | %s: (" #a ")%s " #op " (" #b ")%s.\n",  \
                   __FILE__, __LINE__, msg, os_a.str().c_str(), os_b.str().c_str());               \
            throw std::exception();                                                                \
        }                                                                                          \
    } while (0)
#define CT_ERROR(a, msg)                                                                           \
    do {                                                                                           \
        if (a) {                                                                                   \
            printf("\033[1;31m[CudaTools]\033[0m %s:%d\n | %s: " #a ".\n", __FILE__, __LINE__,     \
                   msg);                                                                           \
            throw std::exception();                                                                \
        }                                                                                          \
    } while (0)
#endif
#else // NO_DIMENSION_CHECK: checks compile to nothing.
#define CT_ERROR_IF(a, op, b, msg)
#define CT_ERROR(a, msg)
#endif // NO_DIMENSION_CHECK
#if defined(CUDACC) && !defined(NO_CUDA_CHECK)
// CUDA_CHECK(call): evaluate a CUDA runtime call; on failure print file/line
// plus the runtime's error string and throw. Required because CUDA errors are
// sticky and otherwise surface mysteriously at a later call.
#define CUDA_CHECK(call)                                                                           \
    do {                                                                                           \
        cudaError_t err = (call);                                                                  \
        if (err != cudaSuccess) {                                                                  \
            printf("\033[1;31m[CUDA]\033[0m %s:%d\n | %s\n", __FILE__, __LINE__,                   \
                   cudaGetErrorString(err));                                                       \
            throw std::exception();                                                                \
        }                                                                                          \
    } while (0)
// CUBLAS_CHECK(call): same pattern for cuBLAS status codes.
#define CUBLAS_CHECK(call)                                                                         \
    do {                                                                                           \
        cublasStatus_t err = (call);                                                               \
        if (err != CUBLAS_STATUS_SUCCESS) {                                                        \
            printf("\033[1;31m[cuBLAS]\033[0m %s:%d\n | %s\n", __FILE__, __LINE__,                 \
                   cublasGetStatusName(err));                                                      \
            throw std::exception();                                                                \
        }                                                                                          \
    } while (0)
/*
#define CUSPARSE_CHECK(call) \
do { \
cusparseStatus_t err = (call); \
if (err != CUSPARSE_STATUS_SUCCESS) { \
printf("[cuSPARSE] %s:%d\n | %s\n", __FILE__, __LINE__, cusparseGetErrorName(err)); \
throw std::exception(); \
} \
} while (0)
*/
// CUDA_MEM(call, name): run `call`, sync, and report how much device memory it
// consumed. The difference is computed in signed long long so a call that
// *frees* memory prints a negative number instead of a huge unsigned one, and
// %lld matches the argument type (the previous "%i" mismatched the
// size_t-derived value — undefined behavior). The helper variables are scoped
// inside do/while so the macro acts as one statement.
#define CUDA_MEM(call, name)                                                                       \
    do {                                                                                           \
        size_t free_bef_##name, free_aft_##name;                                                   \
        cudaMemGetInfo(&free_bef_##name, NULL);                                                    \
        call;                                                                                      \
        CudaTools::Manager::get()->sync();                                                         \
        cudaMemGetInfo(&free_aft_##name, NULL);                                                    \
        printf("[%s] GPU Memory Usage: %lldMiB\n", #name,                                          \
               ((long long)free_bef_##name - (long long)free_aft_##name) / (1024 * 1024));         \
    } while (0)
#else
// CUDA disabled (or NO_CUDA_CHECK): pass the call through unchecked.
#define CUDA_CHECK(call) (call)
#define CUBLAS_CHECK(call) (call)
#define CUDA_MEM(call, name) (call)
#endif
#endif // MACROS_H