diff --git a/Macros.h b/Macros.h index ac08a14..b0a5c99 100644 --- a/Macros.h +++ b/Macros.h @@ -37,6 +37,12 @@ */ #define HD +/** + * \def DEVICE_FUNC + * Mark a function in front with this if it is only callable on the CUDA device. + */ +#define DEVICE_FUNC + /** * \def SHARED * Mark a variable as static shared memory. diff --git a/Types.h b/Types.h index c165b92..9650943 100644 --- a/Types.h +++ b/Types.h @@ -18,15 +18,15 @@ using real64 = double; /**< Type alias for 64-bit floating point datatype. */ #ifdef CUDACC -using real16 = __half; -using realb16 = __nv_bfloat16; +using real16 = __half; /**< Type alias for 16-bit floating point datatype, when using GPU. + Otherwise, defaults to float. */ +using realb16 = __nv_bfloat16; /**< Type alias for the 16-bit bfloat datatype, when using GPU. + Otherwise, defaults to float. */ #else -using real16 = float; /**< Type alias for 16-bit floating point datatype, when using GPU. Otherwise, - defaults to float. */ -using realb16 = float; /**< Type alias for the 16-bit bfloat datatype, when using GPU. Otherwise, - defaults to float. */ +using real16 = float; +using realb16 = float; #endif // CUDACC @@ -127,17 +127,17 @@ template complex operator/(const real32, const complex); template complex operator/(const real64, const complex); #ifdef CUDACC -using complex64 = complex; -using complex128 = complex; +using complex64 = complex; /**< Type alias for 64-bit complex floating point datatype. + * This adapts depending on the CUDA compilation flag, and + * will automatically switch to std::complex when that flag is not set. */ + +using complex128 = complex; /**< Type alias for 128-bit complex floating point datatype. + * This adapts depending on the CUDA compilation flag, and will + * automatically switch to std::complex when that flag is not set. */ #else -using complex64 = std::complex; /**< Type alias for 64-bit complex floating point datatype. - * This adapts depending on the CUDA compilation flag, and - * will automatically switch CudaTools::complex. 
*/ -using complex128 = - std::complex; /**< Type alias for 128-bit complex floating point datatype. This adapts - * depending on the CUDA compilation flag, and will automatically switch - * CudaTools::complex. */ +using complex64 = std::complex; +using complex128 = std::complex; #endif /** Type alises and lots of metaprogramming definitions, primarily dealing with diff --git a/docs/source/core.rst b/docs/source/core.rst index 47f7aba..d06a78d 100644 --- a/docs/source/core.rst +++ b/docs/source/core.rst @@ -9,10 +9,21 @@ several classes to enable the usage of CUDA streams, kernels, and graphs. Types ===== -.. doxygentypedef:: real32 -.. doxygentypedef:: real64 -.. doxygentypedef:: complex64 -.. doxygentypedef:: complex128 +These numeric types are defined to facilitate the special types used for CUDA, +and it is *necessary* to use them for functions to work properly. It is recommended +to bring them into the global namespace if possible, by writing ``using namespace CudaTools::Types;``. + +.. doxygentypedef:: CudaTools::Types::real32 +.. doxygentypedef:: CudaTools::Types::real64 +.. doxygentypedef:: CudaTools::Types::complex64 +.. doxygentypedef:: CudaTools::Types::complex128 + +These are types provided by the CUDA Math API, which cannot be easily used as computational +types in host code. Take care when transferring these back to host functions, as further +processing may require a type conversion. + +.. doxygentypedef:: CudaTools::Types::real16 +.. doxygentypedef:: CudaTools::Types::realb16 Macro Definitions ================= @@ -25,6 +36,7 @@ Device Indicators Host-Device Automation ---------------------- .. doxygendefine:: HD +.. doxygendefine:: DEVICE_FUNC .. doxygendefine:: SHARED Compilation Options diff --git a/docs/source/usage.rst b/docs/source/usage.rst index b2c38db..6833570 100644 --- a/docs/source/usage.rst +++ b/docs/source/usage.rst @@ -27,7 +27,7 @@ on a device. 
A kernel is a specific function that the host can call to be run on Core Examples ============= -This file mainly introduces compiler macros and a few classes that are used to improve the +This ``Core.h`` file mainly introduces compiler macros and a few classes that are used to improve the syntax between host and device code. To define and call a kernel, there are a few macros provided. For example,