Added initial cuSparse groundwork, and fp16 (__half) GEMM function.

2 years ago · 0add15db92
parent 31916ed752
commit 0add15db92
11 changed files with 164 additions and 108 deletions
--- a/Array.h
+++ b/Array.h
@ -1,9 +1,9 @@
 #ifndef CUDATOOLS_ARRAY_H
 #define CUDATOOLS_ARRAY_H

-#include "Complex.h"
 #include "Core.h"
 #include "Macros.h"
+#include "Types.h"
 #include <Eigen/Dense>
 #include <cmath>
 #include <complex>
@ -18,10 +18,9 @@
 #define POINTER pHost
 #endif

-namespace CudaTools {
+using namespace CudaTools::Types;

-/** Type alises and lots of metaprogramming definitions, primarily dealing with
- * the different numeric types and overrides. */
+namespace CudaTools {

 template <typename T>
 using EigenMat = Eigen::Matrix<T, Eigen::Dynamic, Eigen::Dynamic, Eigen::ColMajor>;
@ -32,23 +31,6 @@ template <typename T> struct EigenAdaptConst_S { typedef EigenMapMat<T> type; };
 template <typename T> struct EigenAdaptConst_S<const T> { typedef ConstEigenMapMat<T> type; };
 template <typename T> using EigenAdaptConst = typename EigenAdaptConst_S<T>::type;

-template <typename T> struct ComplexUnderlying_S { typedef T type; };
-template <> struct ComplexUnderlying_S<complex64> { typedef float type; };
-template <> struct ComplexUnderlying_S<complex128> { typedef double type; };
-template <typename T> using ComplexUnderlying = typename ComplexUnderlying_S<T>::type;
-
-template <typename T> struct ComplexConversion_S { typedef T type; };
-template <> struct ComplexConversion_S<complex64> { typedef std::complex<float> type; };
-template <> struct ComplexConversion_S<complex128> { typedef std::complex<double> type; };
-template <typename T> using ComplexConversion = typename ComplexConversion_S<T>::type;
-
-template <typename T> inline constexpr bool is_int = std::is_integral<T>::value;
-template <typename T> inline constexpr bool is_float = std::is_floating_point<T>::value;
-template <typename T>
-inline constexpr bool is_complex =
-    std::is_same<T, complex64>::value or std::is_same<T, complex128>::value;
-template <typename T> inline constexpr bool is_num = is_int<T> or is_float<T> or is_complex<T>;
-
 template <typename T> class Array;
 using Slice = std::pair<uint32_t, uint32_t>;

@ -576,7 +558,7 @@ template <typename T> class Array {
     * Sets the values of the entire Array to a constant. This is restricted to numerical types.
     */
    HD void setConstant(const T value) const {
-        static_assert(is_num<T>, "Function only available on numeric types.");
+        static_assert(is_host_num<T>, "Function only available on host-compatible numeric types.");
        for (auto it = begin(); it != end(); ++it) {
            *it = value;
        }
@ -588,7 +570,7 @@ template <typename T> class Array {
     * \brief Host only
     */
    void setRandom(const T min, const T max) const {
-        static_assert(is_num<T>, "Function only available on numeric types.");
+        static_assert(is_host_num<T>, "Function only available on host-compatible numeric types.");
        if constexpr (is_complex<T>) {
            CT_ERROR_IF(max.real(), <, min.real(),
                        "Upper bound of range cannot be larger than lower bound");
@ -623,7 +605,7 @@ template <typename T> class Array {
     * restricted to numerical types.
     */
    HD void setRange(T min, const T step = 1) const {
-        static_assert(is_num<T>, "Function only available on numeric types.");
+        static_assert(is_host_num<T>, "Function only available on host-compatible numeric types.");
        for (auto it = begin(); it != end(); ++it) {
            *it = min;
            min += step;
@ -650,7 +632,7 @@ template <typename T> class Array {
     * \brief Host only
     */
    static Array constant(const Shape& shape, const T value) {
-        static_assert(is_num<T>, "Function only available on numeric types.");
+        static_assert(is_host_num<T>, "Function only available on host-compatible numeric types.");
        Array<T> arr(shape);
        arr.setConstant(value);
        return arr;
@ -662,7 +644,7 @@ template <typename T> class Array {
     * \brief Host only
     */
    static Array random(const Shape& shape, const T min, const T max) {
-        static_assert(is_num<T>, "Function only available on numeric types.");
+        static_assert(is_host_num<T>, "Function only available on host-compatible numeric types.");
        Array<T> arr(shape);
        arr.setRandom(min, max);
        return arr;
@ -673,7 +655,7 @@ template <typename T> class Array {
     * \brief Host only
     */
    static Array range(const T min, const T max, const T step = 1) {
-        static_assert(is_num<T>, "Function only available on numeric types.");
+        static_assert(is_host_num<T>, "Function only available on host-compatible numeric types.");
        CT_ERROR_IF(max, <, min, "Upper bound of range cannot be larger than lower bound");
        Array<T> arr({(uint32_t)((max - min) / step)});
        arr.setRange(min, step);
@ -698,7 +680,7 @@ template <typename T> class Array {
     * \brief Host only
     */
    Array transposed() const {
-        static_assert(is_num<T>, "Function only available on numeric types.");
+        static_assert(is_host_num<T>, "Function only available on host-compatible numeric types.");
        CT_ERROR_IF(shape().axes(), !=, 2, "Tranpose can only occur on two-dimensional arrays");
        Array<T> new_arr({mShape.rows(), mShape.cols()});
        new_arr.eigenMap() = this->eigenMap().transpose().eval();
@ -711,7 +693,7 @@ template <typename T> class Array {
     * \brief Host only
     */
    void transpose() {
-        static_assert(is_num<T>, "Function only available on numeric types.");
+        static_assert(is_host_num<T>, "Function only available on host-compatible numeric types.");
        CT_ERROR_IF(shape().axes(), !=, 2, "Tranpose can only occur on two-dimensional arrays");
        Array<T> new_arr(*this, {mShape.cols(), mShape.rows()});
        new_arr.eigenMap() = this->eigenMap().transpose().eval();
--- a/BLAS.h
+++ b/BLAS.h
@ -2,16 +2,13 @@
 #define CUDATOOLS_BLAS_H

 #include "Array.h"
-#include "Complex.h"
 #include "Core.h"
 #include "Macros.h"
+#include "Types.h"

-#ifdef CUDACC
-#include <cuComplex.h>
-#endif
+using namespace CudaTools::Types;

 namespace CudaTools {
-
 namespace BLAS {

 struct BatchInfo {
@ -19,17 +16,20 @@ struct BatchInfo {
    uint32_t size;
 };

-template <typename T> struct Check {
+struct Check {
+    template <typename T>
    static void isAtLeast2D(const Array<T>& arr, const std::string& name = "Array") {
        CT_ERROR_IF(arr.shape().axes(), <, 2, (name + " needs to be at least 2D").c_str());
    };

+    template <typename T>
    static void isSquare(const Array<T>& arr, const std::string& name = "Array") {
        isAtLeast2D(arr, name);
        CT_ERROR_IF(arr.shape().rows(), !=, arr.shape().cols(), (name + " is not square").c_str())
    };

-    static void isValidMatmul(const Array<T>& A, const Array<T>& B, const Array<T>& C,
+    template <typename T, typename U, typename V>
+    static void isValidMatmul(const Array<T>& A, const Array<U>& B, const Array<V>& C,
                              const std::string& nameA = "A", const std::string& nameB = "B",
                              const std::string nameC = "C") {
        isAtLeast2D(A, nameA);
@ -46,7 +46,7 @@ template <typename T> struct Check {
            ("The shape of " + nameA + nameB + " does not match the shape of " + nameC).c_str());
    };

-    static uint32_t getUpperItems(const Array<T>& arr) {
+    template <typename T> static uint32_t getUpperItems(const Array<T>& arr) {
        uint32_t upperItems = 1;
        for (uint32_t iAxis = 0; iAxis < arr.shape().axes() - 2; ++iAxis) {
            upperItems *= arr.shape().dim(iAxis);
@ -54,7 +54,8 @@ template <typename T> struct Check {
        return upperItems;
    };

-    static void matchUpperShape(const Array<T>& A, const Array<T>& B,
+    template <typename T, typename U>
+    static void matchUpperShape(const Array<T>& A, const Array<U>& B,
                                const std::string& nameA = "A", const std::string& nameB = "B") {
        CT_ERROR_IF(A.shape().axes(), !=, B.shape().axes(),
                    (nameA + " and " + nameB + " shapes do not match for broadcasting").c_str());
@ -67,7 +68,8 @@ template <typename T> struct Check {
        }
    };

-    static BatchInfo isBroadcastable(const Array<T>& A, const Array<T>& B, const Array<T>& C,
+    template <typename T, typename U, typename V>
+    static BatchInfo isBroadcastable(const Array<T>& A, const Array<U>& B, const Array<V>& C,
                                     const std::string& nameA = "A", const std::string& nameB = "B",
                                     const std::string nameC = "C") {
        isValidMatmul(A, B, C, nameA, nameB, nameC);
@ -130,7 +132,7 @@ template <typename T> class Batch {
    Batch(const Array<T>& arr) {
        CT_ERROR(arr.isView(), "Array cannot be a view");
        mShape = Shape({arr.shape().rows(), arr.shape().cols()});
-        mBatchSize = mCount = Check<T>::getUpperItems(arr);
+        mBatchSize = mCount = Check::getUpperItems(arr);

        mBatch = Array<T*>({mBatchSize});

@ -159,7 +161,7 @@ template <typename T> class Batch {
 #endif
        if (mCount == 0) {
            mShape = arr.shape();
-            mBatchSize = mCount = Check<T>::getUpperItems(arr);
+            mBatchSize = mCount = Check::getUpperItems(arr);
        } else {
            CT_ERROR_IF(arr.shape(), !=, mShape, "Cannot add matrix of different shape to batch");
        }
@ -195,15 +197,30 @@ template <typename T> struct CudaComplexConversion_S { typedef T type; };
 #ifdef CUDACC
 template <> struct CudaComplexConversion_S<complex64> { typedef cuComplex type; };
 template <> struct CudaComplexConversion_S<complex128> { typedef cuDoubleComplex type; };
+#else
+
 #endif

 template <typename T> using CudaComplexConversion = typename CudaComplexConversion_S<T>::type;

+template <typename T> struct CublasTypeLetter_S { char letter; };
+template <> struct CublasTypeLetter_S<real32> { char letter = 'S'; };
+template <> struct CublasTypeLetter_S<real64> { char letter = 'D'; };
+template <> struct CublasTypeLetter_S<complex64> { char letter = 'C'; };
+template <> struct CublasTypeLetter_S<complex128> { char letter = 'Z'; };
+#ifdef CUDACC
+template <> struct CublasTypeLetter_S<real16> { char letter = 'H'; };
+#endif
+
+template <typename T> char CublasTypeLetter = CublasTypeLetter_S<T>::letter;
+
 // Shorthands to reduce clutter.

 #define CAST(var) reinterpret_cast<CudaComplexConversion<T>*>(var)
 #define DCAST(var) reinterpret_cast<CudaComplexConversion<T>**>(var)

+#define cublas(T, func) cublas##CublasTypeLetter<T>##func
+
 template <typename T, typename F1, typename F2, typename F3, typename F4, typename... Args>
 constexpr void invoke(F1 f1, F2 f2, F3 f3, F4 f4, Args&&... args) {
    if constexpr (std::is_same<T, real32>::value) {
@ -215,7 +232,26 @@ constexpr void invoke(F1 f1, F2 f2, F3 f3, F4 f4, Args&&... args) {
    } else if constexpr (std::is_same<T, complex128>::value) {
        CUBLAS_CHECK(f4(args...));
    } else {
-        CT_ERROR(true, "BLAS functions are not callable with that type");
+        CT_ERROR(true, "This BLAS function is not callable with that type");
+    }
+}
+
+// If someone can think of a better solution, please tell me.
+template <typename T, typename F1, typename F2, typename F3, typename F4, typename F5,
+          typename... Args>
+constexpr void invoke5(F1 f1, F2 f2, F3 f3, F4 f4, F5 f5, Args&&... args) {
+    if constexpr (std::is_same<T, real32>::value) {
+        CUBLAS_CHECK(f1(args...));
+    } else if constexpr (std::is_same<T, real64>::value) {
+        CUBLAS_CHECK(f2(args...));
+    } else if constexpr (std::is_same<T, complex64>::value) {
+        CUBLAS_CHECK(f3(args...));
+    } else if constexpr (std::is_same<T, complex128>::value) {
+        CUBLAS_CHECK(f4(args...));
+    } else if constexpr (std::is_same<T, real16>::value) {
+        CUBLAS_CHECK(f5(args...));
+    } else {
+        CT_ERROR(true, "This BLAS function is not callable with that type");
    }
 }

@ -227,7 +263,7 @@ template <typename T>
 StreamID GEMV(const T alpha, const Array<T>& A, const Array<T>& x, const T beta, const Array<T>& y,
              const StreamID& stream = DEF_CUBLAS_STREAM) {

-    BatchInfo bi = Check<T>::isBroadcastable(A, x, y, "A", "x", "y");
+    BatchInfo bi = Check::isBroadcastable(A, x, y, "A", "x", "y");
    CT_ERROR_IF(x.shape().cols(), !=, 1, "x must be a column vector");
    CT_ERROR_IF(y.shape().cols(), !=, 1, "x must be a column vector");

@ -241,7 +277,6 @@ StreamID GEMV(const T alpha, const Array<T>& A, const Array<T>& x, const T beta,
                  Manager::get()->cublasHandle(), CUBLAS_OP_N, rows, cols, CAST(&a),
                  CAST(A.dataDevice()), rows, CAST(x.dataDevice()), 1, CAST(&b),
                  CAST(y.dataDevice()), 1);
-
    } else { // Greater than 2, so broadcast.
        invoke<T>(cublasSgemvStridedBatched, cublasDgemvStridedBatched, cublasCgemvStridedBatched,
                  cublasZgemvStridedBatched, Manager::get()->cublasHandle(), CUBLAS_OP_N, rows,
@ -269,11 +304,11 @@ StreamID GEMV(const T alpha, const Array<T>& A, const Array<T>& x, const T beta,
 * Computes the matrix-matrix product: \f$ C = \alpha AB + \beta C \f$. It will automatically
 * broadcast the operation if applicable.
 */
-template <typename T>
-StreamID GEMM(const T alpha, const Array<T>& A, const Array<T>& B, const T beta, const Array<T>& C,
+template <typename T, typename U, typename V>
+StreamID GEMM(const T alpha, const Array<U>& A, const Array<U>& B, const T beta, const Array<V>& C,
              const StreamID& stream = DEF_CUBLAS_STREAM) {

-    BatchInfo bi = Check<T>::isBroadcastable(A, B, C, "A", "B", "C");
+    BatchInfo bi = Check::isBroadcastable(A, B, C, "A", "B", "C");
    // A is m x k, B is k x n.
    uint32_t m = A.shape().rows();
    uint32_t k = A.shape().cols();
@ -282,18 +317,19 @@ StreamID GEMM(const T alpha, const Array<T>& A, const Array<T>& B, const T beta,
    T a = alpha, b = beta;
 #ifdef CUDA
    CUBLAS_CHECK(cublasSetStream(Manager::get()->cublasHandle(), Manager::get()->stream(stream)));
+
    if (bi.size == 1) {
-        invoke<T>(cublasSgemm, cublasDgemm, cublasCgemm, cublasZgemm,
-                  Manager::get()->cublasHandle(), CUBLAS_OP_N, CUBLAS_OP_N, m, n, k, CAST(&a),
-                  CAST(A.dataDevice()), m, CAST(B.dataDevice()), k, CAST(&b), CAST(C.dataDevice()),
-                  m);
+        invoke5<T>(cublasSgemm, cublasDgemm, cublasCgemm, cublasZgemm, cublasHgemm,
+                   Manager::get()->cublasHandle(), CUBLAS_OP_N, CUBLAS_OP_N, m, n, k, CAST(&a),
+                   CAST(A.dataDevice()), m, CAST(B.dataDevice()), k, CAST(&b), CAST(C.dataDevice()),
+                   m);

    } else { // Greater than 2, so broadcast.
-        invoke<T>(cublasSgemmStridedBatched, cublasDgemmStridedBatched, cublasCgemmStridedBatched,
-                  cublasZgemmStridedBatched, Manager::get()->cublasHandle(), CUBLAS_OP_N,
-                  CUBLAS_OP_N, m, n, k, CAST(&a), CAST(A.dataDevice()), m, bi.strideA,
-                  CAST(B.dataDevice()), k, bi.strideB, CAST(&b), CAST(C.dataDevice()), m,
-                  bi.strideC, bi.size);
+        invoke5<T>(cublasSgemmStridedBatched, cublasDgemmStridedBatched, cublasCgemmStridedBatched,
+                   cublasZgemmStridedBatched, cublasHgemmStridedBatched,
+                   Manager::get()->cublasHandle(), CUBLAS_OP_N, CUBLAS_OP_N, m, n, k, CAST(&a),
+                   CAST(A.dataDevice()), m, bi.strideA, CAST(B.dataDevice()), k, bi.strideB,
+                   CAST(&b), CAST(C.dataDevice()), m, bi.strideC, bi.size);
    }

 #else
@ -487,7 +523,7 @@ class PLUBatch : public Batch<T> {
     * Constructor of a PLUBatch from a multi-dimensional array, batched across upper dimensions.
     */
    PLUBatch(const Array<T>& arr) : Batch<T>(arr) {
-        Check<T>::isSquare(arr, "LU Array");
+        Check::isSquare(arr, "LU Array");

        mPivotsBatch = Array<int32_t>({this->mBatchSize * this->mShape.rows()});
        mInfoLU = Array<int32_t>({this->mBatchSize});
--- a/Core.h
+++ b/Core.h
@ -81,6 +81,7 @@ class Manager {
 #ifdef CUDACC
    std::unordered_map<std::string, cudaStream_t> mStreams;
    cublasHandle_t mCublas;
+    cusparseHandle_t mCusparse;
 #endif
  public:
    /**
@ -94,6 +95,7 @@ class Manager {
 #ifdef CUDACC
    cudaStream_t stream(const StreamID& stream) const;
    cublasHandle_t cublasHandle() const;
+    cusparseHandle_t cusparseHandle() const;
 #endif
 };

@ -391,6 +393,7 @@ Manager::Manager(const std::vector<std::string>& names) {
        addStream(name);
    }
    CUBLAS_CHECK(cublasCreate(&mCublas));
+    CUSPARSE_CHECK(cusparseCreate(&mCusparse));
 #endif
 }

@ -400,6 +403,7 @@ Manager::~Manager() {
        CUDA_CHECK(cudaStreamDestroy(it.second));
    }
    CUBLAS_CHECK(cublasDestroy(mCublas));
+    CUSPARSE_CHECK(cusparseDestroy(mCusparse));
 #endif
 }

@ -439,8 +443,10 @@ cudaStream_t Manager::stream(const StreamID& stream) const {
 }

 cublasHandle_t Manager::cublasHandle() const { return mCublas; };
+cusparseHandle_t Manager::cusparseHandle() const { return mCusparse; };

-Manager Manager::mManagerInstance = Manager({"defaultMemory", "defaultCublas", "defaultKernel"});
+Manager Manager::mManagerInstance =
+    Manager({"defaultMemory", "defaultCublas", "defaultCusparse", "defaultKernel"});
 #else
 Manager Manager::mManagerInstance = Manager({""});
 #endif
@ -674,37 +680,6 @@ void GraphManager::joinBranch(const StreamID& orig_stream, const StreamID& branc
    orig_stream.wait(*event);
 }

-#ifdef CUDACC
-const char* cublasGetErrorString(cublasStatus_t error) {
-    switch (error) {
-    case CUBLAS_STATUS_SUCCESS:
-        return "CUBLAS_STATUS_SUCCESS";
-
-    case CUBLAS_STATUS_NOT_INITIALIZED:
-        return "CUBLAS_STATUS_NOT_INITIALIZED";
-
-    case CUBLAS_STATUS_ALLOC_FAILED:
-        return "CUBLAS_STATUS_ALLOC_FAILED";
-
-    case CUBLAS_STATUS_INVALID_VALUE:
-        return "CUBLAS_STATUS_INVALID_VALUE";
-
-    case CUBLAS_STATUS_ARCH_MISMATCH:
-        return "CUBLAS_STATUS_ARCH_MISMATCH";
-
-    case CUBLAS_STATUS_MAPPING_ERROR:
-        return "CUBLAS_STATUS_MAPPING_ERROR";
-
-    case CUBLAS_STATUS_EXECUTION_FAILED:
-        return "CUBLAS_STATUS_EXECUTION_FAILED";
-
-    case CUBLAS_STATUS_INTERNAL_ERROR:
-        return "CUBLAS_STATUS_INTERNAL_ERROR";
-    }
-
-    return "<unknown>";
-}
-#endif
 };     // namespace CudaTools
 #endif // CUDATOOLS_IMPLEMENTATION

--- a/Macros.h
+++ b/Macros.h
@ -9,9 +9,6 @@
 #define CUDACC
 #endif

-using real32 = float;  /**< Type alias for 32-bit floating point datatype. */
-using real64 = double; /**< Type alias for 64-bit floating point datatype. */
-
 #if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ > 0)
 #define DEVICE
 #endif
@ -124,14 +121,19 @@ using real64 = double; /**< Type alias for 64-bit floating point datatype. */
 #ifdef CUDACC

 #include <cublas_v2.h>
+#include <cuda_bf16.h>
+#include <cuda_fp16.h>
 #include <cuda_runtime.h>
+#include <cusparse.h>

+#define DEVICE_FUNC __device__
 #define HD __host__ __device__
 #define SHARED __shared__

 #define KERNEL(call, ...) __global__ void call(__VA_ARGS__)

 #else
+#define DEVICE_FUNC
 #define HD
 #define SHARED

@ -139,8 +141,6 @@ using real64 = double; /**< Type alias for 64-bit floating point datatype. */

 #endif // CUDACC

-//#define KERNEL(call, settings, ...) CudaTools::runKernel(call, settings, __VA_ARGS__)
-
 ///////////////////
 // DEVICE MACROS //
 ///////////////////
@ -252,8 +252,16 @@ using real64 = double; /**< Type alias for 64-bit floating point datatype. */
    do {                                                                                           \
        cublasStatus_t err = (call);                                                               \
        if (err != CUBLAS_STATUS_SUCCESS) {                                                        \
-            printf("[cuBLAS] %s:%d\n | %s\n", __FILE__, __LINE__,                                  \
-                   CudaTools::cublasGetErrorString(err));                                          \
+            printf("[cuBLAS] %s:%d\n | %s\n", __FILE__, __LINE__, cublasGetStatusName(err));       \
+            throw std::exception();                                                                \
+        }                                                                                          \
+    } while (0)
+
+#define CUSPARSE_CHECK(call)                                                                       \
+    do {                                                                                           \
+        cusparseStatus_t err = (call);                                                             \
+        if (err != CUSPARSE_STATUS_SUCCESS) {                                                      \
+            printf("[cuSPARSE] %s:%d\n | %s\n", __FILE__, __LINE__, cusparseGetErrorName(err));    \
            throw std::exception();                                                                \
        }                                                                                          \
    } while (0)
--- a/2
+++ b/2
@ -7,7 +7,7 @@ INCLUDE :=
 LIBS_DIR :=
 LIBS_DIR_GPU := /usr/local/cuda/lib64
 LIBS :=
-LIBS_GPU := cuda cudart cublas
+LIBS_GPU := cuda cudart cublas cusparse

 TARGET = tests
 SRC_DIR = .
--- a/Makefile.template
+++ b/Makefile.template
@ -7,7 +7,7 @@ INCLUDE := <<Put extra include directories here, separated by a space>>
 LIBS_DIR := <<Put library directories here, separated by a space>>
 LIBS_DIR_GPU := /usr/local/cuda/lib64 <<Put extra include GPU library directories here, separated by a space>>
 LIBS := <<Put the names of the libraries here, separated by a space>>
-LIBS_GPU := cuda cudart cublas <<Put extra GPU libraries here, separated by a space>>
+LIBS_GPU := cuda cudart cublas cusparse <<Put extra GPU libraries here, separated by a space>>

 TARGET = <<Put the name of your target here>>
 SRC_DIR = .
--- a/Sparse.h
+++ b/Sparse.h
@ -0,0 +1,10 @@
+#ifndef CUDATOOLS_SPARSE_H
+#define CUDATOOLS_SPARSE_H
+
+#include "Array.h"
+#include "Core.h"
+#include "Macros.h"
+#include "Types.h"
+#endif
+
+#endif
--- a/Complex.h
+++ b/Complex.h
@ -11,6 +11,25 @@

 namespace CudaTools {

+namespace Types {
+
+using real32 = float;  /**< Type alias for 32-bit floating point datatype. */
+using real64 = double; /**< Type alias for 64-bit floating point datatype. */
+
+#ifdef CUDACC
+
+using real16 = __half;
+using realb16 = __nv_bfloat16;
+
+#else
+
+using real16 = float; /**< Type alias for 16-bit floating point datatype, when using GPU. Otherwise,
+                         defaults to float. */
+using realb16 = float; /**< Type alias for the 16-bit bfloat datatype, when using GPU. Otherwise,
+                          defaults to float. */
+
+#endif // CUDACC
+
 template <typename T> class complex {
  private:
    T r = 0;
@ -107,11 +126,10 @@ template complex<real64> operator*<real64>(const real64, const complex<real64>);
 template complex<real32> operator/<real32>(const real32, const complex<real32>);
 template complex<real64> operator/<real64>(const real64, const complex<real64>);

-}; // namespace CudaTools
+#ifdef CUDACC
+using complex64 = complex<real32>;
+using complex128 = complex<real64>;

-#ifdef CUDA
-using complex64 = CudaTools::complex<real32>;
-using complex128 = CudaTools::complex<real64>;
 #else
 using complex64 = std::complex<real32>; /**< Type alias for 64-bit complex floating point datatype.
                                         * This adapts depending on the CUDA compilation flag, and
@ -122,4 +140,27 @@ using complex128 =
                           * CudaTools::complex<real64>. */
 #endif

+/** Type alises and lots of metaprogramming definitions, primarily dealing with
+ * the different numeric types and overrides. */
+
+template <typename T> struct ComplexUnderlying_S { typedef T type; };
+template <> struct ComplexUnderlying_S<complex64> { typedef float type; };
+template <> struct ComplexUnderlying_S<complex128> { typedef double type; };
+template <typename T> using ComplexUnderlying = typename ComplexUnderlying_S<T>::type;
+
+template <typename T> struct ComplexConversion_S { typedef T type; };
+template <> struct ComplexConversion_S<complex64> { typedef std::complex<float> type; };
+template <> struct ComplexConversion_S<complex128> { typedef std::complex<double> type; };
+template <typename T> using ComplexConversion = typename ComplexConversion_S<T>::type;
+
+template <typename T> inline constexpr bool is_int = std::is_integral<T>::value;
+template <typename T> inline constexpr bool is_float = std::is_floating_point<T>::value;
+template <typename T>
+inline constexpr bool is_complex =
+    std::is_same<T, complex64>::value or std::is_same<T, complex128>::value;
+template <typename T> inline constexpr bool is_host_num = is_int<T> or is_float<T> or is_complex<T>;
+
+}; // namespace Types
+}; // namespace CudaTools
+
 #endif
--- a/docs/source/usage.rst
+++ b/docs/source/usage.rst
@ -21,7 +21,7 @@ for your own project, after following a few rules.

 The usage of this libary will be illustrated through examples, and further details
 can be found in the other sections. The examples are given in the `samples <https://git.acem.ece.illinois.edu/kjao/CudaTools/src/branch/main/samples>`__ folder.
-Throughout this documentation, there are a few common terms that may appear. First,we refer to the CPU as the host, and the GPU as the device. So, a host function refers
+Throughout this documentation, there are a few common terms that may appear. First, we refer to the CPU as the host, and the GPU as the device. So, a host function refers
 to a function runnable on the CPU, and a device function refers to a function that is runnable
 on a device. A kernel is a specific function that the host can call to be run on the device.

@ -42,17 +42,17 @@ macros provided. For example,
        return 0;
    }

-The ``DEFINE_KERNEL(name, ...)`` macro takes in the function name and its arguments.
+The ``KERNEL(name, ...)`` macro takes in the function name and its arguments.
 The second argument in the ``KERNEL()`` macro is are the launch parameters for
 kernel. The launch parameters have several items, but for 'embarassingly parallel'
-cases, we can simply generate the settings with the number of threads. More detail with
+cases, we can simply generate the settings with the number of threads using ``CudaTools::Kernel::basic``. More detail with
 creating launch parameters can be found :ref:`here <CudaTools::Kernel::Settings>`. In the above example,
 there is only one thread. The rest of the arguments are just the kernel arguments. For more detail,
 see :ref:`here <Macro Functions>`.

 .. warning::
   These kernel definitions must be in a file that will be compiled by ``nvcc``. Also,
-   for header files, there is an additional macro ``DECLARE_KERNEL(name, ...)`` to declare it
+   for header files, there is an additional macro ``KERNEL(name, ...)`` to declare it
   and make it available to other files.

 Since many applications used classes, a macro is provided to 'convert' a class into
@ -192,7 +192,8 @@ situations and with the ``CudaTools::Kernel::basic()`` launch parameters. If com
 mark the loop with ``#pragma parallel for`` and attempt to use OpenMP for parallelism.

 .. warning::
-   Notice that a view must be passed to the kernel, and not the original object. This
+   Notice that a view must be passed to the kernel, and not the original object, otherwise a copy
+   would be made.

 The Array also supports other helpful functions, such as multi-dimensional indexing, slicing, and
 a few other functions.
--- a/samples/5_SimpleGraph/main.cu.cpp
+++ b/samples/5_SimpleGraph/main.cu.cpp
@ -90,6 +90,7 @@ int main() {
    CudaTools::Array<uint32_t> A = CudaTools::Array<uint32_t>::constant({100}, 50);
    CudaTools::Array<uint32_t> B = CudaTools::Array<uint32_t>::constant({100}, 0);

+    // Executes process without graph.
    TIME(doFunc(A.view(), B.view()), ExecuteNoGraph);

    std::cout << A.slice({{0, 10}}) << "\n";
@ -97,6 +98,7 @@ int main() {
    A.setConstant(50);
    B.setConstant(0);

+    // Executes process with graph.
    CudaTools::GraphManager gm;
    CudaTools::Graph graph("graphStream", myGraph, &gm, A.view(), B.view());
    TIME(graph.execute().wait(), ExecuteGraph);
--- a/tests.cu.cpp
+++ b/tests.cu.cpp
@ -2,13 +2,14 @@
 #define CUDATOOLS_ARRAY_MAX_AXES 8
 #include "Array.h"
 #include "BLAS.h"
-#include "Complex.h"
 #include "Core.h"
+#include "Types.h"

 #include <Eigen/Core>
 #include <chrono>
 #include <complex>

+using namespace CudaTools::Types;
 namespace CT = CudaTools;

 /////////////