diff --git a/Array.h b/Array.h
index 4a98369..efb8eee 100644
--- a/Array.h
+++ b/Array.h
@@ -1,9 +1,9 @@
 #ifndef CUDATOOLS_ARRAY_H
 #define CUDATOOLS_ARRAY_H

-#include "Complex.h"
 #include "Core.h"
 #include "Macros.h"
+#include "Types.h"

 #include
 #include
 #include
@@ -18,10 +18,9 @@
 #define POINTER pHost
 #endif

-namespace CudaTools {
+using namespace CudaTools::Types;

-/** Type alises and lots of metaprogramming definitions, primarily dealing with
- * the different numeric types and overrides. */
+namespace CudaTools {

 template using EigenMat = Eigen::Matrix;
@@ -32,23 +31,6 @@ template struct EigenAdaptConst_S { typedef EigenMapMat type; };
 template struct EigenAdaptConst_S { typedef ConstEigenMapMat type; };
 template using EigenAdaptConst = typename EigenAdaptConst_S::type;

-template struct ComplexUnderlying_S { typedef T type; };
-template <> struct ComplexUnderlying_S { typedef float type; };
-template <> struct ComplexUnderlying_S { typedef double type; };
-template using ComplexUnderlying = typename ComplexUnderlying_S::type;
-
-template struct ComplexConversion_S { typedef T type; };
-template <> struct ComplexConversion_S { typedef std::complex type; };
-template <> struct ComplexConversion_S { typedef std::complex type; };
-template using ComplexConversion = typename ComplexConversion_S::type;
-
-template inline constexpr bool is_int = std::is_integral::value;
-template inline constexpr bool is_float = std::is_floating_point::value;
-template
-inline constexpr bool is_complex =
-    std::is_same::value or std::is_same::value;
-template inline constexpr bool is_num = is_int or is_float or is_complex;
-
 template class Array;

 using Slice = std::pair;
@@ -576,7 +558,7 @@ template class Array {
      * Sets the values of the entire Array to a constant. This is restricted to numerical types.
      */
     HD void setConstant(const T value) const {
-        static_assert(is_num, "Function only available on numeric types.");
+        static_assert(is_host_num, "Function only available on host-compatible numeric types.");
         for (auto it = begin(); it != end(); ++it) {
             *it = value;
         }
@@ -588,7 +570,7 @@ template class Array {
      * \brief Host only
      */
     void setRandom(const T min, const T max) const {
-        static_assert(is_num, "Function only available on numeric types.");
+        static_assert(is_host_num, "Function only available on host-compatible numeric types.");
         if constexpr (is_complex) {
             CT_ERROR_IF(max.real(), <, min.real(),
                         "Upper bound of range cannot be smaller than lower bound");
@@ -623,7 +605,7 @@ template class Array {
      * restricted to numerical types.
      */
     HD void setRange(T min, const T step = 1) const {
-        static_assert(is_num, "Function only available on numeric types.");
+        static_assert(is_host_num, "Function only available on host-compatible numeric types.");
         for (auto it = begin(); it != end(); ++it) {
             *it = min;
             min += step;
         }
@@ -650,7 +632,7 @@ template class Array {
      * \brief Host only
      */
     static Array constant(const Shape& shape, const T value) {
-        static_assert(is_num, "Function only available on numeric types.");
+        static_assert(is_host_num, "Function only available on host-compatible numeric types.");
         Array arr(shape);
         arr.setConstant(value);
         return arr;
     }
@@ -662,7 +644,7 @@ template class Array {
      * \brief Host only
      */
     static Array random(const Shape& shape, const T min, const T max) {
-        static_assert(is_num, "Function only available on numeric types.");
+        static_assert(is_host_num, "Function only available on host-compatible numeric types.");
         Array arr(shape);
         arr.setRandom(min, max);
         return arr;
     }
@@ -673,7 +655,7 @@ template class Array {
      * \brief Host only
      */
     static Array range(const T min, const T max, const T step = 1) {
-        static_assert(is_num, "Function only available on numeric types.");
+        static_assert(is_host_num, "Function only available on host-compatible numeric types.");
         CT_ERROR_IF(max, <, min, "Upper bound of range cannot be smaller than lower bound");
         Array arr({(uint32_t)((max - min) / step)});
         arr.setRange(min, step);
         return arr;
     }
@@ -698,7 +680,7 @@ template class Array {
      * \brief Host only
      */
     Array transposed() const {
-        static_assert(is_num, "Function only available on numeric types.");
+        static_assert(is_host_num, "Function only available on host-compatible numeric types.");
         CT_ERROR_IF(shape().axes(), !=, 2, "Transpose can only occur on two-dimensional arrays");
         Array new_arr({mShape.rows(), mShape.cols()});
         new_arr.eigenMap() = this->eigenMap().transpose().eval();
         return new_arr;
     }
@@ -711,7 +693,7 @@ template class Array {
      * \brief Host only
      */
     void transpose() {
-        static_assert(is_num, "Function only available on numeric types.");
+        static_assert(is_host_num, "Function only available on host-compatible numeric types.");
         CT_ERROR_IF(shape().axes(), !=, 2, "Transpose can only occur on two-dimensional arrays");
         Array new_arr(*this, {mShape.cols(), mShape.rows()});
         new_arr.eigenMap() = this->eigenMap().transpose().eval();
diff --git a/BLAS.h b/BLAS.h
index b270fb2..8369eb1 100644
--- a/BLAS.h
+++ b/BLAS.h
@@ -2,16 +2,13 @@
 #define CUDATOOLS_BLAS_H

 #include "Array.h"
-#include "Complex.h"
 #include "Core.h"
 #include "Macros.h"
+#include "Types.h"

-#ifdef CUDACC
-#include
-#endif
+using namespace CudaTools::Types;

 namespace CudaTools {
-
 namespace BLAS {

 struct BatchInfo {
     uint32_t size;
 };

-template struct Check {
+struct Check {
+    template
     static void isAtLeast2D(const Array& arr, const std::string& name = "Array") {
         CT_ERROR_IF(arr.shape().axes(), <, 2, (name + " needs to be at least 2D").c_str());
     };

+    template
     static void isSquare(const Array& arr, const std::string& name = "Array") {
         isAtLeast2D(arr, name);
         CT_ERROR_IF(arr.shape().rows(), !=, arr.shape().cols(), (name + " is not square").c_str())
     };

-    static void isValidMatmul(const Array& A, const Array& B, const Array& C,
+    template
+    static void isValidMatmul(const Array& A, const Array& B, const Array& C,
                               const std::string& nameA = "A", const std::string& nameB = "B",
                               const std::string nameC = "C") {
         isAtLeast2D(A, nameA);
@@ -46,7 +46,7 @@ template struct Check {
             ("The shape of " + nameA + nameB + " does not match the shape of " + nameC).c_str());
     };

-    static uint32_t getUpperItems(const Array& arr) {
+    template static uint32_t getUpperItems(const Array& arr) {
         uint32_t upperItems = 1;
         for (uint32_t iAxis = 0; iAxis < arr.shape().axes() - 2; ++iAxis) {
             upperItems *= arr.shape().dim(iAxis);
@@ -54,7 +54,8 @@ template struct Check {
         return upperItems;
     };

-    static void matchUpperShape(const Array& A, const Array& B,
+    template
+    static void matchUpperShape(const Array& A, const Array& B,
                                 const std::string& nameA = "A", const std::string& nameB = "B") {
         CT_ERROR_IF(A.shape().axes(), !=, B.shape().axes(),
                     (nameA + " and " + nameB + " shapes do not match for broadcasting").c_str());
@@ -67,7 +68,8 @@ template struct Check {
         }
     };

-    static BatchInfo isBroadcastable(const Array& A, const Array& B, const Array& C,
+    template
+    static BatchInfo isBroadcastable(const Array& A, const Array& B, const Array& C,
                                      const std::string& nameA = "A", const std::string& nameB = "B",
                                      const std::string nameC = "C") {
         isValidMatmul(A, B, C, nameA, nameB, nameC);
@@ -130,7 +132,7 @@ template class Batch {
     Batch(const Array& arr) {
         CT_ERROR(arr.isView(), "Array cannot be a view");
         mShape = Shape({arr.shape().rows(), arr.shape().cols()});
-        mBatchSize = mCount = Check::getUpperItems(arr);
+        mBatchSize = mCount = Check::getUpperItems(arr);

         mBatch = Array({mBatchSize});
@@ -159,7 +161,7 @@ template class Batch {
 #endif
         if (mCount == 0) {
             mShape = arr.shape();
-            mBatchSize = mCount = Check::getUpperItems(arr);
+            mBatchSize = mCount = Check::getUpperItems(arr);
         } else {
             CT_ERROR_IF(arr.shape(), !=, mShape, "Cannot add matrix of different shape to batch");
         }
@@ -195,15 +197,30 @@ template struct CudaComplexConversion_S { typedef T type; };
 #ifdef CUDACC
 template <> struct CudaComplexConversion_S { typedef cuComplex type; };
 template <> struct CudaComplexConversion_S { typedef cuDoubleComplex type; };
+#else
+
 #endif

 template using CudaComplexConversion = typename CudaComplexConversion_S::type;

+template struct CublasTypeLetter_S { char letter; };
+template <> struct CublasTypeLetter_S { char letter = 'S'; };
+template <> struct CublasTypeLetter_S { char letter = 'D'; };
+template <> struct CublasTypeLetter_S { char letter = 'C'; };
+template <> struct CublasTypeLetter_S { char letter = 'Z'; };
+#ifdef CUDACC
+template <> struct CublasTypeLetter_S { char letter = 'H'; };
+#endif
+
+template char CublasTypeLetter = CublasTypeLetter_S::letter;
+
 // Shorthands to reduce clutter.
 #define CAST(var) reinterpret_cast*>(var)
 #define DCAST(var) reinterpret_cast**>(var)

+#define cublas(T, func) cublas##CublasTypeLetter##func
+
 template
 constexpr void invoke(F1 f1, F2 f2, F3 f3, F4 f4, Args&&... args) {
     if constexpr (std::is_same::value) {
@@ -215,7 +232,26 @@ constexpr void invoke(F1 f1, F2 f2, F3 f3, F4 f4, Args&&... args) {
     } else if constexpr (std::is_same::value) {
         CUBLAS_CHECK(f4(args...));
     } else {
-        CT_ERROR(true, "BLAS functions are not callable with that type");
+        CT_ERROR(true, "This BLAS function is not callable with that type");
+    }
+}
+
+// If someone can think of a better solution, please tell me.
+template
+constexpr void invoke5(F1 f1, F2 f2, F3 f3, F4 f4, F5 f5, Args&&... args) {
+    if constexpr (std::is_same::value) {
+        CUBLAS_CHECK(f1(args...));
+    } else if constexpr (std::is_same::value) {
+        CUBLAS_CHECK(f2(args...));
+    } else if constexpr (std::is_same::value) {
+        CUBLAS_CHECK(f3(args...));
+    } else if constexpr (std::is_same::value) {
+        CUBLAS_CHECK(f4(args...));
+    } else if constexpr (std::is_same::value) {
+        CUBLAS_CHECK(f5(args...));
+    } else {
+        CT_ERROR(true, "This BLAS function is not callable with that type");
     }
 }
@@ -227,7 +263,7 @@ template
 StreamID GEMV(const T alpha, const Array& A, const Array& x, const T beta, const Array& y,
               const StreamID& stream = DEF_CUBLAS_STREAM) {
-    BatchInfo bi = Check::isBroadcastable(A, x, y, "A", "x", "y");
+    BatchInfo bi = Check::isBroadcastable(A, x, y, "A", "x", "y");

     CT_ERROR_IF(x.shape().cols(), !=, 1, "x must be a column vector");
     CT_ERROR_IF(y.shape().cols(), !=, 1, "y must be a column vector");
@@ -241,7 +277,6 @@ StreamID GEMV(const T alpha, const Array& A, const Array& x, const T beta,
                Manager::get()->cublasHandle(), CUBLAS_OP_N, rows, cols, CAST(&a),
                CAST(A.dataDevice()), rows, CAST(x.dataDevice()), 1, CAST(&b), CAST(y.dataDevice()), 1);
-
     } else { // Greater than 2, so broadcast.
         invoke(cublasSgemvStridedBatched, cublasDgemvStridedBatched, cublasCgemvStridedBatched,
                cublasZgemvStridedBatched, Manager::get()->cublasHandle(), CUBLAS_OP_N, rows,
@@ -269,11 +304,11 @@ StreamID GEMV(const T alpha, const Array& A, const Array& x, const T beta,
 * Computes the matrix-matrix product: \f$ C = \alpha AB + \beta C \f$. It will automatically
 * broadcast the operation if applicable.
 */
-template
-StreamID GEMM(const T alpha, const Array& A, const Array& B, const T beta, const Array& C,
+template
+StreamID GEMM(const T alpha, const Array& A, const Array& B, const T beta, const Array& C,
               const StreamID& stream = DEF_CUBLAS_STREAM) {
-    BatchInfo bi = Check::isBroadcastable(A, B, C, "A", "B", "C");
+    BatchInfo bi = Check::isBroadcastable(A, B, C, "A", "B", "C");

     // A is m x k, B is k x n.
     uint32_t m = A.shape().rows();
     uint32_t k = A.shape().cols();
@@ -282,18 +317,19 @@ StreamID GEMM(const T alpha, const Array& A, const Array& B, const T beta,
     uint32_t n = B.shape().cols();
     T a = alpha, b = beta;
 #ifdef CUDA
     CUBLAS_CHECK(cublasSetStream(Manager::get()->cublasHandle(), Manager::get()->stream(stream)));
+
     if (bi.size == 1) {
-        invoke(cublasSgemm, cublasDgemm, cublasCgemm, cublasZgemm,
-               Manager::get()->cublasHandle(), CUBLAS_OP_N, CUBLAS_OP_N, m, n, k, CAST(&a),
-               CAST(A.dataDevice()), m, CAST(B.dataDevice()), k, CAST(&b), CAST(C.dataDevice()),
-               m);
+        invoke5(cublasSgemm, cublasDgemm, cublasCgemm, cublasZgemm, cublasHgemm,
+                Manager::get()->cublasHandle(), CUBLAS_OP_N, CUBLAS_OP_N, m, n, k, CAST(&a),
+                CAST(A.dataDevice()), m, CAST(B.dataDevice()), k, CAST(&b), CAST(C.dataDevice()),
+                m);
     } else { // Greater than 2, so broadcast.
-        invoke(cublasSgemmStridedBatched, cublasDgemmStridedBatched, cublasCgemmStridedBatched,
-               cublasZgemmStridedBatched, Manager::get()->cublasHandle(), CUBLAS_OP_N,
-               CUBLAS_OP_N, m, n, k, CAST(&a), CAST(A.dataDevice()), m, bi.strideA,
-               CAST(B.dataDevice()), k, bi.strideB, CAST(&b), CAST(C.dataDevice()), m,
-               bi.strideC, bi.size);
+        invoke5(cublasSgemmStridedBatched, cublasDgemmStridedBatched, cublasCgemmStridedBatched,
+                cublasZgemmStridedBatched, cublasHgemmStridedBatched,
+                Manager::get()->cublasHandle(), CUBLAS_OP_N, CUBLAS_OP_N, m, n, k, CAST(&a),
+                CAST(A.dataDevice()), m, bi.strideA, CAST(B.dataDevice()), k, bi.strideB,
+                CAST(&b), CAST(C.dataDevice()), m, bi.strideC, bi.size);
     }
 #else
@@ -487,7 +523,7 @@ class PLUBatch : public Batch {
     * Constructor of a PLUBatch from a multi-dimensional array, batched across upper dimensions.
     */
     PLUBatch(const Array& arr) : Batch(arr) {
-        Check::isSquare(arr, "LU Array");
+        Check::isSquare(arr, "LU Array");

         mPivotsBatch = Array({this->mBatchSize * this->mShape.rows()});
         mInfoLU = Array({this->mBatchSize});
diff --git a/Core.h b/Core.h
index 82df51b..f6f10f6 100644
--- a/Core.h
+++ b/Core.h
@@ -81,6 +81,7 @@ class Manager {
 #ifdef CUDACC
     std::unordered_map mStreams;
     cublasHandle_t mCublas;
+    cusparseHandle_t mCusparse;
 #endif
   public:
     /**
@@ -94,6 +95,7 @@ class Manager {
 #ifdef CUDACC
     cudaStream_t stream(const StreamID& stream) const;
     cublasHandle_t cublasHandle() const;
+    cusparseHandle_t cusparseHandle() const;
 #endif
 };
@@ -391,6 +393,7 @@ Manager::Manager(const std::vector& names) {
         addStream(name);
     }
     CUBLAS_CHECK(cublasCreate(&mCublas));
+    CUSPARSE_CHECK(cusparseCreate(&mCusparse));
 #endif
 }
@@ -400,6 +403,7 @@ Manager::~Manager() {
         CUDA_CHECK(cudaStreamDestroy(it.second));
     }
     CUBLAS_CHECK(cublasDestroy(mCublas));
+    CUSPARSE_CHECK(cusparseDestroy(mCusparse));
 #endif
 }
@@ -439,8 +443,10 @@ cudaStream_t Manager::stream(const StreamID& stream) const {
 }

 cublasHandle_t Manager::cublasHandle() const { return mCublas; };
+cusparseHandle_t Manager::cusparseHandle() const { return mCusparse; };

-Manager Manager::mManagerInstance = Manager({"defaultMemory", "defaultCublas", "defaultKernel"});
+Manager Manager::mManagerInstance =
+    Manager({"defaultMemory", "defaultCublas", "defaultCusparse", "defaultKernel"});
 #else
 Manager Manager::mManagerInstance = Manager({""});
 #endif
@@ -674,37 +680,6 @@ void GraphManager::joinBranch(const StreamID& orig_stream, const StreamID& branc
     orig_stream.wait(*event);
 }

-#ifdef CUDACC
-const char* cublasGetErrorString(cublasStatus_t error) {
-    switch (error) {
-    case CUBLAS_STATUS_SUCCESS:
-        return "CUBLAS_STATUS_SUCCESS";
-
-    case CUBLAS_STATUS_NOT_INITIALIZED:
-        return "CUBLAS_STATUS_NOT_INITIALIZED";
-
-    case CUBLAS_STATUS_ALLOC_FAILED:
-        return "CUBLAS_STATUS_ALLOC_FAILED";
-
-    case CUBLAS_STATUS_INVALID_VALUE:
-        return "CUBLAS_STATUS_INVALID_VALUE";
-
-    case CUBLAS_STATUS_ARCH_MISMATCH:
-        return "CUBLAS_STATUS_ARCH_MISMATCH";
-
-    case CUBLAS_STATUS_MAPPING_ERROR:
-        return "CUBLAS_STATUS_MAPPING_ERROR";
-
-    case CUBLAS_STATUS_EXECUTION_FAILED:
-        return "CUBLAS_STATUS_EXECUTION_FAILED";
-
-    case CUBLAS_STATUS_INTERNAL_ERROR:
-        return "CUBLAS_STATUS_INTERNAL_ERROR";
-    }
-
-    return "";
-}
-#endif
 }; // namespace CudaTools

 #endif // CUDATOOLS_IMPLEMENTATION
diff --git a/Macros.h b/Macros.h
index 1586404..abf3e42 100644
--- a/Macros.h
+++ b/Macros.h
@@ -9,9 +9,6 @@
 #define CUDACC
 #endif

-using real32 = float; /**< Type alias for 32-bit floating point datatype. */
-using real64 = double; /**< Type alias for 64-bit floating point datatype. */
-
 #if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ > 0)
 #define DEVICE
 #endif
@@ -124,14 +121,19 @@ using real64 = double; /**< Type alias for 64-bit floating point datatype. */
 #ifdef CUDACC
 #include
+#include
+#include
 #include
+#include

+#define DEVICE_FUNC
 #define HD __host__ __device__
 #define SHARED __shared__

 #define KERNEL(call, ...) __global__ void call(__VA_ARGS__)

 #else

+#define DEVICE_FUNC
 #define HD
 #define SHARED
@@ -139,8 +141,6 @@ using real64 = double; /**< Type alias for 64-bit floating point datatype. */

 #endif // CUDACC

-//#define KERNEL(call, settings, ...) CudaTools::runKernel(call, settings, __VA_ARGS__)
-
 ///////////////////
 // DEVICE MACROS //
 ///////////////////
@@ -252,8 +252,16 @@ using real64 = double; /**< Type alias for 64-bit floating point datatype. */
     do {                                                                                           \
         cublasStatus_t err = (call);                                                               \
         if (err != CUBLAS_STATUS_SUCCESS) {                                                        \
-            printf("[cuBLAS] %s:%d\n | %s\n", __FILE__, __LINE__,                                  \
-                   CudaTools::cublasGetErrorString(err));                                          \
+            printf("[cuBLAS] %s:%d\n | %s\n", __FILE__, __LINE__, cublasGetStatusName(err));       \
+            throw std::exception();                                                                \
+        }                                                                                          \
+    } while (0)
+
+#define CUSPARSE_CHECK(call)                                                                       \
+    do {                                                                                           \
+        cusparseStatus_t err = (call);                                                             \
+        if (err != CUSPARSE_STATUS_SUCCESS) {                                                      \
+            printf("[cuSPARSE] %s:%d\n | %s\n", __FILE__, __LINE__, cusparseGetErrorName(err));    \
             throw std::exception();                                                                \
         }                                                                                          \
     } while (0)
diff --git a/Makefile b/Makefile
index 3a3b631..8bf16d1 100644
--- a/Makefile
+++ b/Makefile
@@ -7,7 +7,7 @@ INCLUDE :=
 LIBS_DIR :=
 LIBS_DIR_GPU := /usr/local/cuda/lib64
 LIBS :=
-LIBS_GPU := cuda cudart cublas
+LIBS_GPU := cuda cudart cublas cusparse

 TARGET = tests
 SRC_DIR = .
diff --git a/Makefile.template b/Makefile.template
index 48f72f8..83fc049 100644
--- a/Makefile.template
+++ b/Makefile.template
@@ -7,7 +7,7 @@ INCLUDE := <>
 LIBS_DIR := <>
 LIBS_DIR_GPU := /usr/local/cuda/lib64 <>
 LIBS := <>
-LIBS_GPU := cuda cudart cublas <>
+LIBS_GPU := cuda cudart cublas cusparse <>

 TARGET = <>
 SRC_DIR = .
diff --git a/Sparse.h b/Sparse.h
new file mode 100644
index 0000000..ca0baf3
--- /dev/null
+++ b/Sparse.h
@@ -0,0 +1,9 @@
+#ifndef CUDATOOLS_SPARSE_H
+#define CUDATOOLS_SPARSE_H
+
+#include "Array.h"
+#include "Core.h"
+#include "Macros.h"
+#include "Types.h"
+
+#endif
diff --git a/Complex.h b/Types.h
similarity index 67%
rename from Complex.h
rename to Types.h
index e01235f..c165b92 100644
--- a/Complex.h
+++ b/Types.h
@@ -11,6 +11,25 @@
 namespace CudaTools {

+namespace Types {
+
+using real32 = float; /**< Type alias for 32-bit floating point datatype. */
+using real64 = double; /**< Type alias for 64-bit floating point datatype. */
+
+#ifdef CUDACC
+
+using real16 = __half;
+using realb16 = __nv_bfloat16;
+
+#else
+
+using real16 = float; /**< Type alias for the 16-bit floating point datatype when compiling for GPU;
+                           otherwise, defaults to float. */
+using realb16 = float; /**< Type alias for the 16-bit bfloat datatype when compiling for GPU;
+                            otherwise, defaults to float. */
+
+#endif // CUDACC
+
 template class complex {
 private:
     T r = 0;
@@ -107,11 +126,10 @@ template complex operator*(const real64, const complex);
 template complex operator/(const real32, const complex);
 template complex operator/(const real64, const complex);

-}; // namespace CudaTools
+#ifdef CUDACC
+using complex64 = complex;
+using complex128 = complex;

-#ifdef CUDA
-using complex64 = CudaTools::complex;
-using complex128 = CudaTools::complex;
 #else
 using complex64 = std::complex; /**< Type alias for 64-bit complex floating point datatype.
                                  * This adapts depending on the CUDA compilation flag, and
@@ -122,4 +140,27 @@ using complex128 =
                                  * CudaTools::complex. */
 #endif

+/** Type aliases and lots of metaprogramming definitions, primarily dealing with
+ * the different numeric types and overrides. */
+
+template struct ComplexUnderlying_S { typedef T type; };
+template <> struct ComplexUnderlying_S { typedef float type; };
+template <> struct ComplexUnderlying_S { typedef double type; };
+template using ComplexUnderlying = typename ComplexUnderlying_S::type;
+
+template struct ComplexConversion_S { typedef T type; };
+template <> struct ComplexConversion_S { typedef std::complex type; };
+template <> struct ComplexConversion_S { typedef std::complex type; };
+template using ComplexConversion = typename ComplexConversion_S::type;
+
+template inline constexpr bool is_int = std::is_integral::value;
+template inline constexpr bool is_float = std::is_floating_point::value;
+template
+inline constexpr bool is_complex =
+    std::is_same::value or std::is_same::value;
+template inline constexpr bool is_host_num = is_int or is_float or is_complex;
+
+}; // namespace Types
+}; // namespace CudaTools
+
 #endif
diff --git a/docs/source/usage.rst b/docs/source/usage.rst
index 7d8b0e7..b2c38db 100644
--- a/docs/source/usage.rst
+++ b/docs/source/usage.rst
@@ -21,7 +21,7 @@ for your own project, after following a few rules.
 The usage of this library will be illustrated through examples, and further details
 can be found in the other sections. The examples are given in the
 `samples `__ folder.

-Throughout this documentation, there are a few common terms that may appear. First,we refer to the CPU as the host, and the GPU as the device. So, a host function refers
+Throughout this documentation, there are a few common terms that may appear. First, we refer to the CPU as the host, and the GPU as the device. So, a host function refers
 to a function runnable on the CPU, and a device function refers to a function that is runnable
 on a device. A kernel is a specific function that the host can call to be run on the device.
@@ -42,17 +42,17 @@ macros provided. For example,
     return 0;
 }

-The ``DEFINE_KERNEL(name, ...)`` macro takes in the function name and its arguments.
+The ``KERNEL(name, ...)`` macro takes in the function name and its arguments.
 The second argument of the ``KERNEL()`` macro specifies the launch parameters for the kernel.
 The launch parameters have several items, but for 'embarrassingly parallel'
-cases, we can simply generate the settings with the number of threads. More detail with
+cases, we can simply generate the settings with the number of threads using ``CudaTools::Kernel::basic``. More detail on
 creating launch parameters can be found :ref:`here `. In the above example,
 there is only one thread. The rest of the arguments are just the kernel arguments. For more
 detail, see :ref:`here `.

.. warning::
    These kernel definitions must be in a file that will be compiled by ``nvcc``. Also,
-    for header files, there is an additional macro ``DECLARE_KERNEL(name, ...)`` to declare it
+    for header files, the same ``KERNEL(name, ...)`` macro can be used to declare it
     and make it available to other files.

 Since many applications use classes, a macro is provided to 'convert' a class into
@@ -192,7 +192,8 @@ situations and with the ``CudaTools::Kernel::basic()`` launch parameters. If com
 mark the loop with ``#pragma parallel for`` and attempt to use OpenMP for parallelism.

 .. warning::
-    Notice that a view must be passed to the kernel, and not the original object. This
+    Notice that a view must be passed to the kernel, and not the original object, otherwise a copy
+    would be made. The

 Array also supports other helpful functions, such as multi-dimensional indexing, slicing,
 and more.
diff --git a/samples/5_SimpleGraph/main.cu.cpp b/samples/5_SimpleGraph/main.cu.cpp
index f9b2afb..a457d84 100644
--- a/samples/5_SimpleGraph/main.cu.cpp
+++ b/samples/5_SimpleGraph/main.cu.cpp
@@ -90,6 +90,7 @@ int main() {
     CudaTools::Array A = CudaTools::Array::constant({100}, 50);
     CudaTools::Array B = CudaTools::Array::constant({100}, 0);

+    // Executes process without graph.
     TIME(doFunc(A.view(), B.view()), ExecuteNoGraph);

     std::cout << A.slice({{0, 10}}) << "\n";
@@ -97,6 +98,7 @@ int main() {
     A.setConstant(50);
     B.setConstant(0);

+    // Executes process with graph.
     CudaTools::GraphManager gm;
     CudaTools::Graph graph("graphStream", myGraph, &gm, A.view(), B.view());
     TIME(graph.execute().wait(), ExecuteGraph);
diff --git a/tests.cu.cpp b/tests.cu.cpp
index c224bd6..46a75b4 100644
--- a/tests.cu.cpp
+++ b/tests.cu.cpp
@@ -2,13 +2,14 @@
 #define CUDATOOLS_ARRAY_MAX_AXES 8
 #include "Array.h"
 #include "BLAS.h"
-#include "Complex.h"
 #include "Core.h"
+#include "Types.h"

 #include
 #include
 #include

+using namespace CudaTools::Types;
 namespace CT = CudaTools;

 /////////////
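
To make the intent of this patch concrete, here is a minimal host-side sketch of how the relocated type traits in CudaTools::Types and the GEMM wrapper are meant to be used together. It assumes a CPU-only build (no nvcc), where real16 falls back to float and GEMM takes the host fallback path; the Array::constant and GEMM calls mirror code shown above in this diff, while the overall program is illustrative and not part of the patch.

#include <iostream>

#include "Array.h"
#include "BLAS.h"
#include "Types.h"

using namespace CudaTools::Types;
namespace CT = CudaTools;

int main() {
    // The numeric traits now live in CudaTools::Types. The GPU-only half types
    // are deliberately *not* host numbers, which is why the Array helpers are
    // now gated on is_host_num rather than the old is_num.
    static_assert(is_host_num<real32>, "real32 is a host-compatible numeric type");
    static_assert(is_host_num<complex64>, "complex64 is a host-compatible numeric type");

    // C = 1 * A * B + 0 * C through the GEMM wrapper. In a host-only build this
    // dispatches to the CPU path; under CUDACC it would go through invoke5 to
    // the matching cublas*gemm call instead.
    CT::Array<real32> A = CT::Array<real32>::constant({2, 2}, 1.0f);
    CT::Array<real32> B = CT::Array<real32>::constant({2, 2}, 2.0f);
    CT::Array<real32> C = CT::Array<real32>::constant({2, 2}, 0.0f);
    CT::BLAS::GEMM(1.0f, A, B, 0.0f, C);

    std::cout << C << "\n"; // expect every entry to be 4
    return 0;
}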