Initial commit with first version of library

Branch: main
Author: Kenneth Jao, 2 years ago
Commit: b4e4a49d44
26 changed files:

  1. .clang-format (+10)
  2. .gitignore (+4)
  3. Array.h (+777)
  4. BLAS.h (+600)
  5. Core.h (+544)
  6. Macros.h (+297)
  7. Makefile (+95)
  8. README.rst (+40)
  9. build_docs (+2)
  10. docs/Doxyfile (+2579)
  11. docs/Makefile (+20)
  12. docs/make.bat (+35)
  13. docs/source/_static/js/mathjax-config.js (+39)
  14. docs/source/array.rst (+26)
  15. docs/source/blas.rst (+45)
  16. docs/source/conf.py (+53)
  17. docs/source/core.rst (+67)
  18. docs/source/index.rst (+25)
  19. docs/source/refs.bib (+0)
  20. docs/source/usage.rst (+128)
  21. requirements.txt (+4)
  22. samples/1_CoreKernel/Makefile (+95)
  23. samples/1_CoreKernel/main.cu.cpp (+12)
  24. samples/2_CoreClass/Makefile (+95)
  25. samples/2_CoreClass/main.cu.cpp (+34)
  26. tests.cu.cpp (+494)

.clang-format

@@ -0,0 +1,10 @@
---
BasedOnStyle: LLVM
IndentWidth: 4
ColumnLimit: 100
AllowShortIfStatementsOnASingleLine: true
---
Language: Cpp
DerivePointerAlignment: false
PointerAlignment: Left
---

.gitignore

@@ -0,0 +1,4 @@
build
*CPU
*GPU
.venv

Array.h

@@ -0,0 +1,777 @@
#ifndef ARRAY_H
#define ARRAY_H
#include "Core.h"
#include "Macros.h"
#include <Eigen/Dense>
#include <iomanip>
#include <math.h>
#include <random>
#include <type_traits>
#ifdef DEVICE
#define POINTER pDevice
#else
#define POINTER pHost
#endif
namespace CudaTools {
template <typename T>
using EigenMat = Eigen::Matrix<T, Eigen::Dynamic, Eigen::Dynamic, Eigen::ColMajor>;
template <typename T> using EigenMapMat = Eigen::Map<EigenMat<T>>;
template <typename T> using ConstEigenMapMat = Eigen::Map<const EigenMat<T>>;
template <typename T> struct EigenAdaptConst { typedef EigenMapMat<T> type; };
template <typename T> struct EigenAdaptConst<const T> { typedef ConstEigenMapMat<T> type; };
#define ENABLE_IF(X) std::enable_if_t<X, bool>
#define IS_INT(T) std::is_integral<T>::value
#define IS_FLOAT(T) std::is_floating_point<T>::value
#define IS_NUM(T) IS_INT(T) or IS_FLOAT(T)
template <typename T> class Array;
using Slice = std::pair<uint32_t, uint32_t>;
template <typename T> class ArrayIterator {
private:
template <typename U>
friend std::ostream& operator<<(std::ostream& out, const ArrayIterator<U>& it);
T* pData;
Shape mShape;
uint32_t mIndices[CUDATOOLS_ARRAY_MAX_AXES] = {0};
public:
HD ArrayIterator(T* p, const Shape& shape) : pData(p), mShape(shape){};
/**
* Moves the iterator to the next value.
*/
HD void next() {
bool carry = false;
uint32_t offset = 0;
for (uint32_t iAxis = mShape.axes() - 1; iAxis < mShape.axes(); --iAxis) {
if (mIndices[iAxis] == mShape.dim(iAxis) - 1) {
mIndices[iAxis] = 0;
offset += mShape.stride(iAxis) * (mShape.dim(iAxis) - 1);
carry = true;
} else {
pData += mShape.stride(iAxis);
mIndices[iAxis] += 1;
carry = false;
}
if (not carry) {
pData -= offset;
return;
}
}
pData += 1; // "Overflow" occured, so we reached end of array.
}
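// Example: for a column-major 2x3 Array (strides {1, 2}), next() visits (row, col) in the order
// (0,0), (0,1), (0,2), (1,0), (1,1), (1,2), i.e. data offsets 0, 2, 4, 1, 3, 5. The backwards
// loop terminates via unsigned wrap-around of iAxis once axis 0 has been processed.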
/**
* Moves the iterator to the previous value.
*/
HD void prev() {
bool carry = false;
uint32_t offset = 0;
for (uint32_t iAxis = mShape.axes() - 1; iAxis < mShape.axes(); --iAxis) {
if (mIndices[iAxis] == 0) {
mIndices[iAxis] = mShape.dim(iAxis) - 1;
offset += mShape.stride(iAxis) * (mShape.dim(iAxis) - 1);
carry = true;
} else {
pData -= mShape.stride(iAxis);
mIndices[iAxis] -= 1;
carry = false;
}
if (not carry) {
pData += offset;
return;
}
}
pData -= 1;
}
/**
* Moves the iterator a specified value away.
* \param amount the amount to advance by
*/
HD void advance(const int32_t amount) {
if (amount < 0) {
for (uint32_t i = 0; i < abs(amount); ++i) {
prev();
}
} else {
for (uint32_t i = 0; i < abs(amount); ++i) {
next();
}
}
}
HD void operator++() { next(); }; /**< Prefix increment operator. */
HD void operator--() { prev(); }; /**< Prefix decrement operator. */
/**< Addition operator. */
HD ArrayIterator<T> operator+(const int32_t v) const {
ArrayIterator<T> it = *this;
it.advance(v);
return it;
};
/** Subtraction operator.*/
HD ArrayIterator<T> operator-(const int32_t v) const {
ArrayIterator<T> it = *this;
it.advance(-v);
return it;
};
HD void operator+=(const int32_t v) { advance(v); };
HD void operator-=(const int32_t v) { advance(-v); };
HD T& operator*() { return *pData; }; /**< Dereference operator. */
HD const T& operator*() const { return *pData; }; /**< Const dereference operator. */
/**
* Equals operator.
*/
HD bool operator==(const ArrayIterator<T>& it) { return pData == it.pData; }
/**
* Not equals operator.
*/
HD bool operator!=(const ArrayIterator<T>& it) { return pData != it.pData; }
};
template <typename T> std::ostream& operator<<(std::ostream& out, const ArrayIterator<T>& it) {
return out << it.pData;
}
template <typename T> class ArrayLoader {
private:
ArrayIterator<T> mIterator;
ArrayIterator<T> mIteratorEnd;
public:
HD ArrayLoader(const ArrayIterator<T>& it, const ArrayIterator<T>& it_end)
: mIterator(it), mIteratorEnd(it_end){};
HD ArrayLoader &operator,(const T value) {
CT_ERROR_IF(mIterator, ==, mIteratorEnd, "Cannot assign more values than Array size");
*mIterator = value;
++mIterator;
return *this;
}
};
/**
* A container that holds an N-dimensional array, stored column major. To set the
* maximum N, there is a compiler macro CUDATOOLS_ARRAY_MAX_AXES whose default value is 4.
* It adapts to operations between host and device to ease memory management.
*/
template <typename T> class Array {
private:
template <typename U> friend std::ostream& operator<<(std::ostream&, const Array<U>&);
Shape mShape;
T* pHost = nullptr;
T* pDevice = nullptr;
bool mIsView = false;
bool mIsSlice = false;
uint32_t mEndOffset = 0;
void freeArrays() {
#ifndef DEVICE
if (not mIsView) {
if (pDevice != nullptr) CudaTools::free(pDevice);
if (pHost != nullptr) delete[] pHost;
}
#endif
};
HD void calcEnd() {
uint32_t offset = 0;
for (uint32_t i = 0; i < shape().axes(); ++i) {
offset += (shape().dim(i) - 1) * shape().stride(i);
}
mEndOffset = offset + 1;
};
public:
HD Array() = default;
/**
* Constructor for an Array that creates and allocates an array with
* the specified Shape. Construction in this format is disabled on the device.
* \brief Host only
* \param shape the shape of the array
* \param noDevice if true, do not allocate the array on the device
*/
Array(const Shape& shape, const bool noDevice = false) : mShape(shape), mIsView(false) {
pHost = new T[shape.items()];
calcEnd();
if (noDevice) return;
pDevice = (T*)CudaTools::malloc(shape.items() * sizeof(T));
};
/**
* Constructor for an Array from an existing (preallocated) pointer.
* \param pointer the pointer to use
* \param shape the shape of the array
* \param noDevice if true, do not allocate the array on the device
*/
HD Array(T* const pointer, const Shape& shape, const bool noDevice = false)
: mShape(shape), mIsView(true), mIsSlice(false) {
POINTER = pointer;
calcEnd();
#ifndef DEVICE
if (noDevice) return;
pDevice = (T*)CudaTools::malloc(shape.items() * sizeof(T));
#endif
};
/**
* Constructor for making a Array view from another Array,
* given an offset and shape.
* \param arr the original Array
* \param shape the shape of the new array
* \param offset the index at which the view of the array starts
*/
HD Array(const Array& arr, const Shape& shape, const uint32_t offset = 0)
: mShape(shape), pHost(arr.pHost), pDevice(arr.pDevice), mIsView(true),
mIsSlice(arr.mIsSlice) {
calcEnd();
if (pHost != nullptr) pHost += offset;
if (pDevice != nullptr) pDevice += offset;
};
/**
* The copy-constructor for an Array. If this is not a view, a deep copy
* of the data will be performed on both host and device. On the device, it is always
* treated like a view.
*/
HD Array(const Array& arr) : mShape(arr.mShape), mIsView(arr.mIsView), mIsSlice(arr.mIsSlice) {
calcEnd();
if (mIsView) { // If the other array was a view (and now this one), just assign.
pHost = arr.pHost;
pDevice = arr.pDevice;
return;
}
// Otherwise, we assume this Array needs to own its data.
pHost = new T[mShape.items()];
auto arr_it = arr.begin();
for (auto it = begin(); it != end(); ++it) {
*it = *arr_it;
++arr_it;
}
#ifndef DEVICE
if (arr.pDevice != nullptr) {
pDevice = (T*)CudaTools::malloc(mShape.items() * sizeof(T));
}
#endif
};
/**
* The move-constructor for an Array.
*/
HD Array(Array&& arr)
: mShape(arr.mShape), pHost(arr.pHost), pDevice(arr.pDevice), mIsView(arr.mIsView),
mIsSlice(arr.mIsSlice) {
calcEnd();
// Make other object empty.
arr.pHost = nullptr;
arr.pDevice = nullptr;
arr.mIsView = true;
};
HD ~Array() { freeArrays(); };
/**
* The copy-assignment operator for an Array. If this is not a view,
* then the currently owned data will be freed, and a deep copy of the data will
* be performed on both host and device. On the device, it is always treated like a view.
*/
HD Array& operator=(const Array& arr) {
if (this == &arr) return *this;
if (mIsView) { // If this array is a view, we assign data from the right-hand side.
auto arr_it = arr.begin();
for (auto it = begin(); it != end() and arr_it != arr.end(); ++it) {
*it = *arr_it;
++arr_it;
}
return *this;
}
// Otherwise, it is implied to be object reassignment.
mShape = arr.mShape;
mIsView = arr.mIsView;
mIsSlice = arr.mIsSlice;
calcEnd();
// Regardless if the right-hand side is a view, we create a new copy.
// In case that the right-hand side is a view of this array, we
// allocate memory to copy first. Keep in mind that the right-hand side
// array will then become undefined.
// We can only do this on the host.
#ifndef DEVICE
T* new_pDevice = nullptr;
if (pDevice != nullptr) {
new_pDevice = (T*)CudaTools::malloc(mShape.items() * sizeof(T));
}
T* new_pHost = new T[mShape.items()];
memcpy(new_pHost, arr.pHost, mShape.items() * sizeof(T));
freeArrays();
pHost = new_pHost;
pDevice = new_pDevice;
#else
pHost = arr.pHost;
pDevice = arr.pDevice;
#endif
return *this;
};
/**
* The move-assignment operator for an Array.
*/
HD Array& operator=(Array&& arr) {
if (this == &arr) return *this;
if (mIsView) { // If this array is a view, we assign data from the right-hand side.
auto arr_it = arr.begin();
for (auto it = begin(); it != end() and arr_it != arr.end(); ++it) {
*it = *arr_it;
++arr_it;
}
return *this;
}
CT_ERROR(arr.mIsView,
"Cannot move-assign view to a non-view (owner). This would lead to undefined "
"behavior.");
// Otherwise, it is implied to be object reassignment.
freeArrays();
mShape = arr.mShape;
pHost = arr.pHost;
pDevice = arr.pDevice;
mIsView = arr.mIsView;
mIsSlice = arr.mIsSlice;
calcEnd();
// Make other array empty.
arr.pHost = nullptr;
arr.pDevice = nullptr;
arr.mIsView = true;
return *this;
};
/**
* Used for indexing the Array.
* \param index index of the first dimension
*/
HD Array operator[](const uint32_t index) const {
CT_ERROR_IF(index, >=, shape().dim(0), "Index exceeds axis size");
return Array(*this, shape().subshape(1), index * shape().stride(0));
};
/**
* Used for indexing the Array.
* \param indices a list of indices to index the Array
*/
HD Array operator[](const std::initializer_list<uint32_t> indices) const {
CT_ERROR_IF(indices.size(), >, shape().axes(),
"Number of indices cannot exceed number of axes");
auto it = indices.begin();
uint offset = 0;
for (uint32_t i = 0; i < indices.size(); ++i) {
uint32_t index = *it;
CT_ERROR_IF(index, >=, shape().dim(i), "Index exceeds axis size");
offset += index * shape().stride(i);
++it;
}
return Array(*this, shape().subshape(indices.size()), offset);
};
HD ArrayLoader<T> operator<<(const T value) {
auto it = begin();
*it = value;
++it;
return ArrayLoader<T>(it, end());
};
HD T operator=(const T& value) { return POINTER[0] = value; };
HD operator T&() { return POINTER[0]; };
HD operator const T&() const { return POINTER[0]; };
/**
* Used to create slices of the Array.
* \param slices a list of slices to slice the Array
*/
HD Array slice(const std::initializer_list<Slice> slices) const {
CT_ERROR_IF(slices.size(), >, shape().axes(),
"Number of slices cannot exceed number of axes");
uint offset = 0;
Shape new_shape = mShape;
auto it = slices.begin();
for (uint32_t i = 0; i < slices.size(); ++i) {
uint32_t from_index = it->first;
uint32_t to_index = it->second;
CT_ERROR_IF(from_index, >, to_index,
"Slice start cannot be greater than than slice end");
CT_ERROR_IF(from_index, >=, shape().dim(i), "Slice start exceeds axis size");
CT_ERROR_IF(to_index - 1, >=, shape().dim(i), "Slice end exceeds axis size");
offset += from_index * shape().stride(i);
new_shape.mAxisDim[i] = to_index - from_index;
++it;
}
new_shape.mItems = 1;
for (uint32_t i = 0; i < shape().axes(); ++i) {
new_shape.mItems *= new_shape.dim(i);
}
Array<T> arr(*this, new_shape, offset);
arr.mIsSlice = true;
return arr;
};
/**
* Returns this Array with a different Shape. Its self assigning version is reshape.
* If this Array is a slice of another, then it will perform a deep copy, and return
* a new non-view array.
*/
HD Array reshaped(const Shape& new_shape) const {
CT_ERROR_IF(shape().items(), !=, new_shape.items(),
"New shape cannot have a different number of terms");
if (mIsSlice) {
Array<T> arr = this->copy();
return arr.reshaped(new_shape);
}
Array<T> arr = view();
arr.mShape = new_shape;
return arr;
};
HD void reshape(const Shape& new_shape) {
CT_ERROR_IF(shape().items(), !=, new_shape.items(),
"New shape cannot have a different number of terms");
CT_ERROR(mIsSlice, "Cannot reshape slice, a new array must be made. (Try reshaped instead)")
mShape = new_shape;
};
/**
* Gets a view that has at least two dimensions. Useful for promoting
* single vectors to their 2D counterparts.
*/
HD Array atLeast2D() const {
return (shape().axes() == 1) ? Array(*this, {shape().length(), 1}) : view();
};
/**
* Flattens the Array into one dimension.
*/
HD Array flatten() const { return reshaped({mShape.mItems}); };
/**
* Returns the Eigen::Map of this Array.
*/
typename EigenAdaptConst<T>::type eigenMap() const {
uint32_t total_dim = mShape.mAxes;
CT_ERROR(mIsSlice, "Mapping to an Eigen array cannot occur on slices")
CT_ERROR_IF(total_dim, !=, 2,
"Mapping to an Eigen array can only occur on two-dimensional arrays");
return typename EigenAdaptConst<T>::type(POINTER, mShape.rows(), mShape.cols());
};
/**
* Gets the Shape of the Array.
*/
HD Shape shape() const { return mShape; };
/**
* Gets the pointer to this array, depending on host or device.
*/
HD T* data() const { return POINTER; };
/**
* Returns the device pointer regardless of host or device.
*/
HD T* dataDevice() const { return pDevice; };
HD bool isView() const { return mIsView; }; /**< Gets whether this Array is a view. */
HD bool isSlice() const { return mIsSlice; }; /**< Gets whether this Array is a slice. */
/**
* Gets a view of this Array.
*/
HD Array view() const { return Array(*this, mShape); }
/**
* Copies this Array into newly allocated memory and returns the new, non-view Array.
*/
HD Array copy() const {
Array<T> arr(mShape, (pDevice == nullptr));
auto arr_it = arr.begin();
for (auto it = begin(); it != end(); ++it) {
*arr_it = *it;
++arr_it;
}
#ifndef DEVICE
if (pDevice != nullptr) {
CudaTools::deviceCopy(pDevice, arr.dataDevice(), mShape.items() * sizeof(T)).wait();
}
#endif
return arr;
};
/**
* Gets the iterator to the beginning of this Array.
*/
HD ArrayIterator<T> begin() const { return ArrayIterator<T>(POINTER, mShape); };
/**
* Gets the iterator to the end of this Array.
*/
HD ArrayIterator<T> end() const { return ArrayIterator<T>(POINTER + mEndOffset, mShape); };
/**
* Sets the values of the entire Array to a constant. This is restricted to numerical types.
*/
HD void setConstant(const T value) const {
static_assert(IS_NUM(T), "Function only available on numeric types.");
for (auto it = begin(); it != end(); ++it) {
*it = value;
}
};
/**
* Sets the Array values with uniform random values in a specified range. This is restricted to
* numerical types.
* \brief Host only
*/
void setRandom(const T min, const T max) const {
static_assert(IS_NUM(T), "Function only available on numeric types.");
CT_ERROR_IF(max, <, min, "Upper bound of range cannot be larger than lower bound");
std::random_device rd;
std::mt19937 mt(rd());
if constexpr (IS_INT(T)) {
std::uniform_int_distribution<T> dist(min, max);
for (auto it = begin(); it != end(); ++it) {
*it = dist(mt);
}
} else if constexpr (IS_FLOAT(T)) {
std::uniform_real_distribution<T> dist(min, max);
for (auto it = begin(); it != end(); ++it) {
*it = dist(mt);
}
}
};
/**
* Sets the Array values to start from a value and increment by a specified step. This is
* restricted to numerical types.
*/
HD void setRange(T min, const T step = 1) const {
static_assert(IS_NUM(T), "Function only available on numeric types.");
for (auto it = begin(); it != end(); ++it) {
*it = min;
min += step;
}
}
/**
* Sets the Array values to be evenly spaced numbers over a given interval. This is restricted
* to floating point types.
*/
HD void setLinspace(const T min, const T max) const {
static_assert(IS_FLOAT(T), "Function only available on numeric floating types.");
CT_ERROR_IF(max, <, min, "Upper bound of range cannot be larger than lower bound");
T i = 0;
T d = max - min;
T items = (T)(shape().items() - 1);
for (auto it = begin(); it != end(); ++it) {
*it = min + d * (i / items);
i += 1;
}
};
/**
* Returns array of given shape with constant values. This is restricted to numerical types.
* \brief Host only
*/
static Array constant(const Shape& shape, const T value) {
static_assert(IS_NUM(T), "Function only available on numeric types.");
Array<T> arr(shape);
arr.setConstant(value);
return arr;
};
/**
* Returns array of given shape with random values in given interval. This is restricted to
* numerical types.
* \brief Host only
*/
static Array random(const Shape& shape, const T min, const T max) {
static_assert(IS_NUM(T), "Function only available on numeric types.");
Array<T> arr(shape);
arr.setRandom(min, max);
return arr;
};
/**
* Returns evenly spaced values within a given interval. This is restricted to numerical types.
* \brief Host only
*/
static Array range(const T min, const T max, const T step = 1) {
static_assert(IS_NUM(T), "Function only available on numeric types.");
CT_ERROR_IF(max, <, min, "Upper bound of range cannot be larger than lower bound");
Array<T> arr({(uint32_t)((max - min) / step)});
arr.setRange(min, step);
return arr;
}
/**
* Returns evenly spaced values within a given interval. This is restricted to floating point
* types.
* \brief Host only
*/
static Array linspace(const T min, const T max, const uint32_t size) {
static_assert(IS_FLOAT(T), "Function only available on numeric floating types.");
Array<T> arr({size});
arr.setLinspace(min, max);
return arr;
}
/**
* Transposes the internal data and returns the corresponding new Array.
* Its self assigning version is transpose. This is restricted to numerical types.
* \brief Host only
*/
Array transposed() const {
static_assert(IS_NUM(T), "Function only available on numeric types.");
CT_ERROR_IF(shape().axes(), !=, 2, "Tranpose can only occur on two-dimensional arrays");
Array<T> new_arr({mShape.rows(), mShape.cols()});
new_arr.eigenMap() = this->eigenMap().transpose().eval();
return new_arr;
};
/**
* Transposes the internal data in place. Its non-modifying version is transposed.
* This is restricted to numerical types.
* \brief Host only
*/
void transpose() {
static_assert(IS_NUM(T), "Function only available on numeric types.");
CT_ERROR_IF(shape().axes(), !=, 2, "Tranpose can only occur on two-dimensional arrays");
Array<T> new_arr(*this, {mShape.cols(), mShape.rows()});
new_arr.eigenMap() = this->eigenMap().transpose().eval();
mShape = Shape({mShape.cols(), mShape.rows()});
};
/**
* Computes the inverse of this matrix and returns it as a new Array. This is restricted
* to floating point types.
* \brief Host only
*/
Array inverse() const {
static_assert(IS_FLOAT(T), "Function only available on floating numeric types.");
CT_ERROR_IF(shape().axes(), !=, 2, "Inverse can only occur on two-dimensional arrays");
CT_ERROR_IF(shape().rows(), !=, shape().cols(),
"Inverse can only occur on square matrices");
Array<T> inv(shape());
inv.eigenMap() = this->eigenMap().inverse();
return inv;
};
/**
* Pins the memory (page locks) for faster memory transfer in concurrent
* transfers.
* \brief Host only
*/
void pinMemory() const { CudaTools::pin(pHost, mShape.items() * sizeof(T)); };
/**
* Updates the host copy by copying the device data back to the host.
* \brief Host only
*/
StreamID updateHost(const StreamID& stream = DEF_MEM_STREAM) const {
CT_ERROR(mIsView, "Cannot update host on a view");
CudaTools::pull(pHost, pDevice, mShape.items() * sizeof(T), stream);
return stream;
};
/**
* Updates the device copy by copying the host data to the device.
* \brief Host only
*/
StreamID updateDevice(const StreamID& stream = DEF_MEM_STREAM) const {
CT_ERROR(mIsView, "Cannot update device on a view");
CudaTools::push(pHost, pDevice, mShape.items() * sizeof(T), stream);
return stream;
};
};
template <typename T>
void printAxis(std::ostream& out, const Array<T>& arr, const uint32_t axis, size_t width) {
std::string space = std::string(2 * axis, ' ');
if (arr.shape().axes() == 1) {
out << "[";
for (uint32_t i = 0; i < arr.shape().items(); ++i) {
if constexpr (std::is_floating_point<T>::value) {
out << std::scientific << std::setprecision(6);
}
if (width == 0) {
out << ((i == 0) ? "" : " ");
} else {
out << std::setw((i == 0) ? width - 1 : width);
}
out << (T)arr[i] << ((i == arr.shape().items() - 1) ? "]" : ",");
}
} else if (arr.shape().axes() == 2) {
for (uint32_t i = 0; i < arr.shape().dim(0); ++i) {
out << space << ((i == 0) ? "[" : " ");
printAxis(out, arr[i], axis + 1, width);
out << ((i == arr.shape().dim(0) - 1) ? "]" : ",\n");
}
} else {
out << space << "[\n";
for (uint32_t i = 0; i < arr.shape().dim(0); ++i) {
printAxis(out, arr[i], axis + 1, width);
out << ((i == arr.shape().dim(0) - 1) ? "\n" : ",\n\n");
}
out << space << "]";
}
}
template <typename T> std::ostream& operator<<(std::ostream& out, const Array<T>& arr) {
size_t width = 0;
if constexpr (IS_INT(T)) {
T max_val = 0;
bool negative = false;
for (auto it = arr.begin(); it != arr.end(); ++it) {
if (*it < 0) negative = true;
max_val = (abs(*it) > max_val) ? abs(*it) : max_val;
}
width = std::to_string(max_val).size() + 1;
width += (negative) ? 1 : 0;
} else if constexpr (IS_FLOAT(T)) {
T max_val = 0;
bool negative = false;
for (auto it = arr.begin(); it != arr.end(); ++it) {
if (*it < 0) negative = true;
int exp = 0;
frexp(*it, &exp);
max_val = (exp > max_val) ? exp : max_val;
}
width = std::to_string(max_val).size() + 5;
width += (negative) ? 1 : 0;
}
printAxis<T>(out, arr, 0, (arr.shape().axes() == 1) ? 0 : width);
return out;
}
}; // namespace CudaTools
#endif // ARRAY_H
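The header above is used roughly as follows. This is an editorial sketch, not a file from the commit; the variable names (a, row0, block) are illustrative, and it assumes a host-side translation unit linked against the library:

#include "Array.h"
#include <iostream>

using namespace CudaTools;

int main() {
    Array<float> a({2, 3});   // allocates on the host and, when built with CUDA, on the device
    a << 1, 2, 3,
         4, 5, 6;             // ArrayLoader comma syntax fills the array row by row

    Array<float> row0 = a[0];                        // view of the first row, shape (3)
    Array<float> block = a.slice({{0, 2}, {1, 3}});  // rows 0..1, columns 1..2, shares memory

    for (auto it = a.begin(); it != a.end(); ++it) *it *= 2.0f;

    a.updateDevice().wait();  // host -> device copy (no-op without CUDA)
    a.updateHost().wait();    // device -> host copy (no-op without CUDA)

    std::cout << a << std::endl;
    return 0;
}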

BLAS.h

@@ -0,0 +1,600 @@
#ifndef BLAS_H
#define BLAS_H
#include "Array.h"
#include "Core.h"
#include "Macros.h"
namespace CudaTools {
namespace BLAS {
struct BatchInfo {
uint32_t strideA, strideB, strideC;
uint32_t size;
};
template <typename T> struct Check {
static void isAtLeast2D(const Array<T>& arr, const std::string& name = "Array") {
CT_ERROR_IF(arr.shape().axes(), <, 2, (name + " needs to be at least 2D").c_str());
};
static void isSquare(const Array<T>& arr, const std::string& name = "Array") {
isAtLeast2D(arr, name);
CT_ERROR_IF(arr.shape().rows(), !=, arr.shape().cols(), (name + " is not square").c_str())
};
static void isValidMatmul(const Array<T>& A, const Array<T>& B, const Array<T>& C,
const std::string& nameA = "A", const std::string& nameB = "B",
const std::string nameC = "C") {
isAtLeast2D(A, nameA);
isAtLeast2D(B, nameB);
isAtLeast2D(C, nameC);
CT_ERROR_IF(A.shape().cols(), !=, B.shape().rows(),
(nameA + nameB + " is not a valid matrix multiplication").c_str());
Shape ABshape({A.shape().rows(), B.shape().cols()});
Shape Cshape({C.shape().rows(), C.shape().cols()});
CT_ERROR_IF(
ABshape, !=, Cshape,
("The shape of " + nameA + nameB + " does not match the shape of " + nameC).c_str());
};
static uint32_t getUpperItems(const Array<T>& arr) {
uint32_t upperItems = 1;
for (uint32_t iAxis = 0; iAxis < arr.shape().axes() - 2; ++iAxis) {
upperItems *= arr.shape().dim(iAxis);
}
return upperItems;
};
static void matchUpperShape(const Array<T>& A, const Array<T>& B,
const std::string& nameA = "A", const std::string& nameB = "B") {
CT_ERROR_IF(A.shape().axes(), !=, B.shape().axes(),
(nameA + " and " + nameB + " shapes do not match for broadcasting").c_str());
for (uint32_t iAxis = 0; iAxis < A.shape().axes() - 2; ++iAxis) {
uint32_t Adim = A.shape().dim(iAxis);
uint32_t Bdim = B.shape().dim(iAxis);
CT_ERROR_IF(
Adim, !=, Bdim,
(nameA + " and " + nameB + " shapes do not match for broadcasting").c_str());
}
};
static BatchInfo isBroadcastable(const Array<T>& A, const Array<T>& B, const Array<T>& C,
const std::string& nameA = "A", const std::string& nameB = "B",
const std::string nameC = "C") {
isValidMatmul(A, B, C, nameA, nameB, nameC);
uint32_t itemsA = getUpperItems(A);
uint32_t itemsB = getUpperItems(B);
uint32_t itemsC = getUpperItems(C);
uint32_t Asize = A.shape().rows() * A.shape().cols();
uint32_t Bsize = B.shape().rows() * B.shape().cols();
uint32_t Csize = C.shape().rows() * C.shape().cols();
if (itemsA == itemsB) {
CT_ERROR_IF(itemsA, !=, itemsC,
("Incorrect dimensions to broadcast to output " + nameC).c_str());
matchUpperShape(A, B, nameA, nameB);
matchUpperShape(A, C, nameA, nameC);
return BatchInfo{Asize, Bsize, Csize, itemsC};
} else if (itemsA > itemsB) {
CT_ERROR_IF(
itemsB, !=, 1,
("Cannot broadcast operation to " + nameB + " with non-matching " + nameA).c_str());
CT_ERROR_IF(itemsA, !=, itemsC,
("Incorrect dimensions to broadcast to output " + nameC).c_str());
matchUpperShape(A, C, nameA, nameC);
return BatchInfo{Asize, 0, Csize, itemsC};
} else {
CT_ERROR_IF(
itemsA, !=, 1,
("Cannot broadcast operation to " + nameA + " with non-matching " + nameB).c_str());
CT_ERROR_IF(itemsB, !=, itemsC,
("Incorrect dimensions to broadcast to output " + nameC).c_str());
matchUpperShape(B, C, nameB, nameC);
return BatchInfo{0, Bsize, Csize, itemsC};
}
};
};
/**
* Represents a Batch of Arrays with the same shape. Mainly used for cuBLAS functions.
*/
template <typename T> class Batch {
protected:
Array<T*> mBatch;
Shape mShape;
uint32_t mCount = 0;
uint32_t mBatchSize;
public:
Batch() = delete;
/**
* Constructs a batch from a given size.
*/
Batch(const uint32_t size) : mBatch({size}), mBatchSize(size){};
/**
* Constructs a batch from a non-view Array.
*/
Batch(const Array<T>& arr) {
CT_ERROR(arr.isView(), "Array cannot be a view");
mShape = Shape({arr.shape().rows(), arr.shape().cols()});
mBatchSize = mCount = Check<T>::getUpperItems(arr);
mBatch = Array<T*>({mBatchSize});
Array<T> batch = arr.reshaped({mBatchSize, mShape.rows(), mShape.cols()});
for (uint32_t i = 0; i < mBatchSize; ++i) {
#ifdef CUDA
mBatch[i] = batch[i].dataDevice();
#else
mBatch[i] = batch[i].data();
#endif
}
mBatch.updateDevice().wait();
};
/**
* Adds a matrix to the batch. Array must be a view.
*/
void add(const Array<T>& arr) {
CT_ERROR(not arr.isView(), "Cannot add non-view Arrays");
CT_ERROR_IF(mCount, ==, mBatchSize, "Batch is full, cannot add more arrays");
#ifdef CUDA
mBatch[mCount] = arr.dataDevice();
#else
mBatch[mCount] = arr.data();
#endif
if (mCount == 0) {
mShape = arr.shape();
} else {
CT_ERROR_IF(arr.shape(), !=, mShape, "Cannot add matrix of different shape to batch");
}
++mCount;
if (mCount == mBatchSize) {
mBatch.updateDevice().wait();
}
};
/**
* Indexing operator which returns a view of the Array in the Batch at the given index.
*/
Array<T> operator[](const uint32_t index) const {
CT_ERROR_IF(index, >=, mBatchSize, "Index exceeds batch size");
return Array<T>(mBatch[index], {mShape.rows(), mShape.cols()});
};
/**
* Returns the batch Array of pointers.
*/
Array<T*> batch() const { return mBatch.view(); };
Shape shape() const { return mShape; } /**< Gets the shape of the matrices in the batch. */
uint32_t size() const { return mBatchSize; } /**< Gets the batch size.*/
bool full() const { return mBatchSize == mCount; }; /**< Gets if the batch is full. */
};
////////////////
// cuBLAS API //
////////////////
template <typename T, typename F1, typename F2, typename... Args>
constexpr void invoke(F1 f1, F2 f2, Args&&... args) {
if constexpr (std::is_same<T, float>::value) {
CUBLAS_CHECK(f1(args...));
} else if constexpr (std::is_same<T, double>::value) {
CUBLAS_CHECK(f2(args...));
} else {
CT_ERROR(true, "BLAS functions are not callable with that type");
}
}
/**
* Computes the matrix-vector product: \f$ y = \alpha Ax + \beta y \f$. It will automatically
* broadcast the operation if applicable.
*/
template <typename T>
StreamID GEMV(const T alpha, const Array<T>& A, const Array<T>& x, const T beta, const Array<T>& y,
const StreamID& stream = DEF_CUBLAS_STREAM) {
BatchInfo bi = Check<T>::isBroadcastable(A, x, y, "A", "x", "y");
CT_ERROR_IF(x.shape().cols(), !=, 1, "x must be a column vector");
CT_ERROR_IF(y.shape().cols(), !=, 1, "x must be a column vector");
uint32_t rows = A.shape().rows();
uint32_t cols = A.shape().cols();
T a = alpha, b = beta;
#ifdef CUDA
CUBLAS_CHECK(
cublasSetStream(Manager::get()->cublasHandle(), Manager::get()->stream(stream.id)));
if (bi.size == 1) {
invoke<T>(cublasSgemv, cublasDgemv, Manager::get()->cublasHandle(), CUBLAS_OP_N, rows, cols,
&a, A.dataDevice(), rows, x.dataDevice(), 1, &b, y.dataDevice(), 1);
} else { // More than two axes, so broadcast over the batch.
invoke<T>(cublasSgemvStridedBatched, cublasDgemvStridedBatched,
Manager::get()->cublasHandle(), CUBLAS_OP_N, rows, cols, &a, A.dataDevice(), rows,
bi.strideA, x.dataDevice(), 1, bi.strideB, &b, y.dataDevice(), 1, bi.strideC,
bi.size);
}
#else
if (bi.size == 1) {
y.eigenMap() = a * (A.eigenMap() * x.eigenMap()) + b * y.eigenMap();
} else { // More than two axes, so broadcast over the batch.
#pragma omp parallel for
for (uint32_t i = 0; i < bi.size; ++i) {
auto Ai = Array<T>(A, {rows, cols}, i * bi.strideA).eigenMap();
auto xi = Array<T>(x, {cols, 1}, i * bi.strideB).eigenMap();
auto yi = Array<T>(y, {rows, 1}, i * bi.strideC).eigenMap();
yi = a * (Ai * xi) + b * yi;
}
}
#endif
return StreamID{stream};
}
/**
* Computes the matrix-matrix product: \f$ C = \alpha AB + \beta C \f$. It will automatically
* broadcast the operation if applicable.
*/
template <typename T>
StreamID GEMM(const T alpha, const Array<T>& A, const Array<T>& B, const T beta, const Array<T>& C,
const StreamID& stream = DEF_CUBLAS_STREAM) {
BatchInfo bi = Check<T>::isBroadcastable(A, B, C, "A", "B", "C");
// A is m x k, B is k x n.
uint32_t m = A.shape().rows();
uint32_t k = A.shape().cols();
uint32_t n = B.shape().cols();
T a = alpha, b = beta;
#ifdef CUDA
CUBLAS_CHECK(
cublasSetStream(Manager::get()->cublasHandle(), Manager::get()->stream(stream.id)));
if (bi.size == 1) {
invoke<T>(cublasSgemm, cublasDgemm, Manager::get()->cublasHandle(), CUBLAS_OP_N,
CUBLAS_OP_N, m, n, k, &a, A.dataDevice(), m, B.dataDevice(), k, &b,
C.dataDevice(), m);
} else { // More than two axes, so broadcast over the batch.
invoke<T>(cublasSgemmStridedBatched, cublasDgemmStridedBatched,
Manager::get()->cublasHandle(), CUBLAS_OP_N, CUBLAS_OP_N, m, n, k, &a,
A.dataDevice(), m, bi.strideA, B.dataDevice(), k, bi.strideB, &b, C.dataDevice(),
m, bi.strideC, bi.size);
}
#else
if (bi.size == 1) {
C.eigenMap() = a * (A.eigenMap() * B.eigenMap()) + b * C.eigenMap();
} else { // More than two axes, so broadcast over the batch.
#pragma omp parallel for
for (uint32_t i = 0; i < bi.size; ++i) {
auto Ai = Array<T>(A, {m, k}, i * bi.strideA).eigenMap();
auto Bi = Array<T>(B, {k, n}, i * bi.strideB).eigenMap();
auto Ci = Array<T>(C, {m, n}, i * bi.strideC).eigenMap();
Ci = a * (Ai * Bi) + b * Ci;
}
}
#endif
return StreamID{stream};
}
/**
* Computes the diagonal matrix multiplication: \f$ C = A\mathrm{diag}(X) \f$, or \f$ C =
* \mathrm{diag}(X)A \f$ if left = true.
*/
template <typename T>
StreamID DGMM(const Array<T>& A, const Array<T>& X, const Array<T>& C, const bool left = false,
const StreamID& stream = DEF_CUBLAS_STREAM) {
CT_ERROR_IF(X.shape().cols(), !=, 1, "'x' must be a column vector.");
if (left) {
CT_ERROR_IF(A.shape().rows(), !=, X.shape().rows(),
"Rows of 'A' and length of 'x' need to match.");
} else {
CT_ERROR_IF(A.shape().cols(), !=, X.shape().rows(),
"Columns of 'A' and length of 'x' need to match.");
}
CT_ERROR_IF(A.shape().rows(), !=, C.shape().rows(),
"Rows of 'A' and rows of 'C' need to match.");
CT_ERROR_IF(A.shape().cols(), !=, C.shape().cols(),
"Columns of 'A' and columns of 'C' need to match.");
#ifdef CUDA
uint32_t m = C.shape().rows();
uint32_t n = C.shape().cols();
auto mode = (left) ? CUBLAS_SIDE_LEFT : CUBLAS_SIDE_RIGHT;
CUBLAS_CHECK(
cublasSetStream(Manager::get()->cublasHandle(), Manager::get()->stream(stream.id)));
invoke<T>(cublasSdgmm, cublasDdgmm, Manager::get()->cublasHandle(), m, n, A.dataDevice(),
A.shape().rows(), X.dataDevice(), 1, C.dataDevice(), m);
#else
if (left) {
C.eigenMap() = X.eigenMap().asDiagonal() * A.eigenMap();
} else {
C.eigenMap() = A.eigenMap() * X.eigenMap().asDiagonal();
}
#endif
return StreamID{stream};
}
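// With left = false this scales the columns of A: C(i, j) = A(i, j) * X(j) for A of shape (m, n)
// and X of length n. With left = true it scales the rows: C(i, j) = X(i) * A(i, j), and X must
// have length m.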
//////////////////////////////
// PLUArray Related Objects //
//////////////////////////////
///////////////////////////
// PartialPivLU Wrapper //
///////////////////////////
// This class is just a workaround to use Eigen's internals directly.
template <typename T> class PartialPivLU;
namespace internal {
template <typename T> static Array<T> empty({1, 1});
template <typename T> static EigenMapMat<T> empty_map = empty<T>.eigenMap();
}; // namespace internal
template <typename T, ENABLE_IF(IS_FLOAT(T)) = true> class PLUArray;
// This is a wrapper class for Eigen's class so we have more controlled access to
// the underlying data.
template <typename T> class PartialPivLU : public Eigen::PartialPivLU<Eigen::Ref<EigenMat<T>>> {
private:
using Base = Eigen::PartialPivLU<Eigen::Ref<EigenMat<T>>>;
template <typename U, ENABLE_IF(IS_FLOAT(U))> friend class PLUArray;
EigenMapMat<T> mMapLU;
EigenMapMat<int32_t> mMapPivots;
public:
PartialPivLU()
: Base(internal::empty_map<T>), mMapLU(internal::empty_map<T>),
mMapPivots(internal::empty_map<int32_t>){};
void make(const Array<T>& lu, const Array<int32_t>& pivots) {
new (&mMapLU) EigenMapMat<T>(lu.eigenMap());
new (&mMapPivots) EigenMapMat<int32_t>(pivots.atLeast2D().eigenMap());
new (&this->m_lu) decltype(Base::m_lu)(mMapLU.derived());
new (&this->m_p) decltype(Base::m_p)(mMapPivots.derived());
// new (&this->m_rowsTranspositions) decltype(Base::m_rowsTranspositions)(
// mMapPivots.derived());
this->m_l1_norm = 0;
this->m_det_p = 0;
this->m_isInitialized = true;
};
};
namespace internal {
// We only create one and copy-construct to avoid the re-initialization.
template <typename T> static PartialPivLU<T> BlankPPLU = PartialPivLU<T>();
}; // namespace internal
/**
* Class for storing the PLU decomposition of an Array. This is restricted to floating point types.
*/
template <typename T, ENABLE_IF(IS_FLOAT(T))> class PLUArray {
private:
Array<T> mLU;
Array<int32_t> mPivots;
PartialPivLU<T> mPPLU = internal::BlankPPLU<T>;
public:
PLUArray() = delete;
/**
* Constructor for a PLUArray given the matrix dimension.
*/
PLUArray(const uint32_t n) : mLU({n, n}), mPivots({n}) { mPPLU.make(mLU, mPivots); };
/**
* Constructor for a PLUArray given an existing array.
*/
PLUArray(const Array<T>& arr)
: mLU((arr.isView()) ? arr.view() : arr), mPivots({arr.shape().rows()}) {
CT_ERROR_IF(mLU.shape().axes(), !=, 2, "Array must be a 2D matrix");
CT_ERROR_IF(mLU.shape().rows(), !=, mLU.shape().cols(), "Matrix must be square");
mPPLU.make(mLU, mPivots);
};
/**
* Constructor for a PLUArray given an existing location in memory for both the matrix and
* the pivots.
*/
PLUArray(const Array<T>& arr, const Array<int32_t> pivots)
: mLU(arr.view()), mPivots(pivots.view()) {
CT_ERROR_IF(mLU.shape().axes(), !=, 2, "Array must be a 2D matrix");
CT_ERROR_IF(mLU.shape().rows(), !=, mLU.shape().cols(), "Matrix must be square");
mPPLU.make(mLU, mPivots);
};
uint32_t rank() { return mLU.shape().rows(); }; /**< Gets the rank of the LU matrix. */
Array<T> LU() const { return mLU.view(); }; /**< Gets the LU matrix. */
Array<int32_t> pivots() const { return mPivots.view(); }; /**< Gets the pivots. */
/**
* Computes the in-place LU factorization for this array on the CPU.
*/
void computeLU() {
mPPLU.compute();
mPPLU.mMapPivots = mPPLU.permutationP().indices();
};
/**
* Solves the system \f$ LUx = b \f$ and returns \f$x\f$.
*/
Array<T> solve(const Array<T>& b) {
Array<T> x(b.shape());
x.eigenMap() = mPPLU.solve(b.eigenMap());
return x;
};
};
/**
* This is a batch version of PLUArray, to enable usage of the cuBLAS API. This is restricted to
* floating point types.
*/
template <typename T, std::enable_if_t<std::is_floating_point<T>::value, bool> = true>
class PLUBatch : public Batch<T> {
private:
Array<int32_t> mPivotsBatch;
Array<int32_t> mInfoLU;
int32_t mInfoSolve;
bool mInitialized = false;
public:
/**
* Constructor of a PLUBatch from a given batch size.
*/
PLUBatch(const uint32_t size) : Batch<T>(size), mInfoLU({size}){};
/**
* Constructor of a PLUBatch from a multi-dimensional array, batched across upper dimensions.
*/
PLUBatch(const Array<T>& arr) : Batch<T>(arr) {
Check<T>::isSquare(arr, "LU Array");
mPivotsBatch = Array<int32_t>({this->mBatchSize * this->mShape.rows()});
mInfoLU = Array<int32_t>({this->mBatchSize});
};
/**
* Indexing operator which returns the PLUArray in the PLUBatch at the given index.
*/
PLUArray<T> operator[](const uint32_t index) const {
CT_ERROR_IF(index, >=, this->mBatchSize, "Index exceeds batch size");
Array<T> lu(this->mBatch[index], {this->mShape.rows(), this->mShape.cols()});
Array<int32_t> pivots(mPivotsBatch.data() + index * this->mShape.rows(),
{this->mShape.rows()});
return PLUArray<T>(lu, pivots);
};
/**
* Computes the in-place PLU decomposition of the batch of arrays.
*/
StreamID computeLU(const StreamID& stream = DEF_CUBLAS_STREAM) {
#ifdef CUDA
uint32_t n = this->mShape.rows();
CUBLAS_CHECK(
cublasSetStream(Manager::get()->cublasHandle(), Manager::get()->stream(stream.id)));
invoke<T>(cublasSgetrfBatched, cublasDgetrfBatched, Manager::get()->cublasHandle(), n,
this->mBatch.dataDevice(), n, mPivotsBatch.dataDevice(), mInfoLU.dataDevice(),
this->mBatchSize);
#else
#pragma omp parallel for
for (uint32_t i = 0; i < this->mBatchSize; ++i) {
(*this)[i].computeLU();
}
#endif
mInitialized = true;
return stream;
};
/**
* Solves the batched system \f$LUx = b\f$ inplace. The solution \f$x\f$ is written back into
* \f$b\f$.
*/
StreamID solve(const Batch<T>& b, const StreamID& stream = DEF_CUBLAS_STREAM) {
CT_ERROR(not mInitialized,
"Cannot solve system if PLUBatch has not yet computed its LU decomposition");
CT_ERROR_IF(b.size(), !=, this->mBatchSize,
"Upper dimensions of b do not match batch size");
CT_ERROR_IF(b.shape().rows(), !=, this->mShape.rows(),
"The length of each column of b must match the matrix rank");
#ifdef CUDA
uint32_t n = b.shape().rows();
uint32_t nrhs = b.shape().cols();
CUBLAS_CHECK(
cublasSetStream(Manager::get()->cublasHandle(), Manager::get()->stream(stream.id)));
invoke<T>(cublasSgetrsBatched, cublasDgetrsBatched, Manager::get()->cublasHandle(),
CUBLAS_OP_N, n, nrhs, this->mBatch.dataDevice(), n, mPivotsBatch.dataDevice(),
b.batch().dataDevice(), n, &mInfoSolve, this->mBatchSize);
#else
#pragma omp parallel for
for (uint32_t i = 0; i < this->mBatchSize; ++i) {
b[i] = (*this)[i].solve(b[i]);
}
#endif
return stream;
};
/**
* Gets the pivots data from the device to the host. Does nothing for CPU.
*/
StreamID getPivots(const StreamID& stream = DEF_MEM_STREAM) const {
mPivotsBatch.updateHost(stream);
return stream;
};
/**
* Gets the info array for the LU decomposition from the device to the host. Does not
* return useful information for CPU.
*/
Array<int32_t> getLUInfo() const {
mInfoLU.updateHost().wait();
return mInfoLU;
};
/**
* Checks validity of the solve operation. Does not return useful information for CPU.
*/
int32_t validSolve() const { return mInfoSolve == 0; }
};
// /**
// * Gets the inverse of each A[i], using an already PLU factorized A[i].
// * Only available if compiling with CUDA.
// */
// template <typename T>
// void inverseBatch(const Array<T*>& batchA, const Array<T*>& batchC, const Array<int>&
// pivots,
// const Array<int>& info, const Shape shapeA, const Shape shapeC,
// const uint stream = 0) {
// #ifdef CUDA
// CT_ERROR_IF(shapeA.rows(), !=, shapeA.cols(),
// "'A' needs to be square, rows() and column need to match.");
//     CT_ERROR_IF(shapeA.rows(), !=, shapeC.cols(), "'A' needs to be the same shape as 'C'.");
//     CT_ERROR_IF(shapeA.rows(), !=, shapeC.rows(), "'A' needs to be the same shape as 'C'.");
// CT_ERROR_IF(shapeA.rows(), !=, pivots.shape().rows(),
// "Rows()/columns of 'A' and rows() of pivots need to match.");
// CT_ERROR_IF(batchA.shape().rows(), !=, pivots.shape().cols(),
// "Batch size and columns of pivots need to match.");
// CT_ERROR_IF(info.shape().cols(), !=, 1, "Info needs to be a column vector.")
// CT_ERROR_IF(batchA.shape().rows(), !=, info.shape().rows(),
// "Batch size and length of info need to match.");
// CT_ERROR_IF(batchA.shape().rows(), !=, batchC.shape().rows(),
// "Batches 'A[i]' and 'C[i]' need to match.");
// std::string s = "cublas" + std::to_string(stream);
// CUBLAS_CHECK(
// cublasSetStream(Manager::get()->cublasHandle(),
// Manager::get()->stream(s)));
// invoke<T>(cublasSgetriBatched, cublasDgetriBatched,
// Manager::get()->cublasHandle(),
// shapeA.rows(), batchA.dataDevice(), shapeA.rows(), pivots.dataDevice(),
// batchC.dataDevice(), shapeC.rows(), info.dataDevice(),
// batchA.shape().rows());
// #else
// CT_ERROR_IF(true, ==, true, "inverseBatch is not callable without CUDA.");
// #endif
// }
}; // namespace BLAS
}; // namespace CudaTools
#endif
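A sketch of how the BLAS layer above is meant to be driven (editorial, not from the commit; the shapes and names A, B, C, lu, rhs are invented for illustration): GEMM broadcasts a matrix product over the leading axes, and PLUBatch factorizes and solves a stack of small systems in one call.

#include "BLAS.h"

using namespace CudaTools;

int main() {
    // Batched product C = 1.0 * A * B + 0.0 * C over the leading axis of size 8.
    Array<double> A = Array<double>::random({8, 4, 4}, -1.0, 1.0);
    Array<double> B = Array<double>::random({8, 4, 2}, -1.0, 1.0);
    Array<double> C = Array<double>::constant({8, 4, 2}, 0.0);
    A.updateDevice().wait();
    B.updateDevice().wait();
    BLAS::GEMM(1.0, A, B, 0.0, C).wait();

    // Batched PLU factorization of A (in place), then solve A x = rhs for each batch entry.
    BLAS::PLUBatch<double> lu(A);
    lu.computeLU().wait();
    Array<double> rhs = Array<double>::random({8, 4, 1}, -1.0, 1.0);
    rhs.updateDevice().wait();
    BLAS::Batch<double> rhsBatch(rhs);
    lu.solve(rhsBatch).wait();  // solutions overwrite rhs
    rhs.updateHost().wait();
    return 0;
}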

Core.h

@@ -0,0 +1,544 @@
#ifndef CUDATOOLS_H
#define CUDATOOLS_H
#include "Macros.h"
#include <iostream>
#include <string>
#include <unordered_map>
#include <vector>
namespace CudaTools {
/**
* Simple wrapper for the name of a stream. Its purpose is to allow 'streams'
* to be passed around in host code, with simple syntax for waiting on them.
*/
struct StreamID {
public:
std::string id;
StreamID() : id(""){};
/**
* The constructor for a StreamID.
*/
StreamID(const std::string& id_) : id(id_){};
StreamID(const char* id_) : id(id_){};
/**
* Waits for the stream with this stream ID.
*/
void wait() const;
};
static const StreamID DEF_MEM_STREAM = StreamID{"defaultMemory"};
static const StreamID DEF_CUBLAS_STREAM = StreamID{"defaultCublas"};
static const StreamID DEF_KERNEL_STREAM = StreamID{"defaultKernel"};
/**
* Allocates memory on the device.
*/
void* malloc(const size_t size);
/**
* Pins memory on the host.
*/
void pin(void* const pHost, const size_t size);
/**
* Pushes memory from the host to the device.
*/
StreamID push(void* const pHost, void* const pDevice, const size_t size,
const StreamID& stream = DEF_MEM_STREAM);
/**
* Pulls memory from the device back to the host.
*/
StreamID pull(void* const pHost, void* const pDevice, const size_t size,
const StreamID& stream = DEF_MEM_STREAM);
/**
* Copies memory on the device to another location on the device.
*/
StreamID deviceCopy(void* const pSrc, void* const pDest, const size_t size,
const StreamID& stream = DEF_MEM_STREAM);
/**
* Frees memory on the device.
*/
void free(void* const pDevice);
#ifdef CUDACC
cudaDeviceProp getDeviceProp();
static cudaDeviceProp DeviceProperties = getDeviceProp();
const char* cublasGetErrorString(cublasStatus_t status);
#endif
/**
* A class that manages various CUDA Runtime components, such as
* streams, events, and handles.
*/
class Manager {
private:
static Manager mManagerInstance;
Manager(const std::vector<std::string>& names);
~Manager();
#ifdef CUDACC
std::unordered_map<std::string, cudaStream_t> mStreams;
cublasHandle_t mCublas;
#endif
public:
/**
* Used to get the global CudaTools::Manager instance.
*/
static Manager* get() { return &mManagerInstance; };
void waitFor(const StreamID& stream) const; /**< Waits for the stream provided. */
void sync() const; /**< Waits until all device code has finished. */
void addStream(const std::string& name); /**< Creates a stream with the given name. */
#ifdef CUDACC
cudaStream_t stream(const StreamID& stream) const;
cublasHandle_t cublasHandle() const;
#endif
};
namespace Kernel {
/**
* A struct that contains the kernel launch parameters.
*/
struct Settings {
public:
#ifdef CUDACC
dim3 blockGrid;
dim3 threadBlock;
size_t sharedMemoryBytes = 0;
#else
size_t threads;
#endif
StreamID stream;
Settings() = default;
void setGridDim(const size_t x); /**< Sets the Grid dimensions. */
void setGridDim(const size_t x, const size_t y); /**< Sets the Grid dimensions. */
void setGridDim(const size_t x, const size_t y,
const size_t z); /**< Sets the Grid dimensions. */
void setBlockDim(const size_t x); /**< Sets the Thread Block dimensions. */
void setBlockDim(const size_t x, const size_t y); /**< Sets the Thread Block dimensions. */
void setBlockDim(const size_t x, const size_t y,
const size_t z); /**< Sets the Thread Block dimensions. */
void setSharedMemSize(const size_t bytes); /**< Sets the static shared memory size. */
void setStream(const StreamID& stream); /**< Sets the stream. */
};
/**
* Returns kernel launch parameters based on the number of threads, and optionally
* a stream. Should only be used for 'embarrassingly parallel' situations, or where
* each thread corresponds to some sort of index.
*/
Settings basic(const size_t threads, const StreamID& stream = DEF_KERNEL_STREAM);
}; // namespace Kernel
template <typename T> class Array;
/**
* A class that holds information about an Array.
*/
class Shape {
private:
template <typename T> friend class Array;
uint32_t mAxes;
uint32_t mItems;
uint32_t mAxisDim[CUDATOOLS_ARRAY_MAX_AXES] = {0};
uint32_t mStride[CUDATOOLS_ARRAY_MAX_AXES] = {0};
public:
HD Shape() : mAxes(0), mItems(1){};
/**
* The constructor for a Shape.
* \param dims an initializer list of the dimensions.
*/
HD Shape(const std::initializer_list<uint32_t> dims);
HD uint32_t axes() const; /**< Gets the number of axes. */
HD uint32_t items() const; /**< Gets the total number of items. */
HD uint32_t length() const; /**< For 1D shapes, gets the length. In general, gets the dimension
of the last axis. */
HD uint32_t rows() const; /**< For 2D shapes, gets the number of rows. In general, gets the
dimension of the second to last axis. */
HD uint32_t cols() const; /**< For 2D shapes, gets the number of columns. In general, gets the
dimension of the last axis. */
HD uint32_t
dim(const uint32_t axis) const; /**< Gets the dimension size of the specified axis. */
HD uint32_t stride(const uint32_t axis) const; /**< Gets the stride of the specified axis. */
/**
* Gets the shape at a specific axis of this shape.
* \param axis the axis of where the new shape starts.
*/
HD Shape subshape(const uint32_t axis) const;
HD bool operator==(const Shape& s) const; /**< Equals operator. */
HD bool operator!=(const Shape& s) const; /**< Not equals operator. */
};
std::ostream& operator<<(std::ostream& out, const Shape& s);
}; // namespace CudaTools
#ifdef CUDATOOLS_IMPLEMENTATION
namespace CudaTools {
template <typename T, typename... Args>
StreamID runKernel(T func, const Kernel::Settings& sett, Args... args) {
#ifdef CUDA
func<<<sett.blockGrid, sett.threadBlock, sett.sharedMemoryBytes,
Manager::get()->stream(sett.stream.id)>>>(args...);
#else
func(args...);
#endif
return sett.stream;
}
////////////////////
// Memory Methods //
////////////////////
void StreamID::wait() const { Manager::get()->waitFor(id); }
void* malloc(const size_t size) {
#ifdef CUDACC
void* pDevice;
CUDA_CHECK(cudaMalloc(&pDevice, size));
return pDevice;
#else
return nullptr;
#endif
}
void free(void* const pDevice) {
#ifdef CUDACC
if (pDevice != nullptr) CUDA_CHECK(cudaFree(pDevice));
#endif
}
StreamID push(void* const pHost, void* const pDevice, const size_t size, const StreamID& stream) {
#ifdef CUDACC
CUDA_CHECK(cudaMemcpyAsync(pDevice, pHost, size, cudaMemcpyHostToDevice,
Manager::get()->stream(stream.id)));
#endif
return stream;
}
StreamID pull(void* const pHost, void* const pDevice, const size_t size, const StreamID& stream) {
#ifdef CUDACC
CUDA_CHECK(cudaMemcpyAsync(pHost, pDevice, size, cudaMemcpyDeviceToHost,
Manager::get()->stream(stream.id)));
#endif
return stream;
}
StreamID deviceCopy(void* const pSrc, void* const pDest, const size_t size,
const StreamID& stream) {
#ifdef CUDACC
CUDA_CHECK(cudaMemcpyAsync(pDest, pSrc, size, cudaMemcpyDeviceToDevice,
Manager::get()->stream(stream.id)));
#endif
return stream;
}
void pin(void* const pHost, const size_t size) {
#ifdef CUDACC
CUDA_CHECK(cudaHostRegister(pHost, size, cudaHostRegisterDefault));
#endif
}
#ifdef CUDACC
cudaDeviceProp getDeviceProp() {
cudaSetDevice(0);
cudaDeviceProp deviceProp;
cudaGetDeviceProperties(&deviceProp, 0);
return deviceProp;
}
#endif
/////////////////////
// Manager Methods //
/////////////////////
Manager::Manager(const std::vector<std::string>& names) {
#ifdef CUDACC
for (auto name : names) {
addStream(name);
}
CUBLAS_CHECK(cublasCreate(&mCublas));
#endif
}
Manager::~Manager() {
#ifdef CUDACC
for (auto& it : mStreams) {
CUDA_CHECK(cudaStreamDestroy(it.second));
}
CUBLAS_CHECK(cublasDestroy(mCublas));
#endif
}
void Manager::waitFor(const StreamID& stream) const {
#ifdef CUDACC
auto it = mStreams.find(stream.id);
if (it != mStreams.end()) {
CUDA_CHECK(cudaStreamSynchronize(it->second));
} else {
CT_ERROR(true, ("Invalid stream " + stream.id).c_str());
}
#endif
}
void Manager::sync() const {
#ifdef CUDACC
CUDA_CHECK(cudaDeviceSynchronize());
#endif
}
void Manager::addStream(const std::string& name) {
#ifdef CUDACC
cudaStream_t s;
CUDA_CHECK(cudaStreamCreate(&s));
mStreams[name] = s;
#endif
}
#ifdef CUDACC
cudaStream_t Manager::stream(const StreamID& stream) const {
auto it = mStreams.find(stream.id);
if (it != mStreams.end()) {
return it->second;
} else {
CT_ERROR(true, ("Invalid stream " + stream.id).c_str());
}
}
cublasHandle_t Manager::cublasHandle() const { return mCublas; };
Manager Manager::mManagerInstance = Manager({"defaultMemory", "defaultCublas", "defaultKernel"});
#else
Manager Manager::mManagerInstance = Manager({""});
#endif
////////////////////
// Kernel Methods //
////////////////////
namespace Kernel {
void Settings::setGridDim(const size_t x) {
#ifdef CUDACC
CT_ERROR_IF(x, >, DeviceProperties.maxGridSize[0], "Total grid size too large")
blockGrid.x = x;
blockGrid.y = 1;
blockGrid.z = 1;
#endif
}
void Settings::setGridDim(const size_t x, const size_t y) {
#ifdef CUDACC
CT_ERROR_IF(x * y, >, DeviceProperties.maxGridSize[0], "Total grid size too large.");
CT_ERROR_IF(x, >, DeviceProperties.maxGridSize[0], "Grid dimension 'x' too large.");
CT_ERROR_IF(y, >, DeviceProperties.maxGridSize[1], "Grid dimension 'y' too large.");
blockGrid.x = x;
blockGrid.y = y;
blockGrid.z = 1;
#endif
}
void Settings::setGridDim(const size_t x, const size_t y, const size_t z) {
#ifdef CUDACC
CT_ERROR_IF(x * y * z, >, DeviceProperties.maxGridSize[0], "Total grid size too large.");
CT_ERROR_IF(x, >, DeviceProperties.maxGridSize[0], "Grid dimension 'x' too large.");
CT_ERROR_IF(y, >, DeviceProperties.maxGridSize[1], "Grid dimension 'y' too large.");
CT_ERROR_IF(z, >, DeviceProperties.maxGridSize[2], "Grid dimension 'z' too large.");
blockGrid.x = x;
blockGrid.y = y;
blockGrid.z = z;
#endif
}
void Settings::setBlockDim(const size_t x) {
#ifdef CUDACC
CT_ERROR_IF(x, >, DeviceProperties.maxThreadsDim[0], "Total block size too large.");
threadBlock.x = x;
threadBlock.y = 1;
threadBlock.z = 1;
#endif
}
void Settings::setBlockDim(const size_t x, const size_t y) {
#ifdef CUDACC
CT_ERROR_IF(x * y, >, DeviceProperties.maxThreadsDim[0], "Total block size too large.");
CT_ERROR_IF(x, >, DeviceProperties.maxThreadsDim[0], "Block dimension 'x' too large.");
CT_ERROR_IF(y, >, DeviceProperties.maxThreadsDim[1], "Block dimension 'y' too large.");
threadBlock.x = x;
threadBlock.y = y;
threadBlock.z = 1;
#endif
}
void Settings::setBlockDim(const size_t x, const size_t y, const size_t z) {
#ifdef CUDACC
CT_ERROR_IF(x * y * z, >, DeviceProperties.maxThreadsDim[0], "Total block size too large.");
CT_ERROR_IF(x, >, DeviceProperties.maxThreadsDim[0], "Block dimension 'x' too large.");
CT_ERROR_IF(y, >, DeviceProperties.maxThreadsDim[1], "Block dimension 'y' too large.");
CT_ERROR_IF(z, >, DeviceProperties.maxThreadsDim[2], "Block dimension 'z' too large.");
threadBlock.x = x;
threadBlock.y = y;
threadBlock.z = z;
#endif
}
void Settings::setSharedMemSize(const size_t bytes) {
#ifdef CUDACC
sharedMemoryBytes = bytes;
#endif
}
void Settings::setStream(const StreamID& stream_) {
#ifdef CUDACC
stream.id = stream_.id;
#endif
}
Settings basic(const size_t threads, const StreamID& stream) {
Settings sett;
#ifdef CUDACC
auto max_threads = DeviceProperties.maxThreadsPerBlock;
size_t grid_blocks = (threads + max_threads - 1) / max_threads; // ceil(threads / max_threads)
size_t block_threads = (threads + grid_blocks - 1) / grid_blocks; // ceil(threads / grid_blocks)
sett.setGridDim(grid_blocks);
sett.setBlockDim(block_threads);
sett.setStream(stream);
#else
sett.threads = threads;
#endif
return sett;
}
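// Example: with maxThreadsPerBlock = 1024 and threads = 3000, grid_blocks = ceil(3000 / 1024) = 3
// and block_threads = ceil(3000 / 3) = 1000, giving a 3 x 1000 launch that covers all 3000 indices.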
} // namespace Kernel
/////////////////////
// Shape Functions //
/////////////////////
HD Shape::Shape(const std::initializer_list<uint32_t> dims) : mAxes(dims.size()), mItems(1) {
CT_ERROR_IF(dims.size(), >, CUDATOOLS_ARRAY_MAX_AXES, "Number of axes exceeds max axes");
mAxes = dims.size();
if (mAxes == 0) return;
auto it = dims.end() - 1;
mItems = 1;
for (uint32_t iAxis = mAxes - 1; iAxis < mAxes; --iAxis) {
uint32_t dim = *it;
CT_ERROR_IF(dim, ==, 0, "Axis dimension cannot be 0");
mAxisDim[iAxis] = dim;
mStride[iAxis] = mItems;
mItems *= dim;
--it;
}
if (mAxes == 1) return;
// Swap last two, for column major storage.
mStride[mAxes - 2] = 1;
mStride[mAxes - 1] = mAxisDim[mAxes - 2];
}
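// Example: Shape({2, 3, 4}) stores mAxisDim = {2, 3, 4}; the loop produces strides {12, 4, 1} and
// the final swap makes the last two axes column major, giving {12, 1, 3}, so element (i, j, k)
// sits at offset 12*i + j + 3*k.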
HD uint32_t Shape::axes() const { return mAxes; };
HD uint32_t Shape::items() const { return mItems; };
HD uint32_t Shape::length() const { return mAxisDim[mAxes - 1]; }
HD uint32_t Shape::rows() const { return mAxisDim[mAxes - 2]; }
HD uint32_t Shape::cols() const { return mAxisDim[mAxes - 1]; }
HD uint32_t Shape::dim(const uint32_t axis) const { return mAxisDim[axis]; }
HD uint32_t Shape::stride(const uint32_t axis) const { return mStride[axis]; }
HD bool Shape::operator==(const Shape& s) const {
if (mAxes != s.mAxes) {
return false;
}
for (uint32_t iAxis = 0; iAxis < mAxes; ++iAxis) {
if (mAxisDim[iAxis] != s.mAxisDim[iAxis]) {
return false;
}
}
return true;
}
HD bool Shape::operator!=(const Shape& s) const { return not(*this == s); }
HD Shape Shape::subshape(const uint32_t axis) const {
CT_ERROR_IF(axis, >, mAxes, "Axis number exceeds number of axes.");
if (axis == mAxes) return Shape({1});
Shape new_shape({});
new_shape.mAxes = mAxes - axis;
new_shape.mItems = mItems;
for (uint32_t iAxis = 0; iAxis < axis; iAxis++) {
new_shape.mItems /= mAxisDim[iAxis];
}
for (uint32_t iAxis = axis; iAxis < mAxes; iAxis++) {
new_shape.mAxisDim[iAxis - axis] = mAxisDim[iAxis];
new_shape.mStride[iAxis - axis] = mStride[iAxis];
}
return new_shape;
}
std::ostream& operator<<(std::ostream& out, const Shape& s) {
out << "(";
if (s.axes() == 0) return out << ")";
for (uint32_t iAxis = 0; iAxis < s.axes() - 1; ++iAxis) {
out << s.dim(iAxis) << ", ";
}
return out << s.dim(s.axes() - 1) << ")";
}
#ifdef CUDACC
const char* cublasGetErrorString(cublasStatus_t error) {
switch (error) {
case CUBLAS_STATUS_SUCCESS:
return "CUBLAS_STATUS_SUCCESS";
case CUBLAS_STATUS_NOT_INITIALIZED:
return "CUBLAS_STATUS_NOT_INITIALIZED";
case CUBLAS_STATUS_ALLOC_FAILED:
return "CUBLAS_STATUS_ALLOC_FAILED";
case CUBLAS_STATUS_INVALID_VALUE:
return "CUBLAS_STATUS_INVALID_VALUE";
case CUBLAS_STATUS_ARCH_MISMATCH:
return "CUBLAS_STATUS_ARCH_MISMATCH";
case CUBLAS_STATUS_MAPPING_ERROR:
return "CUBLAS_STATUS_MAPPING_ERROR";
case CUBLAS_STATUS_EXECUTION_FAILED:
return "CUBLAS_STATUS_EXECUTION_FAILED";
case CUBLAS_STATUS_INTERNAL_ERROR:
return "CUBLAS_STATUS_INTERNAL_ERROR";
}
return "<unknown>";
}
#endif
}; // namespace CudaTools
#endif // CUDATOOLS_IMPLEMENTATION
#endif // CUDATOOLS_H
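A short host-side sketch of the Core utilities (editorial, not part of the commit; the stream name "transfer" is arbitrary). Since the implementation block above is guarded by CUDATOOLS_IMPLEMENTATION, one translation unit should define that macro before including the header:

#define CUDATOOLS_IMPLEMENTATION
#include "Core.h"

int main() {
    CudaTools::Shape s({2, 3, 4});
    std::cout << s << " holds " << s.items() << " items" << std::endl; // "(2, 3, 4) holds 24 items"

    // Named streams let transfers and kernels overlap.
    CudaTools::Manager::get()->addStream("transfer");

    // Raw device allocation and an asynchronous host-to-device copy (both no-ops without CUDA).
    float host[24] = {0};
    void* device = CudaTools::malloc(sizeof(host));
    CudaTools::push(host, device, sizeof(host), "transfer").wait();
    CudaTools::free(device);
    return 0;
}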

Macros.h

@@ -0,0 +1,297 @@
#ifndef MACROS_H
#define MACROS_H
#include <exception>
#include <sstream>
#include <stdarg.h>
#if defined(CUDA) && defined(__CUDACC__)
#define CUDACC
#endif
#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ > 0)
#define DEVICE
#endif
#ifdef CUDATOOLS_DOXYGEN
/**
* \def CUDACC
* This macro is defined when this code is being compiled by nvcc and the CUDA compilation
* flag is set. This should be used to enclose code where CUDA specific libraries and syntax are
* being used.
*/
#define CUDACC
/**
* \def DEVICE
* This macro is defined when this code is being compiled for the device. The difference between
* this and CUDACC is that this should exclusively be used to decide if code is being compiled
* to execute on the device, whereas CUDACC only determines which compiler is being used.
*/
#define DEVICE
/**
* \def HD
* Mark a function in front with this if it needs to be callable on both the
* CPU and CUDA device.
*/
#define HD
/**
* \def SHARED
* Mark a variable as static shared memory.
*/
#define SHARED
/**
* \def DECLARE_KERNEL(call, ...)
* Used to declare (in header) a CUDA kernel.
* \param call the name of the kernel
* \param ... the arguments of the kernel
*/
#define DECLARE_KERNEL(call, ...)
/**
* \def DEFINE_KERNEL(call, ...)
* Used to define (in implementation) a CUDA kernel.
* \param call the name of the kernel
* \param ... the arguments of the kernel
*/
#define DEFINE_KERNEL(call, ...)
/**
* \def KERNEL(call, settings, ...)
* Used to call a CUDA kernel.
* \param call the name of the kernel
* \param settings the associated CudaTools::Kernel::Settings to initialize the kernel with
* \param ... the arguments of the kernel
*/
#define KERNEL(call, settings, ...)
/**
* \def BASIC_LOOP(N)
* Can be used in conjunction with CudaTools::Kernel::basic, which is mainly used for embarrassingly
* parallel situations. Exposes the loop/thread number as iThread.
* \param N number of iterations
*/
#define BASIC_LOOP(N)
/**
* \def DEVICE_CLASS(name)
* Can be used inside a class declaration (header) which generates boilerplate code to allow this
* class to be used on the device.
*
* This macro creates a few functions:\n
* name* that(): returns the pointer to this instance on the device.
*
* void allocateDevice(): allocates the memory on the device for this class instance.
*
* CudaTools::StreamID updateHost(const CudaTools::StreamID& stream): updates the host instance
* of the class.
*
* CudaTools::StreamID updateDevice(const CudaTools::StreamID& stream): updates
* the device instance of the class.
* \param name the name of the class
*/
#define DEVICE_CLASS(name)
/**
* \def CT_ERROR_IF(a, op, b, msg)
* Used for throwing runtime errors given a condition with an operator.
*/
#define CT_ERROR_IF(a, op, b, msg)
/**
* \def CT_ERROR(a, msg)
* Used for throwing runtime errors given a bool.
*/
#define CT_ERROR(a, msg)
/**
* \def CUDA_CHECK(call)
* Gets the error generated by a CUDA function call if there is one.
* \param call CUDA function to check if there are errors when running.
*/
#define CUDA_CHECK(call)
/**
* \def CUBLAS_CHECK(call)
* Gets the error generated by a cuBLAS function call if there is one.
* \param call cuBLAS function to check if there are errors when running.
*/
#define CUBLAS_CHECK(call)
/**
* \def CUDA_MEM(call, name)
* Measures the GPU memory used by a function call.
* \param call the function whose GPU memory usage is measured.
* \param name an identifier to use as a variable and when printing. Must satisfy variable naming.
*/
#define CUDA_MEM(call, name)
#endif
///////////////////
// KERNEL MACROS //
///////////////////
#ifdef CUDACC
#include <cublas_v2.h>
#include <cuda_runtime.h>
#define HD __host__ __device__
#define SHARED __shared__
#define DECLARE_KERNEL(call, ...) __global__ void call(__VA_ARGS__)
#define DEFINE_KERNEL(call, ...) \
template CudaTools::StreamID CudaTools::runKernel( \
void (*)(__VA_ARGS__), const CudaTools::Kernel::Settings&, __VA_ARGS__); \
__global__ void call(__VA_ARGS__)
#else
#define HD
#define SHARED
#define DECLARE_KERNEL(call, ...) void call(__VA_ARGS__)
#define DEFINE_KERNEL(call, ...) \
template CudaTools::StreamID CudaTools::runKernel( \
void (*)(__VA_ARGS__), const CudaTools::Kernel::Settings&, __VA_ARGS__); \
void call(__VA_ARGS__)
#endif // CUDACC
#define KERNEL(call, settings, ...) CudaTools::runKernel(call, settings, __VA_ARGS__)
///////////////////
// DEVICE MACROS //
///////////////////
#ifdef DEVICE
#define BASIC_LOOP(N) \
uint32_t iThread = blockIdx.x * blockDim.x + threadIdx.x; \
if (iThread < N)
#else
#define BASIC_LOOP(N) _Pragma("omp parallel for") for (uint32_t iThread = 0; iThread < N; ++iThread)
#endif
//////////////////
// CLASS MACROS //
//////////////////
#define UPDATE_FUNC(name) \
inline CudaTools::StreamID updateHost(const CudaTools::StreamID& stream = \
CudaTools::DEF_MEM_STREAM) { \
return CudaTools::pull(this, that(), sizeof(name)); \
}; \
inline CudaTools::StreamID updateDevice(const CudaTools::StreamID& stream = \
CudaTools::DEF_MEM_STREAM) { \
return CudaTools::push(this, that(), sizeof(name)); \
}
#ifdef CUDA
#define DEVICE_CLASS(name) \
private: \
name* __deviceInstance__ = nullptr; \
\
public: \
inline name* that() { return __deviceInstance__; } \
inline void allocateDevice() { __deviceInstance__ = (name*)CudaTools::malloc(sizeof(name)); }; \
UPDATE_FUNC(name)
#else
#define DEVICE_CLASS(name) \
public: \
inline name* that() { return this; }; \
inline void allocateDevice(){}; \
UPDATE_FUNC(name)
#endif
#ifndef CUDATOOLS_ARRAY_MAX_AXES
/**
* \def CUDATOOLS_ARRAY_MAX_AXES
* The maximum number of axes/dimensions a CudaTools::Array can have. The default is
* set to 4, but it can be manually set to fit the program's needs.
*/
#define CUDATOOLS_ARRAY_MAX_AXES 4
#endif
////////////////////
// Error Checking //
////////////////////
#ifndef NO_DIMENSION_CHECK
#ifdef DEVICE
#define CT_ERROR_IF(a, op, b, msg) \
if (a op b) { \
printf("[ERROR] %s:%d\n | %s: (" #a ") " #op " (" #b ").\n", __FILE__, __LINE__, msg); \
}
#define CT_ERROR(a, msg) \
if (a) { \
printf("[ERROR] %s:%d\n | %s: " #a ".\n", __FILE__, __LINE__, msg); \
}
#else
#define CT_ERROR_IF(a, op, b, msg) \
if (a op b) { \
std::ostringstream os_a; \
std::ostringstream os_b; \
os_a << a; \
os_b << b; \
printf("[ERROR] %s:%d\n | %s: (" #a ")%s " #op " (" #b ")%s.\n", __FILE__, __LINE__, msg, \
os_a.str().c_str(), os_b.str().c_str()); \
throw std::exception(); \
}
#define CT_ERROR(a, msg) \
if (a) { \
printf("[ERROR] %s:%d\n | %s: " #a ".\n", __FILE__, __LINE__, msg); \
throw std::exception(); \
}
#endif
#endif // NO_DIMENSION_CHECK
#if defined(CUDACC) && !defined(NO_CUDA_CHECK)
#define CUDA_CHECK(call) \
do { \
cudaError_t err = (call); \
if (err != cudaSuccess) { \
printf("[CUDA] %s:%d\n | %s\n", __FILE__, __LINE__, cudaGetErrorString(err)); \
throw std::exception(); \
} \
} while (0)
#define CUBLAS_CHECK(call) \
do { \
cublasStatus_t err = (call); \
if (err != CUBLAS_STATUS_SUCCESS) { \
printf("[cuBLAS] %s:%d\n | %s\n", __FILE__, __LINE__, \
CudaTools::cublasGetErrorString(err)); \
throw std::exception(); \
} \
} while (0)
#define CUDA_MEM(call, name) \
size_t free_bef_##name, free_aft_##name; \
cudaMemGetInfo(&free_bef_##name, NULL); \
call; \
CudaTools::Manager::get()->sync(); \
cudaMemGetInfo(&free_aft_##name, NULL); \
printf("[%s] GPU Memory Usage: %iMiB\n", #name, \
(free_bef_##name - free_aft_##name) / (1024 * 1024));
#else
#define CUDA_CHECK(call) (call)
#define CUBLAS_CHECK(call) (call)
#define CUDA_MEM(call, name) (call)
#endif
#endif // MACROS_H

@ -0,0 +1,95 @@
CC := g++-10
NVCC := nvcc
CFLAGS := -Wall -std=c++17 -fopenmp -MMD
NVCC_FLAGS := -MMD -w -Xcompiler
INCLUDE :=
LIBS_DIR :=
LIBS_DIR_GPU := /usr/local/cuda/lib64
LIBS :=
LIBS_GPU := cuda cudart cublas
TARGET = tests
SRC_DIR = .
BUILD_DIR = build
# Should not need to modify below.
CPU_BUILD_DIR = $(BUILD_DIR)/cpu
GPU_BUILD_DIR = $(BUILD_DIR)/gpu
SRC = $(wildcard $(SRC_DIR)/*/*.cpp) $(wildcard $(SRC_DIR)/*.cpp)
# Get source files and object files.
GCC_SRC = $(filter-out %.cu.cpp ,$(SRC))
NVCC_SRC = $(filter %.cu.cpp, $(SRC))
GCC_OBJ = $(GCC_SRC:$(SRC_DIR)/%.cpp=%.o)
NVCC_OBJ = $(NVCC_SRC:$(SRC_DIR)/%.cpp=%.o)
# If compiling for CPU, all go to GCC. Otherwise, they are split.
CPU_OBJ = $(addprefix $(CPU_BUILD_DIR)/,$(GCC_OBJ)) $(addprefix $(CPU_BUILD_DIR)/,$(NVCC_OBJ))
GPU_GCC_OBJ = $(addprefix $(GPU_BUILD_DIR)/,$(GCC_OBJ))
GPU_NVCC_OBJ = $(addprefix $(GPU_BUILD_DIR)/,$(NVCC_OBJ))
# $(info $$GCC_SRC is [${GCC_SRC}])
# $(info $$NVCC_SRC is [${NVCC_SRC}])
# $(info $$GCC_OBJ is [${GCC_OBJ}])
# $(info $$NVCC_OBJ is [${NVCC_OBJ}])
# $(info $$CPU_OBJ is [${CPU_OBJ}])
# $(info $$GPU_GCC_OBJ is [${GPU_GCC_OBJ}])
# $(info $$GPU_NVCC_OBJ is [${GPU_NVCC_OBJ}])
HEADER = $(wildcard $(SRC_DIR)/*/*.h) $(wildcard $(SRC_DIR)/*.h)
CPU_DEPS = $(wildcard $(CPU_BUILD_DIR)/*.d)
GPU_DEPS = $(wildcard $(GPU_BUILD_DIR)/*.d)
INC := $(INCLUDE:%=-I%)
LIB := $(LIBS_DIR:%=-L%)
LIB_GPU := $(LIBS_DIR_GPU:%=-L%)
LD := $(LIBS:%=-l%)
LD_GPU := $(LIBS_GPU:%=-l%)
# Reminder:
# $< = first prerequisite
# $@ = the target which matched the rule
# $^ = all prerequisites
.PHONY: all clean
all : cpu gpu
cpu: $(TARGET)CPU
gpu: $(TARGET)GPU
$(TARGET)CPU: $(CPU_OBJ)
$(CC) $(CFLAGS) $^ -o $@ $(INC) $(LIB) $(LDFLAGS)
$(CPU_BUILD_DIR)/%.o $(CPU_BUILD_DIR)/%.cu.o: $(SRC_DIR)/%.cpp | $(CPU_BUILD_DIR)
$(CC) $(CFLAGS) -c -o $@ $< $(INC)
# For GPU, we need to build the NVCC objects, the NVCC linked object, and the
# regular ones. Then, we link them all together.
$(TARGET)GPU: $(GPU_BUILD_DIR)/link.o $(GPU_GCC_OBJ) | $(GPU_BUILD_DIR)
$(CC) -g -DCUDA $(CFLAGS) $(GPU_NVCC_OBJ) $^ -o $@ $(INC) $(LIB) $(LIB_GPU) $(LD) $(LD_GPU)
$(GPU_BUILD_DIR)/link.o: $(GPU_NVCC_OBJ) | $(GPU_BUILD_DIR)
$(NVCC) --device-link $^ -o $@
$(GPU_BUILD_DIR)/%.cu.o: $(SRC_DIR)/%.cu.cpp | $(GPU_BUILD_DIR)
$(NVCC) $(NVCC_FLAGS) -DCUDA -x cu --device-c -o $@ $< $(INC)
$(GPU_BUILD_DIR)/%.o: $(SRC_DIR)/%.cpp | $(GPU_BUILD_DIR)
$(CC) $(CFLAGS) -g -DCUDA -c -o $@ $< $(INC)
-include $(CPU_DEPS)
-include $(GPU_DEPS)
$(CPU_BUILD_DIR):
mkdir -p $@
$(GPU_BUILD_DIR):
mkdir -p $@
clean:
rm -Rf $(BUILD_DIR) $(TARGET)CPU $(TARGET)GPU

@ -0,0 +1,40 @@
=========
CudaTools
=========
This is the documentation for CudaTools, a header-only library and framework
for the development of CPU-CUDA compatible applications. Using CudaTools enables
the creation of a single unified code that has both CPU and CUDA compilation targets with minimal need to
introduce ``#ifdef`` statements when code is essentially identical between the targets.
For information on the library itself and its usage, view the `documentation <https://acem.ece.illinois.edu/code/CudaTools>`__. The small code snippets and samples
seen in the documentation are in the folder ``samples``.
Dependencies
============
- Eigen
In the future, we will make this dependency optional, but still provide support
for it. As of now, it is necessary.
Building the Documentation
==========================
The documentation is built with `Doxygen <https://doxygen.nl/>`__ and `Sphinx <https://www.sphinx-doc.org/en>`__.
So, first make sure you have Doxygen installed on your system, and make sure it is added
to your system path. Then, you will have to create a Python virtual environment
in the repository folder
.. code-block:: bash
$ python3 -m venv .venv
After activating the virtual environment and installing the required Python packages
.. code-block:: bash
$ pip install -r requirements.txt
you can now run the script
.. code-block:: bash
$ ./build_docs

@ -0,0 +1,2 @@
doxygen docs/Doxyfile
sphinx-build -b html docs/source docs/build/html

File diff suppressed because it is too large

@ -0,0 +1,20 @@
# Minimal makefile for Sphinx documentation
#
# You can set these variables from the command line, and also
# from the environment for the first two.
SPHINXOPTS ?=
SPHINXBUILD ?= sphinx-build
SOURCEDIR = source
BUILDDIR = build
# Put it first so that "make" without argument is like "make help".
help:
@$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
.PHONY: help Makefile
# Catch-all target: route all unknown targets to Sphinx using the new
# "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS).
%: Makefile
@$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)

@ -0,0 +1,35 @@
@ECHO OFF
pushd %~dp0
REM Command file for Sphinx documentation
if "%SPHINXBUILD%" == "" (
set SPHINXBUILD=sphinx-build
)
set SOURCEDIR=source
set BUILDDIR=build
if "%1" == "" goto help
%SPHINXBUILD% >NUL 2>NUL
if errorlevel 9009 (
echo.
echo.The 'sphinx-build' command was not found. Make sure you have Sphinx
echo.installed, then set the SPHINXBUILD environment variable to point
echo.to the full path of the 'sphinx-build' executable. Alternatively you
echo.may add the Sphinx directory to PATH.
echo.
echo.If you don't have Sphinx installed, grab it from
echo.http://sphinx-doc.org/
exit /b 1
)
%SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O%
goto end
:help
%SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O%
:end
popd

@ -0,0 +1,39 @@
sections = {
"mesh_prep": 1,
"matrix_assembly": 2,
"bc_calc": 3,
"timestep": 4,
}
window.MathJax = {
loader: {load: ['[tex]/tagformat', '[tex]/ams']},
tex: {
packages: {'[+]': ['tagformat', 'ams']},
macros: {
dd: "{\\, \\mathrm{d}}",
E: "{\\mathbf{E}}",
H: "{\\mathbf{H}}",
J: "{\\mathbf{J}}",
D: "{\\mathbf{D}}",
B: "{\\mathbf{B}}",
M: "{\\mathbf{M}}",
tbE: "{\\tilde{\\E}}",
tbH: "{\\tilde{\\H}}",
tE: "{\\tilde{E}}",
tH: "{\\tilde{H}}",
tphi: "{\\tilde{\\phi}}",
curl: ["{\\nabla \\times {#1}}", 1],
div: ["{\\nabla \\cdot {#1}}", 1],
tens: ["{\\bar{\\bar{{#1}}}}", 1],
},
tags: 'ams',
tagformat: {
number: (n) => sections[window.location.pathname.split("/").pop().split(".")[0]] + '.' + n,
},
ams: {
multilineWidth: '100%',
multilineIndent: '50em'
}
},
}

@ -0,0 +1,26 @@
=======
Array.h
=======
The ``Array.h`` header file contains the Array class and its related classes. For this
file only, assume that every function is callable on both host and device unless
explicitly mentioned otherwise.
CudaTools::Shape
----------------
.. doxygenclass:: CudaTools::Shape
:members:
:allow-dot-graphs:
CudaTools::ArrayIterator<T>
---------------------------
.. doxygenclass:: CudaTools::ArrayIterator
:members:
:allow-dot-graphs:
CudaTools::Array<T>
-------------------
.. doxygenclass:: CudaTools::Array
:members:
:private-members:
:allow-dot-graphs:

@ -0,0 +1,45 @@
======
BLAS.h
======
The ``BLAS.h`` header file contains some BLAS functions, and some related
classes for those functions.
BLAS Functions
==============
Currently, these are the supported BLAS functions. They are inherited mainly
from the cuBLAS API and condensed into unified functions. The plan is to
add more as necessary.
CudaTools::BLAS::GEMV<T>
------------------------
.. doxygenfunction:: CudaTools::BLAS::GEMV
CudaTools::BLAS::GEMM<T>
------------------------
.. doxygenfunction:: CudaTools::BLAS::GEMM
CudaTools::BLAS::DGMM<T>
------------------------
.. doxygenfunction:: CudaTools::BLAS::DGMM
BLAS Classes
============
These classes also inherit functions from the cuBLAS API, but are packaged
into classes that are more intuitive and hide external details.
CudaTools::BLAS::Batch<T>
-------------------------
.. doxygenclass:: CudaTools::BLAS::Batch
:members:
CudaTools::BLAS::PLUArray<T>
----------------------------
.. doxygenclass:: CudaTools::BLAS::PLUArray
:members:
CudaTools::BLAS::PLUBatch<T>
----------------------------
.. doxygenclass:: CudaTools::BLAS::PLUBatch
:members:

@ -0,0 +1,53 @@
# Configuration file for the Sphinx documentation builder.
# -- Project information
project = 'DGEMS'
copyright = '2022'
author = 'Kenneth Jao, Qi Jian Lim'
release = '0.1'
version = '0.1.0'
# -- General configuration
html_static_path = ["_static"]
html_js_files = ["js/mathjax-config.js"]
extensions = [
'sphinx.ext.duration',
'sphinx.ext.doctest',
'sphinx.ext.autodoc',
'sphinx.ext.autosummary',
'sphinx.ext.autosectionlabel',
'sphinx.ext.intersphinx',
'sphinx.ext.mathjax',
'sphinx.ext.graphviz',
'sphinxcontrib.bibtex',
'breathe',
]
breathe_projects = {"DGEMS": "../build/xml"}
breathe_default_project = "DGEMS"
bibtex_bibfiles = ['refs.bib']
mathjax_path = "https://cdn.jsdelivr.net/npm/mathjax@3/es5/tex-mml-chtml.js"
intersphinx_mapping = {
'python': ('https://docs.python.org/3/', None),
'sphinx': ('https://www.sphinx-doc.org/en/master/', None),
}
intersphinx_disabled_domains = ['std']
templates_path = ['_templates']
# -- Options for HTML output
html_theme = 'sphinx_rtd_theme'
html_theme_options = {
'collapse_navigation': False,
}
# -- Options for EPUB output
epub_show_urls = 'footnote'

@ -0,0 +1,67 @@
======
Core.h
======
The ``Core.h`` header file defines several compiler flags and macros along with
a few core classes.
Flags
=====
Device Indicators
-----------------
.. doxygendefine:: CUDACC
.. doxygendefine:: DEVICE
Host-Device Automation
----------------------
.. doxygendefine:: HD
.. doxygendefine:: SHARED
Compilation Options
-------------------
.. doxygendefine:: CUDATOOLS_ARRAY_MAX_AXES
Macros
======
Kernel
------
.. doxygendefine:: DECLARE_KERNEL
.. doxygendefine:: DEFINE_KERNEL
.. doxygendefine:: KERNEL
Device Helpers
--------------
.. doxygendefine:: BASIC_LOOP
Device Class
------------
.. doxygendefine:: DEVICE_CLASS
Classes and Structs
===================
CudaTools::StreamID
-------------------
.. doxygenstruct:: CudaTools::StreamID
CudaTools::Manager
------------------
.. doxygenclass:: CudaTools::Manager
:members:
CudaTools::Kernel::Settings
---------------------------
.. doxygenstruct:: CudaTools::Kernel::Settings
:members:
CudaTools::Kernel::Basic
------------------------
.. doxygenfunction:: CudaTools::Kernel::basic

@ -0,0 +1,25 @@
=========
CudaTools
=========
This is the documentation for CudaTools, a header-only library and framework
for the development of CPU-CUDA compatible applications. Using CudaTools enables
the creation of a single unified code that has both CPU and CUDA compilation targets with minimal need to
introduce ``#ifdef`` statements when code is essentially identical between the targets.
To get started, please head over to the :doc:`usage` section. For more detail on the
machinery underneath, please refer to the other sections.
.. note::
If you would like to contribute, please visit the `git page <https://git.acem.ece.illinois.edu/kjao/CudaTools>`__.
Contents
========
.. toctree::
:maxdepth: 2
usage
core
array
blas

@ -0,0 +1,128 @@
==================
Usage and Examples
==================
This library is broken up into three main parts, as well as a certain
compilation and linking framework:
#. :ref:`Core Examples`
#. :ref:`Array Examples`
#. :ref:`BLAS Examples`
#. :ref:`Compilation and Linking`
The ``Core.h`` header contains the necessary macros, flags and objects for interfacing with
basic kernel launching and the CUDA Runtime API. The ``Array.h`` header contains the ``CudaTools::Array``
class, which provides a device-compatible array-like class with easy memory management. Next,
the ``BLAS.h`` header provides BLAS functions through the cuBLAS library on the GPU
and Eigen on the CPU. Lastly, a templated Makefile is provided which can be used
for your own project after following a few rules.
The usage of this library will be illustrated through examples, and further details
can be found in the other sections. The examples are given in the `samples <https://git.acem.ece.illinois.edu/kjao/CudaTools/src/branch/main/samples>`__ folder.
Throughout this documentation, there are a few common terms that may appear. First, we refer to the CPU as the host, and the GPU as the device. So, a host function refers
to a function runnable on the CPU, and a device function refers to a function that is runnable
on a device. A kernel is a specific function that the host can call to be run on the device.
Core Examples
=============
This file mainly introduces compiler macros and a few classes that are used to improve the
syntax between host and device code. To define and call a kernel, there are a few
macros provided. For example,
.. code-block:: cpp
DEFINE_KERNEL(add, int x, int y) {
printf("Kernel: %i\n", x + y);
}
int main() {
KERNEL(add, CudaTools::Kernel::basic(1), 1, 1); // Prints 2.
return 0;
}
The ``DEFINE_KERNEL(name, ...)`` macro takes in the function name and its arguments.
The second argument in the ``KERNEL()`` macro is the launch parameters for the
kernel. The launch parameters have several items, but for 'embarrassingly parallel'
cases, we can simply generate the settings with the number of threads. More detail on
creating launch parameters can be found :ref:`here <CudaTools::Kernel::Settings>`. In the above example,
there is only one thread. The rest of the arguments are just the kernel arguments. For more detail,
see :ref:`here <Macros>`.
.. warning::
These kernel definitions must be in a file that will be compiled by ``nvcc``. Also,
for header files, there is an additional macro ``DECLARE_KERNEL(name, ...)`` to declare it
and make it available to other files, as sketched below.
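For instance, a header/implementation split might look like the following. This is a minimal sketch; the file names and the ``scale`` kernel are illustrative and not part of the samples:
.. code-block:: cpp

    // kernels.h (illustrative name): declare the kernel so other files can call it.
    #include <Core.h>
    DECLARE_KERNEL(scale, int* data, int n);

    // kernels.cu.cpp (illustrative name): define the kernel; compiled by nvcc for the GPU target.
    #include "kernels.h"
    DEFINE_KERNEL(scale, int* data, int n) {
        BASIC_LOOP(n) { data[iThread] *= 2; }
    }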
Since many applications use classes, a macro is provided to 'convert' a class into
a device-compatible one. Following the previous example,
.. code-block:: cpp
class intPair {
DEVICE_CLASS(intPair)
public:
int x, y;
intPair(const int x_, const int y_) : x(x_), y(y_) {
allocateDevice(); // Allocates memory for this intPair on the device.
updateDevice().wait(); // Copies the memory on the host to the device and waits until finished.
};
HD void swap() {
int swap = x;
x = y;
y = swap;
};
};
DEFINE_KERNEL(swap, intPair* const pair) { pair->swap(); }
int main() {
intPair pair(1, 2);
printf("Before: %u, %u\n", pair.x, pair.y); // Prints 1, 2.
KERNEL(swap, CudaTools::Kernel::basic(1), pair.that()).wait();
pair.updateHost().wait(); // Copies the memory from the device back to the host and waits until finished.
printf("After: %u, %u\n", pair.x, pair.y); // Prints 2, 1.
return 0;
}
In this example, we create a class called ``intPair``, which is then made available on the device through
the ``DEVICE_CLASS(name)`` macro. Specifically, that macro introduces a few functions, like
``allocateDevice()``, ``updateDevice()``, ``updateHost()``, and ``that()``. That last function
returns a pointer to the copy on the device. For more details, see :ref:`here <Device Class>`. If we were to pass the host pointer of the ``intPair`` to the kernel, there would be an illegal memory access.
The kernel argument list **must** consist of pointers to objects, or non-reference objects.
Otherwise, compilation will fail. In general, this is safer, as it forces the programmer to
acknowledge that the device copy is being passed. For the latter case of a non-reference object,
you should only do this if there is no issue in creating a copy of the original object. In the above
example, we could have done this, but for more complicated classes it may result in unwanted behavior.
Lastly, since the point of classes is usually to have member functions, any member function
that should be available on the device must be marked with the compiler macro ``HD`` in front, as the sketch below restates.
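To restate these rules as a compact, purely illustrative sketch (the ``Counter`` class and kernel names are made up for this example):
.. code-block:: cpp

    #include <Core.h>

    class Counter {
        DEVICE_CLASS(Counter)
      public:
        int n = 0;
        HD void bump() { ++n; } // HD: callable on both host and device.
    };

    DECLARE_KERNEL(ok, Counter* const c); // Pointer to the device copy: allowed.
    DECLARE_KERNEL(alsoOk, int value);    // By-value copy of a simple object: allowed.
    // DECLARE_KERNEL(bad, Counter& c);   // Reference parameter: compilation will fail.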
We also introduce the ``wait()`` function, which waits for the command to complete before
continuing. Most calls that involve the device are asynchronous, so without proper blocking,
operations dependent on a previous command are not guaranteed to run correctly. If the code is
compiled for CPU, then everything will run synchronously, as per usual.
.. note::
Almost all functions that are asynchronous provide an optional 'stream' argument,
where you can give the name of the stream you wish to use. Different streams run
asynchronously with respect to each other, but operations on the same stream are FIFO. To define a stream to use
later, you must call ``CudaTools::Manager::get()->addStream("myStream")`` at some point
before you use it, as sketched after this note. For more details, see :ref:`here <CudaTools::Manager>`.
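A minimal sketch of registering a stream (only the registration call is shown here; passing the stream to an individual call is done through that call's optional stream argument):
.. code-block:: cpp

    #define CUDATOOLS_IMPLEMENTATION
    #include <Core.h>

    int main() {
        // Register the stream once, before any call that refers to it by name.
        CudaTools::Manager::get()->addStream("myStream");
        // ... asynchronous work targeting "myStream" would go here ...
        return 0;
    }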
Array Examples
==============
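This section is still a stub in this commit. As a rough sketch of the ``CudaTools::Array`` API exercised by ``tests.cu.cpp`` (factory constructors, reshaping, slicing, and host/device updates); the kernel and variable names here are illustrative:
.. code-block:: cpp

    #define CUDATOOLS_IMPLEMENTATION
    #include <Array.h>
    #include <Core.h>

    namespace CT = CudaTools;

    // Doubles every entry of a one-dimensional array (must live in a file compiled by nvcc).
    DEFINE_KERNEL(doubleAll, const CT::Array<int> arr) {
        BASIC_LOOP(arr.shape().items()) { arr[iThread] *= 2; }
    }

    int main() {
        CT::Array<int> A = CT::Array<int>::constant({16}, 1); // 16 ones.
        A.updateDevice().wait();                               // Host -> device.
        KERNEL(doubleAll, CT::Kernel::basic(A.shape().items()), A.view()).wait();
        A.updateHost().wait();                                 // Device -> host; entries are now 2.

        CT::Array<int> B = CT::Array<int>::range(0, 24); // 0, 1, ..., 23.
        B.reshape({2, 3, 4});
        CT::Array<int> Bslice = B.slice({{0, 1}, {1, 3}, {0, 4}}); // A view into B.
        int corner = (int)B[{0, 1, 0}];                            // Element access.
        (void)Bslice;
        (void)corner;
        return 0;
    }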
BLAS Examples
=============
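This section is also a stub in this commit. A rough sketch of a GEMM call, following the pattern used in ``tests.cu.cpp`` (the shapes chosen here are arbitrary):
.. code-block:: cpp

    #define CUDATOOLS_IMPLEMENTATION
    #include <Array.h>
    #include <BLAS.h>
    #include <Core.h>

    namespace CT = CudaTools;

    int main() {
        CT::Array<double> A({8, 4});
        CT::Array<double> B({4, 6});
        CT::Array<double> C({8, 6});
        A.setRandom(-1, 1);
        B.setRandom(-1, 1);
        A.updateDevice();
        B.updateDevice();
        C.updateDevice().wait();
        CT::BLAS::GEMM<double>(1.0, A, B, 0.0, C).wait(); // C = 1.0 * A * B + 0.0 * C.
        C.updateHost().wait();
        return 0;
    }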
Compilation and Linking
=======================

@ -0,0 +1,4 @@
Sphinx>=5.1.1
sphinx-rtd-theme>=1.0.0
sphinxcontrib-bibtex>=2.5.0
breathe>=4.34.0

@ -0,0 +1,95 @@
CC := g++-10
NVCC := nvcc
CFLAGS := -Wall -std=c++17 -fopenmp -MMD
NVCC_FLAGS := -MMD -w -Xcompiler
INCLUDE := ../../
LIBS_DIR :=
LIBS_DIR_GPU := /usr/local/cuda/lib64
LIBS :=
LIBS_GPU := cuda cudart cublas
TARGET = coreKernel
SRC_DIR = .
BUILD_DIR = build
# Should not need to modify below.
CPU_BUILD_DIR = $(BUILD_DIR)/cpu
GPU_BUILD_DIR = $(BUILD_DIR)/gpu
SRC = $(wildcard $(SRC_DIR)/*/*.cpp) $(wildcard $(SRC_DIR)/*.cpp)
# Get source files and object files.
GCC_SRC = $(filter-out %.cu.cpp ,$(SRC))
NVCC_SRC = $(filter %.cu.cpp, $(SRC))
GCC_OBJ = $(GCC_SRC:$(SRC_DIR)/%.cpp=%.o)
NVCC_OBJ = $(NVCC_SRC:$(SRC_DIR)/%.cpp=%.o)
# If compiling for CPU, all go to GCC. Otherwise, they are split.
CPU_OBJ = $(addprefix $(CPU_BUILD_DIR)/,$(GCC_OBJ)) $(addprefix $(CPU_BUILD_DIR)/,$(NVCC_OBJ))
GPU_GCC_OBJ = $(addprefix $(GPU_BUILD_DIR)/,$(GCC_OBJ))
GPU_NVCC_OBJ = $(addprefix $(GPU_BUILD_DIR)/,$(NVCC_OBJ))
# $(info $$GCC_SRC is [${GCC_SRC}])
# $(info $$NVCC_SRC is [${NVCC_SRC}])
# $(info $$GCC_OBJ is [${GCC_OBJ}])
# $(info $$NVCC_OBJ is [${NVCC_OBJ}])
# $(info $$CPU_OBJ is [${CPU_OBJ}])
# $(info $$GPU_GCC_OBJ is [${GPU_GCC_OBJ}])
# $(info $$GPU_NVCC_OBJ is [${GPU_NVCC_OBJ}])
HEADER = $(wildcard $(SRC_DIR)/*/*.h) $(wildcard $(SRC_DIR)/*.h)
CPU_DEPS = $(wildcard $(CPU_BUILD_DIR)/*.d)
GPU_DEPS = $(wildcard $(GPU_BUILD_DIR)/*.d)
INC := $(INCLUDE:%=-I%)
LIB := $(LIBS_DIR:%=-L%)
LIB_GPU := $(LIBS_DIR_GPU:%=-L%)
LD := $(LIBS:%=-l%)
LD_GPU := $(LIBS_GPU:%=-l%)
# Reminder:
# $< = first prerequisite
# $@ = the target which matched the rule
# $^ = all prerequisites
.PHONY: all clean
all : cpu gpu
cpu: $(TARGET)CPU
gpu: $(TARGET)GPU
$(TARGET)CPU: $(CPU_OBJ)
$(CC) $(CFLAGS) $^ -o $@ $(INC) $(LIB) $(LDFLAGS)
$(CPU_BUILD_DIR)/%.o $(CPU_BUILD_DIR)/%.cu.o: $(SRC_DIR)/%.cpp | $(CPU_BUILD_DIR)
$(CC) $(CFLAGS) -c -o $@ $< $(INC)
# For GPU, we need to build the NVCC objects, the NVCC linked object, and the
# regular ones. Then, we link them all together.
$(TARGET)GPU: $(GPU_BUILD_DIR)/link.o $(GPU_GCC_OBJ) | $(GPU_BUILD_DIR)
$(CC) -g -DCUDA $(CFLAGS) $(GPU_NVCC_OBJ) $^ -o $@ $(INC) $(LIB) $(LIB_GPU) $(LD) $(LD_GPU)
$(GPU_BUILD_DIR)/link.o: $(GPU_NVCC_OBJ) | $(GPU_BUILD_DIR)
$(NVCC) --device-link $^ -o $@
$(GPU_BUILD_DIR)/%.cu.o: $(SRC_DIR)/%.cu.cpp | $(GPU_BUILD_DIR)
$(NVCC) $(NVCC_FLAGS) -DCUDA -x cu --device-c -o $@ $< $(INC)
$(GPU_BUILD_DIR)/%.o: $(SRC_DIR)/%.cpp | $(GPU_BUILD_DIR)
$(CC) $(CFLAGS) -g -DCUDA -c -o $@ $< $(INC)
-include $(CPU_DEPS)
-include $(GPU_DEPS)
$(CPU_BUILD_DIR):
mkdir -p $@
$(GPU_BUILD_DIR):
mkdir -p $@
clean:
rm -Rf $(BUILD_DIR) $(TARGET)CPU $(TARGET)GPU

@ -0,0 +1,12 @@
#define CUDATOOLS_IMPLEMENTATION
#include <Core.h>
DEFINE_KERNEL(add, int x, int y) {
printf("Kernel: %i\n", x + y);
}
int main() {
KERNEL(add, CudaTools::Kernel::basic(1), 1, 1); // Prints 2.
return 0;
}

@ -0,0 +1,95 @@
CC := g++-10
NVCC := nvcc
CFLAGS := -Wall -std=c++17 -fopenmp -MMD
NVCC_FLAGS := -MMD -w -Xcompiler
INCLUDE := ../../
LIBS_DIR :=
LIBS_DIR_GPU := /usr/local/cuda/lib64
LIBS :=
LIBS_GPU := cuda cudart cublas
TARGET = coreClass
SRC_DIR = .
BUILD_DIR = build
# Should not need to modify below.
CPU_BUILD_DIR = $(BUILD_DIR)/cpu
GPU_BUILD_DIR = $(BUILD_DIR)/gpu
SRC = $(wildcard $(SRC_DIR)/*/*.cpp) $(wildcard $(SRC_DIR)/*.cpp)
# Get source files and object files.
GCC_SRC = $(filter-out %.cu.cpp ,$(SRC))
NVCC_SRC = $(filter %.cu.cpp, $(SRC))
GCC_OBJ = $(GCC_SRC:$(SRC_DIR)/%.cpp=%.o)
NVCC_OBJ = $(NVCC_SRC:$(SRC_DIR)/%.cpp=%.o)
# If compiling for CPU, all go to GCC. Otherwise, they are split.
CPU_OBJ = $(addprefix $(CPU_BUILD_DIR)/,$(GCC_OBJ)) $(addprefix $(CPU_BUILD_DIR)/,$(NVCC_OBJ))
GPU_GCC_OBJ = $(addprefix $(GPU_BUILD_DIR)/,$(GCC_OBJ))
GPU_NVCC_OBJ = $(addprefix $(GPU_BUILD_DIR)/,$(NVCC_OBJ))
# $(info $$GCC_SRC is [${GCC_SRC}])
# $(info $$NVCC_SRC is [${NVCC_SRC}])
# $(info $$GCC_OBJ is [${GCC_OBJ}])
# $(info $$NVCC_OBJ is [${NVCC_OBJ}])
# $(info $$CPU_OBJ is [${CPU_OBJ}])
# $(info $$GPU_GCC_OBJ is [${GPU_GCC_OBJ}])
# $(info $$GPU_NVCC_OBJ is [${GPU_NVCC_OBJ}])
HEADER = $(wildcard $(SRC_DIR)/*/*.h) $(wildcard $(SRC_DIR)/*.h)
CPU_DEPS = $(wildcard $(CPU_BUILD_DIR)/*.d)
GPU_DEPS = $(wildcard $(GPU_BUILD_DIR)/*.d)
INC := $(INCLUDE:%=-I%)
LIB := $(LIBS_DIR:%=-L%)
LIB_GPU := $(LIBS_DIR_GPU:%=-L%)
LD := $(LIBS:%=-l%)
LD_GPU := $(LIBS_GPU:%=-l%)
# Reminder:
# $< = first prerequisite
# $@ = the target which matched the rule
# $^ = all prerequisites
.PHONY: all clean
all : cpu gpu
cpu: $(TARGET)CPU
gpu: $(TARGET)GPU
$(TARGET)CPU: $(CPU_OBJ)
$(CC) $(CFLAGS) $^ -o $@ $(INC) $(LIB) $(LDFLAGS)
$(CPU_BUILD_DIR)/%.o $(CPU_BUILD_DIR)/%.cu.o: $(SRC_DIR)/%.cpp | $(CPU_BUILD_DIR)
$(CC) $(CFLAGS) -c -o $@ $< $(INC)
# For GPU, we need to build the NVCC objects, the NVCC linked object, and the
# regular ones. Then, we link them all together.
$(TARGET)GPU: $(GPU_BUILD_DIR)/link.o $(GPU_GCC_OBJ) | $(GPU_BUILD_DIR)
$(CC) -g -DCUDA $(CFLAGS) $(GPU_NVCC_OBJ) $^ -o $@ $(INC) $(LIB) $(LIB_GPU) $(LD) $(LD_GPU)
$(GPU_BUILD_DIR)/link.o: $(GPU_NVCC_OBJ) | $(GPU_BUILD_DIR)
$(NVCC) --device-link $^ -o $@
$(GPU_BUILD_DIR)/%.cu.o: $(SRC_DIR)/%.cu.cpp | $(GPU_BUILD_DIR)
$(NVCC) $(NVCC_FLAGS) -DCUDA -x cu --device-c -o $@ $< $(INC)
$(GPU_BUILD_DIR)/%.o: $(SRC_DIR)/%.cpp | $(GPU_BUILD_DIR)
$(CC) $(CFLAGS) -g -DCUDA -c -o $@ $< $(INC)
-include $(CPU_DEPS)
-include $(GPU_DEPS)
$(CPU_BUILD_DIR):
mkdir -p $@
$(GPU_BUILD_DIR):
mkdir -p $@
clean:
rm -Rf $(BUILD_DIR) $(TARGET)CPU $(TARGET)GPU

@ -0,0 +1,34 @@
#define CUDATOOLS_IMPLEMENTATION
#include <Core.h>
class intPair {
DEVICE_CLASS(intPair)
public:
int x, y;
intPair(const int x_, const int y_) : x(x_), y(y_) {
allocateDevice(); // Allocates memory for this intPair on the device.
updateDevice().wait(); // Copies the memory on the host to the device and waits until finished.
};
HD void swap() {
int swap = x;
x = y;
y = swap;
};
};
DEFINE_KERNEL(swap, intPair* const pair) { pair->swap(); }
int main() {
intPair pair(1, 2);
printf("Before: %u, %u\n", pair.x, pair.y); // Prints 1, 2.
KERNEL(swap, CudaTools::Kernel::basic(1), pair.that()).wait();
pair.updateHost().wait(); // Copies the memory from the device back to the host and waits until finished.
printf("After: %u, %u\n", pair.x, pair.y); // Prints 2, 1.
return 0;
}

@ -0,0 +1,494 @@
#define CUDATOOLS_IMPLEMENTATION
#define CUDATOOLS_ARRAY_MAX_AXES 8
#include "Array.h"
#include "BLAS.h"
#include "Core.h"
#include <Eigen/Core>
#include <chrono>
#include <complex>
namespace CT = CudaTools;
/////////////
// Helpers //
/////////////
#define TIME_START(name) auto begin_##name = std::chrono::steady_clock::now()
#define TIME_END(name) \
auto end_##name = std::chrono::steady_clock::now(); \
auto time_ms_##name = \
std::chrono::duration_cast<std::chrono::milliseconds>(end_##name - begin_##name).count(); \
auto time_mus_##name = \
std::chrono::duration_cast<std::chrono::microseconds>(end_##name - begin_##name).count(); \
if (time_ms_##name == 0) { \
printf("[%s] Time Elapsed: %ld[µs]\n", #name, time_mus_##name); \
} else { \
printf("[%s] Time Elapsed: %ld[ms]\n", #name, time_ms_##name); \
}
#define TIME(call, name) \
TIME_START(name); \
call; \
TIME_END(name);
#define TEST(predicate, name, msg) \
failed += (predicate) ? 0 : 1; \
printf("[%s] ", (predicate) ? "\033[1;32mPASS\033[0m" : "\033[1;31mFAIL\033[0m"); \
printf("%s | %s.\n", name, msg);
template <typename T> struct Type;
#define REGISTER_PARSE_TYPE(X) \
template <> struct Type<X> { static const std::string name; }; \
const std::string Type<X>::name = #X
REGISTER_PARSE_TYPE(uint8_t);
REGISTER_PARSE_TYPE(int16_t);
REGISTER_PARSE_TYPE(int32_t);
REGISTER_PARSE_TYPE(float);
REGISTER_PARSE_TYPE(double);
std::string box(std::string str) {
std::string tops(str.size() + 6, '#');
return tops + "\n## " + str + " ##\n" + tops;
}
std::string box2(std::string str) {
std::string tops(str.size() - 5, '-');
return tops + "\n|| " + str + " ||\n" + tops;
}
std::string boxSmall(std::string str) {
std::string tops(6, '-');
return tops + "[ " + str + " ]" + tops;
}
std::string separator() {
std::string line(40, '=');
return "\n" + line + "\n";
}
template <typename T> std::string type() { return "\033[1;96m" + Type<T>::name + "\033[0m"; }
CT::Shape makeRandom2DShape() {
std::random_device rd;
std::mt19937 mt(rd());
std::uniform_int_distribution<uint32_t> dist(1, 15);
return CT::Shape({dist(mt), dist(mt)});
}
///////////
// Tests //
///////////
class TestClass {
DEVICE_CLASS(TestClass);
public:
int x;
TestClass(const int x) : x(x) {
allocateDevice();
updateDevice().wait();
};
};
DEFINE_KERNEL(times, const CT::Array<int> arr) {
BASIC_LOOP(arr.shape().length()) { arr[iThread] *= 2; }
}
DEFINE_KERNEL(classTest, TestClass* const test) { test->x = 100; }
struct MacroTests {
static uint32_t Kernel() {
uint32_t failed = 0;
CT::Array<int> A = CT::Array<int>::constant({10}, 1);
A.updateDevice().wait();
KERNEL(times, CT::Kernel::basic(A.shape().items()), A.view()).wait();
A.updateHost().wait();
uint32_t errors = 0;
for (auto it = A.begin(); it != A.end(); ++it) {
if (*it != 2) ++errors;
}
std::ostringstream msg;
msg << "Errors: " << errors;
TEST(errors == 0, "Kernel", msg.str().c_str());
return failed;
};
static uint32_t Class() {
uint32_t failed = 0;
TestClass test(1);
KERNEL(classTest, CT::Kernel::basic(1), test.that()).wait();
test.updateHost().wait();
TEST(test.x == 100, "Class", "Errors: 0");
return failed;
}
};
template <typename T> struct ArrayTests {
static uint32_t Indexing() {
uint32_t failed = 0;
CT::Array<T> A = CT::Array<T>::range(0, 240);
A.reshape({5, 3, 1, 4, 2, 1, 1, 2});
uint32_t errors = 0;
for (uint32_t i = 0; i < 5; ++i) {
for (uint32_t j = 0; j < 3; ++j) {
for (uint32_t k = 0; k < 4; ++k) {
for (uint32_t l = 0; l < 2; ++l) {
for (uint32_t m = 0; m < 2; ++m) {
if ((T)A[i][j][0][k][l][0][0][m] != (T)A[{i, j, 0, k, l, 0, 0, m}]) {
++errors;
}
}
}
}
}
}
std::ostringstream msg;
msg << "Errors: " << errors;
TEST(errors == 0, "Element", msg.str().c_str());
errors = 0;
CT::Array<T> ApartGroup_1 = A[{2, 2}];
CT::Array<T> ApartIndiv_1 = A[2][2];
for (uint32_t k = 0; k < 4; ++k) {
for (uint32_t l = 0; l < 2; ++l) {
for (uint32_t m = 0; m < 2; ++m) {
if ((T)ApartIndiv_1[0][k][l][0][0][m] != (T)ApartGroup_1[{0, k, l, 0, 0, m}]) {
++errors;
}
}
}
}
msg.str("");
msg << "Errors: " << errors;
TEST(errors == 0, "Axis (1/2)", msg.str().c_str());
errors = 0;
CT::Array<T> ApartGroup_2 = A[{3, 2, 0, 3}];
CT::Array<T> ApartIndiv_2 = A[3][2][0][3];
for (uint32_t l = 0; l < 2; ++l) {
for (uint32_t m = 0; m < 2; ++m) {
if ((T)ApartIndiv_2[l][0][0][m] != (T)ApartGroup_2[{l, 0, 0, m}]) {
++errors;
}
}
}
msg.str("");
msg << "Errors: " << errors;
TEST(errors == 0, "Axis (2/2)", msg.str().c_str());
return failed;
};
static uint32_t Slicing() {
uint32_t failed = 0;
CT::Array<T> A = CT::Array<T>::constant({4, 5, 5}, 0);
CT::Array<T> Aslice = A.slice({{0, 3}, {1, 4}, {1, 4}});
T num = (T)1;
for (auto it = Aslice.begin(); it != Aslice.end(); ++it) {
*it = num;
++num;
}
CT::Array<T> Aslice2 = A[3].slice({{0, 5}, {0, 1}});
num = (T)-1;
for (auto it = Aslice2.begin(); it != Aslice2.end(); ++it) {
*it = num;
--num;
}
uint32_t errors = 0;
for (int i = 0; i < 3; ++i) {
for (int j = 0; j < 3; ++j) {
for (int k = 0; k < 3; ++k) {
if ((T)A[i][1 + j][1 + k] != (T)(9 * i + 3 * j + k + 1)) {
++errors;
}
}
}
}
std::ostringstream msg;
msg << "Errors: " << errors;
TEST(errors == 0, "Block", msg.str().c_str());
errors = 0;
for (int i = 0; i < 5; ++i) {
if ((T)A[3][i][0] != (T)(-(i + 1))) {
++errors;
}
}
msg.str("");
msg << "Errors: " << errors;
TEST(errors == 0, "Column", msg.str().c_str());
return failed;
}
};
template <typename T> struct BLASTests {
static double thres;
static uint32_t GEMV(int attempts) {
uint32_t failed = 0;
for (int i = 0; i < attempts; i++) {
CT::Shape Ashape = makeRandom2DShape();
CT::Shape xshape = CT::Shape({Ashape.cols(), 1});
CT::Shape yshape = CT::Shape({Ashape.rows(), 1});
CT::Array<T> A(Ashape);
CT::Array<T> x(xshape);
CT::Array<T> y(yshape);
A.setRandom(-100, 100);
x.setRandom(-100, 100);
A.updateDevice();
x.updateDevice().wait();
CT::BLAS::GEMV<T>(1.0, A, x, 0.0, y).wait();
y.updateHost().wait();
CT::Array<T> yTest(yshape, true);
yTest.eigenMap() = A.eigenMap() * x.eigenMap();
double norm = (yTest.eigenMap() - y.eigenMap()).norm();
std::ostringstream name;
name << "GEMV (" << i + 1 << "/" << attempts << ")";
std::ostringstream msg;
msg << "Matrix Shape: " << Ashape << ", "
<< "Residual: " << norm;
TEST(norm < thres, name.str().c_str(), msg.str().c_str());
}
return failed;
};
static uint32_t GEMVBroadcast() {
uint32_t failed = 0;
CT::Shape Ashape = makeRandom2DShape();
CT::Shape xshape = CT::Shape({Ashape.cols(), 1});
CT::Shape yshape = CT::Shape({Ashape.rows(), 1});
CT::Array<T> A({2, 3, Ashape.rows(), Ashape.cols()});
CT::Array<T> x({2, 3, xshape.rows(), xshape.cols()});
CT::Array<T> y({2, 3, yshape.rows(), yshape.cols()});
A.setRandom(-100, 100);
x.setRandom(-100, 100);
A.updateDevice();
x.updateDevice().wait();
CT::BLAS::GEMV<T>(1.0, A, x, 0.0, y).wait();
y.updateHost().wait();
double norm = 0;
CT::Array<T> yTest(yshape, true);
for (int i = 0; i < 2; ++i) {
for (int j = 0; j < 3; ++j) {
yTest.eigenMap() = A[i][j].eigenMap() * x[i][j].eigenMap();
norm += (yTest.eigenMap() - y[i][j].eigenMap()).norm();
}
}
std::ostringstream msg;
msg << "Matrix Shape: " << Ashape << ", "
<< "Residual: " << norm;
TEST(norm < thres, "GEMV Broadcast", msg.str().c_str());
return failed;
};
static uint32_t GEMM(int attempts) {
uint32_t failed = 0;
for (int i = 0; i < attempts; i++) {
CT::Shape Ashape = makeRandom2DShape();
CT::Shape Bshape = makeRandom2DShape();
Bshape = CT::Shape({Ashape.cols(), Bshape.cols()});
CT::Shape Cshape = CT::Shape({Ashape.rows(), Bshape.cols()});
CT::Array<T> A(Ashape);
CT::Array<T> B(Bshape);
CT::Array<T> C(Cshape);
A.setRandom(-100, 100);
B.setRandom(-100, 100);
C.setRandom(-100, 100);
A.updateDevice();
B.updateDevice();
C.updateDevice().wait();
CT::BLAS::GEMM<T>(1.0, A, B, 0.0, C).wait();
C.updateHost().wait();
CT::Array<T> CTest(Cshape, true);
CTest.eigenMap() = A.eigenMap() * B.eigenMap();
double norm = (CTest.eigenMap() - C.eigenMap()).norm();
std::ostringstream name;
name << "GEMM (" << i + 1 << "/" << attempts << ")";
std::ostringstream msg;
msg << "Matrix Shapes: " << Ashape << Bshape << ", "
<< "Residual: " << norm;
TEST(norm < thres, name.str().c_str(), msg.str().c_str());
}
return failed;
};
static uint32_t GEMMBroadcast() {
uint32_t failed = 0;
CT::Shape Ashape = makeRandom2DShape();
CT::Shape Bshape = makeRandom2DShape();
Bshape = CT::Shape({Ashape.cols(), Bshape.cols()});
CT::Shape Cshape = CT::Shape({Ashape.rows(), Bshape.cols()});
CT::Array<T> A({2, 3, Ashape.rows(), Ashape.cols()});
CT::Array<T> B({2, 3, Bshape.rows(), Bshape.cols()});
CT::Array<T> C({2, 3, Cshape.rows(), Cshape.cols()});
A.setRandom(-100, 100);
B.setRandom(-100, 100);
A.updateDevice();
B.updateDevice();
C.updateDevice().wait();
CT::BLAS::GEMM<T>(1.0, A, B, 0.0, C).wait();
C.updateHost().wait();
double norm = 0;
CT::Array<T> CTest(Cshape, true);
for (int i = 0; i < 2; ++i) {
for (int j = 0; j < 3; ++j) {
CTest.eigenMap() = A[i][j].eigenMap() * B[i][j].eigenMap();
norm += (CTest.eigenMap() - C[i][j].eigenMap()).norm();
}
}
std::ostringstream msg;
msg << "Matrix Shapes: " << Ashape << Bshape << ", "
<< "Residual: " << norm;
TEST(norm < thres, "GEMM Broadcast", msg.str().c_str());
return failed;
};
static uint32_t PLU() {
uint32_t failed = 0;
CT::Shape Ashape = makeRandom2DShape();
CT::Shape xshape = makeRandom2DShape();
Ashape = CT::Shape({Ashape.rows(), Ashape.rows()});
xshape = CT::Shape({Ashape.rows(), xshape.cols()});
CT::Array<T> A({2, 3, Ashape.rows(), Ashape.rows()});
CT::Array<T> x({2, 3, xshape.rows(), xshape.cols()});
CT::Array<T> b({2, 3, xshape.rows(), xshape.cols()});
CT::Array<T> Ax({2, 3, xshape.rows(), xshape.cols()});
A.setRandom(-100, 100);
b.setRandom(-100, 100);
CT::Array<T> LU(A.copy());
x = b;
A.updateDevice();
LU.updateDevice();
x.updateDevice().wait();
CT::BLAS::PLUBatch<T> luBatch(LU);
CT::BLAS::Batch<T> xBatch(x);
luBatch.computeLU().wait();
luBatch.solve(xBatch).wait();
// Compute Ax and compare difference.
CT::BLAS::GEMM<T>(1.0, A, x, 0.0, Ax).wait();
Ax.updateHost().wait();
double norm = 0;
for (int i = 0; i < 2; ++i) {
for (int j = 0; j < 3; ++j) {
norm += (Ax[i][j].eigenMap() - b[i][j].eigenMap()).norm();
}
}
std::ostringstream msg;
msg << "Matrix Shape: " << Ashape << xshape << ", "
<< "Residual: " << norm;
TEST(norm < thres, "PLU/Solve", msg.str().c_str());
return failed;
}
};
template <> double BLASTests<float>::thres = 10e-1;
template <> double BLASTests<double>::thres = 10e-8;
uint32_t doMacroTests() {
uint32_t failed = 0;
failed += MacroTests::Kernel();
failed += MacroTests::Class();
std::cout << "\n";
return failed;
}
template <typename T> uint32_t doArrayTests() {
uint32_t failed = 0;
std::cout << boxSmall("Index Tests : " + type<T>()) << "\n";
failed += ArrayTests<T>::Indexing();
std::cout << "\n" << boxSmall("Slice Tests : " + type<T>()) << "\n";
failed += ArrayTests<T>::Slicing();
std::cout << "\n";
return failed;
}
template <typename T> uint32_t doBLASTests() {
uint32_t failed = 0;
std::cout << boxSmall("GEMV Tests : " + type<T>()) << "\n";
failed += BLASTests<T>::GEMV(5);
failed += BLASTests<T>::GEMVBroadcast();
std::cout << "\n" << boxSmall("GEMM Tests : " + type<T>()) << "\n";
failed += BLASTests<T>::GEMM(5);
failed += BLASTests<T>::GEMMBroadcast();
std::cout << "\n" << boxSmall("PLU Tests : " + type<T>()) << "\n";
failed += BLASTests<T>::PLU();
std::cout << "\n";
return failed;
}
int main() {
uint32_t failed = 0;
std::cout << box("Macro Tests") << "\n";
failed += doMacroTests();
std::cout << box("Array Tests") << "\n";
// Test different sizes.
failed += doArrayTests<uint8_t>();
failed += doArrayTests<int16_t>();
failed += doArrayTests<int32_t>();
failed += doArrayTests<double>();
std::cout << box("BLAS Tests") << "\n";
failed += doBLASTests<float>();
failed += doBLASTests<double>();
constexpr uint32_t tests = 2 + 4 * 5 + 13 * 2;
std::ostringstream msg;
msg << ((failed == 0) ? "\033[1;32mPASS \033[0m(" : "\033[1;31mFAIL \033[0m(")
<< (tests - failed) << "/" << tests << ")";
std::cout << box2(msg.str()) << "\n";
return 0;
}