commit b4e4a49d44
26 changed files with 6120 additions and 0 deletions

@@ -0,0 +1,10 @@
---
BasedOnStyle: LLVM
IndentWidth: 4
ColumnLimit: 100
AllowShortIfStatementsOnASingleLine: true
---
Language: Cpp
DerivePointerAlignment: false
PointerAlignment: Left
---

@@ -0,0 +1,4 @@
build
*CPU
*GPU
.venv

@@ -0,0 +1,777 @@
#ifndef ARRAY_H |
||||||
|
#define ARRAY_H |
||||||
|
|
||||||
|
#include "Core.h" |
||||||
|
#include "Macros.h" |
||||||
|
#include <Eigen/Dense> |
||||||
|
#include <iomanip> |
||||||
|
#include <math.h> |
||||||
|
#include <random> |
||||||
|
#include <type_traits> |
||||||
|
|
||||||
|
#ifdef DEVICE |
||||||
|
#define POINTER pDevice |
||||||
|
#else |
||||||
|
#define POINTER pHost |
||||||
|
#endif |
||||||
|
|
||||||
|
namespace CudaTools { |
||||||
|
|
||||||
|
template <typename T> |
||||||
|
using EigenMat = Eigen::Matrix<T, Eigen::Dynamic, Eigen::Dynamic, Eigen::ColMajor>; |
||||||
|
template <typename T> using EigenMapMat = Eigen::Map<EigenMat<T>>; |
||||||
|
template <typename T> using ConstEigenMapMat = Eigen::Map<const EigenMat<T>>; |
||||||
|
|
||||||
|
template <typename T> struct EigenAdaptConst { typedef EigenMapMat<T> type; }; |
||||||
|
template <typename T> struct EigenAdaptConst<const T> { typedef ConstEigenMapMat<T> type; }; |
||||||
|
|
||||||
|
#define ENABLE_IF(X) std::enable_if_t<X, bool> |
||||||
|
#define IS_INT(T) std::is_integral<T>::value |
||||||
|
#define IS_FLOAT(T) std::is_floating_point<T>::value |
||||||
|
#define IS_NUM(T) IS_INT(T) or IS_FLOAT(T) |
||||||
|
|
||||||
|
template <typename T> class Array; |
||||||
|
using Slice = std::pair<uint32_t, uint32_t>; |
||||||
|
|
||||||
|
template <typename T> class ArrayIterator { |
||||||
|
private: |
||||||
|
template <typename U> |
||||||
|
friend std::ostream& operator<<(std::ostream& out, const ArrayIterator<U>& it); |
||||||
|
T* pData; |
||||||
|
Shape mShape; |
||||||
|
uint32_t mIndices[CUDATOOLS_ARRAY_MAX_AXES] = {0}; |
||||||
|
|
||||||
|
public: |
||||||
|
HD ArrayIterator(T* p, const Shape& shape) : pData(p), mShape(shape){}; |
||||||
|
|
||||||
|
/**
|
||||||
|
* Moves the iterator to the next value. |
||||||
|
*/ |
||||||
|
HD void next() { |
||||||
|
bool carry = false; |
||||||
|
uint32_t offset = 0; |
||||||
|
for (uint32_t iAxis = mShape.axes() - 1; iAxis < mShape.axes(); --iAxis) { |
||||||
|
if (mIndices[iAxis] == mShape.dim(iAxis) - 1) { |
||||||
|
mIndices[iAxis] = 0; |
||||||
|
offset += mShape.stride(iAxis) * (mShape.dim(iAxis) - 1); |
||||||
|
carry = true; |
||||||
|
} else { |
||||||
|
pData += mShape.stride(iAxis); |
||||||
|
mIndices[iAxis] += 1; |
||||||
|
carry = false; |
||||||
|
} |
||||||
|
|
||||||
|
if (not carry) { |
||||||
|
pData -= offset; |
||||||
|
return; |
||||||
|
} |
||||||
|
} |
||||||
|
pData += 1; // "Overflow" occurred, so we have reached the end of the array.
|
||||||
|
} |
||||||
|
|
||||||
|
/**
|
||||||
|
* Moves the iterator to the previous value. |
||||||
|
*/ |
||||||
|
HD void prev() { |
||||||
|
bool carry = false; |
||||||
|
uint32_t offset = 0; |
||||||
|
for (uint32_t iAxis = mShape.axes() - 1; iAxis < mShape.axes(); --iAxis) { |
||||||
|
if (mIndices[iAxis] == 0) { |
||||||
|
mIndices[iAxis] = mShape.dim(iAxis) - 1; |
||||||
|
offset += mShape.stride(iAxis) * (mShape.dim(iAxis) - 1); |
||||||
|
carry = true; |
||||||
|
} else { |
||||||
|
pData -= mShape.stride(iAxis); |
||||||
|
mIndices[iAxis] -= 1;
||||||
|
carry = false; |
||||||
|
} |
||||||
|
if (not carry) { |
||||||
|
pData += offset; |
||||||
|
return; |
||||||
|
} |
||||||
|
} |
||||||
|
pData -= 1; |
||||||
|
} |
||||||
|
|
||||||
|
/**
|
||||||
|
* Moves the iterator a specified value away. |
||||||
|
* \param amount the amount to advance by |
||||||
|
*/ |
||||||
|
HD void advance(const int32_t amount) { |
||||||
|
if (amount < 0) { |
||||||
|
for (uint32_t i = 0; i < abs(amount); ++i) { |
||||||
|
prev(); |
||||||
|
} |
||||||
|
} else { |
||||||
|
for (uint32_t i = 0; i < abs(amount); ++i) { |
||||||
|
next(); |
||||||
|
} |
||||||
|
} |
||||||
|
} |
||||||
|
|
||||||
|
HD void operator++() { next(); }; /**< Prefix increment operator. */ |
||||||
|
HD void operator--() { prev(); }; /**< Prefix decrement operator. */ |
||||||
|
|
||||||
|
/**< Addition operator. */ |
||||||
|
HD ArrayIterator<T> operator+(const int32_t v) const { |
||||||
|
ArrayIterator<T> it = *this; |
||||||
|
it.advance(v); |
||||||
|
return it; |
||||||
|
}; |
||||||
|
|
||||||
|
/** Subtraction operator.*/ |
||||||
|
HD ArrayIterator<T> operator-(const int32_t v) const { |
||||||
|
ArrayIterator<T> it = *this; |
||||||
|
it.advance(-v); |
||||||
|
return it; |
||||||
|
}; |
||||||
|
HD void operator+=(const int32_t v) { advance(v); }; |
||||||
|
HD void operator-=(const int32_t v) { advance(-v); }; |
||||||
|
|
||||||
|
HD T& operator*() { return *pData; }; /**< Dereference operator. */ |
||||||
|
HD const T& operator*() const { return *pData; }; /**< Const dereference operator. */ |
||||||
|
|
||||||
|
/**
|
||||||
|
* Equals operator. |
||||||
|
*/ |
||||||
|
HD bool operator==(const ArrayIterator<T>& it) { return pData == it.pData; } |
||||||
|
|
||||||
|
/**
|
||||||
|
* Not equals operator. |
||||||
|
*/ |
||||||
|
HD bool operator!=(const ArrayIterator<T>& it) { return pData != it.pData; } |
||||||
|
}; |
||||||
|
|
||||||
|
template <typename T> std::ostream& operator<<(std::ostream& out, const ArrayIterator<T>& it) { |
||||||
|
return out << it.pData; |
||||||
|
} |
||||||
|
|
||||||
|
template <typename T> class ArrayLoader { |
||||||
|
private: |
||||||
|
ArrayIterator<T> mIterator; |
||||||
|
ArrayIterator<T> mIteratorEnd; |
||||||
|
|
||||||
|
public: |
||||||
|
HD ArrayLoader(const ArrayIterator<T>& it, const ArrayIterator<T>& it_end) |
||||||
|
: mIterator(it), mIteratorEnd(it_end){}; |
||||||
|
HD ArrayLoader &operator,(const T value) { |
||||||
|
CT_ERROR_IF(mIterator, ==, mIteratorEnd, "Cannot assign more values than Array size"); |
||||||
|
*mIterator = value; |
||||||
|
++mIterator; |
||||||
|
return *this; |
||||||
|
} |
||||||
|
}; |
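// Illustrative usage sketch (not part of the original header): the ArrayLoader returned by
// Array::operator<< lets an Array be filled with stream-like syntax, assuming a 6-item Array.
//
//   CudaTools::Array<float> arr({2, 3});
//   arr << 1, 2, 3, 4, 5, 6; // operator<< assigns the first value, operator, the rest.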
||||||
|
|
||||||
|
/**
|
||||||
|
* A container that holds an N-dimensional array, stored column major. To set the
* maximum N, there is a compiler macro CUDATOOLS_ARRAY_MAX_AXES whose default value is 4.
||||||
|
* It adapts to operations between host and device to ease memory management. |
||||||
|
*/ |
||||||
|
template <typename T> class Array { |
||||||
|
private: |
||||||
|
template <typename U> friend std::ostream& operator<<(std::ostream&, const Array<U>&); |
||||||
|
|
||||||
|
Shape mShape; |
||||||
|
T* pHost = nullptr; |
||||||
|
T* pDevice = nullptr; |
||||||
|
|
||||||
|
bool mIsView = false; |
||||||
|
bool mIsSlice = false; |
||||||
|
|
||||||
|
uint32_t mEndOffset = 0; |
||||||
|
|
||||||
|
void freeArrays() { |
||||||
|
#ifndef DEVICE |
||||||
|
if (not mIsView) { |
||||||
|
if (pDevice != nullptr) CudaTools::free(pDevice); |
||||||
|
if (pHost != nullptr) delete[] pHost; |
||||||
|
} |
||||||
|
#endif |
||||||
|
}; |
||||||
|
|
||||||
|
HD void calcEnd() { |
||||||
|
uint32_t offset = 0; |
||||||
|
for (uint32_t i = 0; i < shape().axes(); ++i) { |
||||||
|
offset += (shape().dim(i) - 1) * shape().stride(i); |
||||||
|
} |
||||||
|
mEndOffset = offset + 1; |
||||||
|
}; |
||||||
|
|
||||||
|
public: |
||||||
|
HD Array() = default; |
||||||
|
|
||||||
|
/**
|
||||||
|
* Constructor for an Array that creates and allocates an array with
||||||
|
* the specified Shape. Construction in this format is disabled on the device. |
||||||
|
* \brief Host only |
||||||
|
* \param shape the shape of the array |
||||||
|
* \param noDevice if true, do not allocate the array on the device
||||||
|
*/ |
||||||
|
Array(const Shape& shape, const bool noDevice = false) : mShape(shape), mIsView(false) { |
||||||
|
pHost = new T[shape.items()]; |
||||||
|
calcEnd(); |
||||||
|
if (noDevice) return; |
||||||
|
pDevice = (T*)CudaTools::malloc(shape.items() * sizeof(T)); |
||||||
|
}; |
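// Illustrative usage sketch (not part of the original header): constructing an owning Array
// allocates a host buffer and, unless noDevice is set, a matching device buffer.
//
//   CudaTools::Array<double> A({4, 4});       // host and device allocations
//   CudaTools::Array<double> B({4, 4}, true); // host-only (noDevice = true)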
||||||
|
|
||||||
|
/**
|
||||||
|
* Constructor for an Array from an existing (preallocated) pointer. |
||||||
|
* \param pointer the pointer to use |
||||||
|
* \param shape the shape of the array |
||||||
|
* \param noDevice if true, do not allocate the array on the device
||||||
|
*/ |
||||||
|
HD Array(T* const pointer, const Shape& shape, const bool noDevice = false) |
||||||
|
: mShape(shape), mIsView(true), mIsSlice(false) { |
||||||
|
POINTER = pointer; |
||||||
|
calcEnd(); |
||||||
|
#ifndef DEVICE |
||||||
|
if (noDevice) return; |
||||||
|
pDevice = (T*)CudaTools::malloc(shape.items() * sizeof(T)); |
||||||
|
#endif |
||||||
|
}; |
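// Illustrative usage sketch (not part of the original header): wrapping a preallocated buffer
// produces a non-owning view over that memory; pass noDevice = true to skip the device copy.
//
//   std::vector<float> buf(12);
//   CudaTools::Array<float> view(buf.data(), {3, 4}, true); // never frees buf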
||||||
|
|
||||||
|
/**
|
||||||
|
* Constructor for making an Array view from another Array,
||||||
|
* given an offset and shape. |
||||||
|
* \param arr the original Array |
||||||
|
* \param shape the shape of the new array |
||||||
|
* \param offset the index at which to start the view of the array
||||||
|
*/ |
||||||
|
HD Array(const Array& arr, const Shape& shape, const uint32_t offset = 0) |
||||||
|
: mShape(shape), pHost(arr.pHost), pDevice(arr.pDevice), mIsView(true), |
||||||
|
mIsSlice(arr.mIsSlice) { |
||||||
|
calcEnd(); |
||||||
|
if (pHost != nullptr) pHost += offset; |
||||||
|
if (pDevice != nullptr) pDevice += offset; |
||||||
|
}; |
||||||
|
|
||||||
|
/**
|
||||||
|
* The copy-constructor for an Array. If this is not a view, a deep copy
||||||
|
* of the data will be performed on both host and device. On the device, it is always |
||||||
|
* treated like a view. |
||||||
|
*/ |
||||||
|
HD Array(const Array& arr) : mShape(arr.mShape), mIsView(arr.mIsView), mIsSlice(arr.mIsSlice) { |
||||||
|
calcEnd(); |
||||||
|
if (mIsView) { // If the other array was a view (and now this one), just assign.
|
||||||
|
pHost = arr.pHost; |
||||||
|
pDevice = arr.pDevice; |
||||||
|
return; |
||||||
|
} |
||||||
|
|
||||||
|
// Otherwise, we assume this needs to own its data.
|
||||||
|
pHost = new T[mShape.items()]; |
||||||
|
auto arr_it = arr.begin(); |
||||||
|
for (auto it = begin(); it != end(); ++it) { |
||||||
|
*it = *arr_it; |
||||||
|
++arr_it; |
||||||
|
} |
||||||
|
|
||||||
|
#ifndef DEVICE |
||||||
|
if (arr.pDevice != nullptr) { |
||||||
|
pDevice = (T*)CudaTools::malloc(mShape.items() * sizeof(T)); |
||||||
|
} |
||||||
|
#endif |
||||||
|
}; |
||||||
|
|
||||||
|
/**
|
||||||
|
* The move-constructor for an Array.
||||||
|
*/ |
||||||
|
HD Array(Array&& arr) |
||||||
|
: mShape(arr.mShape), pHost(arr.pHost), pDevice(arr.pDevice), mIsView(arr.mIsView), |
||||||
|
mIsSlice(arr.mIsSlice) { |
||||||
|
calcEnd(); |
||||||
|
// Make other object empty.
|
||||||
|
arr.pHost = nullptr; |
||||||
|
arr.pDevice = nullptr; |
||||||
|
arr.mIsView = true; |
||||||
|
}; |
||||||
|
|
||||||
|
HD ~Array() { freeArrays(); }; |
||||||
|
|
||||||
|
/**
|
||||||
|
* The copy-assignment operator for an Array. If this is not a view,
||||||
|
* then the currently owned data will be freed, and a deep copy of the data will |
||||||
|
* be performed on both host and device. On the device, it is always treated like a view. |
||||||
|
*/ |
||||||
|
HD Array& operator=(const Array& arr) { |
||||||
|
if (this == &arr) return *this; |
||||||
|
|
||||||
|
if (mIsView) { // If this array is a view, we assign data from the right-hand side.
|
||||||
|
auto arr_it = arr.begin(); |
||||||
|
for (auto it = begin(); it != end() and arr_it != arr.end(); ++it) { |
||||||
|
*it = *arr_it; |
||||||
|
++arr_it; |
||||||
|
} |
||||||
|
return *this; |
||||||
|
} |
||||||
|
|
||||||
|
// Otherwise, it is implied to be object reassignment.
|
||||||
|
mShape = arr.mShape; |
||||||
|
mIsView = arr.mIsView; |
||||||
|
mIsSlice = arr.mIsSlice; |
||||||
|
calcEnd(); |
||||||
|
|
||||||
|
// Regardless if the right-hand side is a view, we create a new copy.
|
||||||
|
// In case that the right-hand side is a view of this array, we
|
||||||
|
// allocate memory to copy first. Keep in mind that the right-hand side
|
||||||
|
// array will then become undefined.
|
||||||
|
|
||||||
|
// We can only do this on the host.
|
||||||
|
#ifndef DEVICE |
||||||
|
T* new_pDevice = nullptr; |
||||||
|
if (pDevice != nullptr) { |
||||||
|
new_pDevice = (T*)CudaTools::malloc(mShape.items() * sizeof(T)); |
||||||
|
} |
||||||
|
|
||||||
|
T* new_pHost = new T[mShape.items()]; |
||||||
|
memcpy(new_pHost, arr.pHost, mShape.items() * sizeof(T)); |
||||||
|
|
||||||
|
freeArrays(); |
||||||
|
pHost = new_pHost; |
||||||
|
pDevice = new_pDevice; |
||||||
|
#else |
||||||
|
pHost = arr.pHost; |
||||||
|
pDevice = arr.pDevice; |
||||||
|
#endif |
||||||
|
return *this; |
||||||
|
}; |
||||||
|
|
||||||
|
/**
|
||||||
|
* The move-assignment operator for an Array.
||||||
|
*/ |
||||||
|
HD Array& operator=(Array&& arr) { |
||||||
|
if (this == &arr) return *this; |
||||||
|
|
||||||
|
if (mIsView) { // If this array is a view, we assign data from the right-hand side.
|
||||||
|
auto arr_it = arr.begin(); |
||||||
|
for (auto it = begin(); it != end() and arr_it != arr.end(); ++it) { |
||||||
|
*it = *arr_it; |
||||||
|
++arr_it; |
||||||
|
} |
||||||
|
return *this; |
||||||
|
} |
||||||
|
|
||||||
|
CT_ERROR(arr.mIsView, |
||||||
|
"Cannot move-assign view to a non-view (owner). This would lead to undefined " |
||||||
|
"behavior."); |
||||||
|
|
||||||
|
// Otherwise, it is implied to be object reassignment.
|
||||||
|
freeArrays(); |
||||||
|
mShape = arr.mShape; |
||||||
|
pHost = arr.pHost; |
||||||
|
pDevice = arr.pDevice; |
||||||
|
mIsView = arr.mIsView; |
||||||
|
mIsSlice = arr.mIsSlice; |
||||||
|
calcEnd(); |
||||||
|
|
||||||
|
// Make other array empty.
|
||||||
|
arr.pHost = nullptr; |
||||||
|
arr.pDevice = nullptr; |
||||||
|
arr.mIsView = true; |
||||||
|
return *this; |
||||||
|
}; |
||||||
|
|
||||||
|
/**
|
||||||
|
* Used for indexing the Array. |
||||||
|
* \param index index of the first dimension |
||||||
|
*/ |
||||||
|
HD Array operator[](const uint32_t index) const { |
||||||
|
CT_ERROR_IF(index, >=, shape().dim(0), "Index exceeds axis size"); |
||||||
|
return Array(*this, shape().subshape(1), index * shape().stride(0)); |
||||||
|
}; |
||||||
|
|
||||||
|
/**
|
||||||
|
* Used for indexing the Array. |
||||||
|
* \param indices a list of indices to index the Array |
||||||
|
*/ |
||||||
|
HD Array operator[](const std::initializer_list<uint32_t> indices) const { |
||||||
|
CT_ERROR_IF(indices.size(), >, shape().axes(), |
||||||
|
"Number of indices cannot exceed number of axes"); |
||||||
|
auto it = indices.begin(); |
||||||
|
uint offset = 0; |
||||||
|
for (uint32_t i = 0; i < indices.size(); ++i) { |
||||||
|
uint32_t index = *it; |
||||||
|
CT_ERROR_IF(index, >=, shape().dim(i), "Index exceeds axis size"); |
||||||
|
offset += index * shape().stride(i); |
||||||
|
++it; |
||||||
|
} |
||||||
|
return Array(*this, shape().subshape(indices.size()), offset); |
||||||
|
}; |
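// Illustrative usage sketch (not part of the original header): operator[] returns views, so
// indexing never copies data; a full index is assumed here to read a single element through
// the implicit conversion operator.
//
//   CudaTools::Array<int> A({2, 3, 4});
//   auto row = A[1];      // view with shape {3, 4}
//   int v = A[{1, 2, 3}]; // scalar view converted to its value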
||||||
|
|
||||||
|
HD ArrayLoader<T> operator<<(const T value) { |
||||||
|
auto it = begin(); |
||||||
|
*it = value; |
||||||
|
++it; |
||||||
|
return ArrayLoader<T>(it, end()); |
||||||
|
}; |
||||||
|
|
||||||
|
HD T operator=(const T& value) { return POINTER[0] = value; }; |
||||||
|
HD operator T&() { return POINTER[0]; }; |
||||||
|
HD operator const T&() const { return POINTER[0]; }; |
||||||
|
|
||||||
|
/**
|
||||||
|
* Used to create slices of the Array. |
||||||
|
* \param slices a list of slices to slice the Array |
||||||
|
*/ |
||||||
|
HD Array slice(const std::initializer_list<Slice> slices) const { |
||||||
|
CT_ERROR_IF(slices.size(), >, shape().axes(), |
||||||
|
"Number of slices cannot exceed number of axes"); |
||||||
|
|
||||||
|
uint offset = 0; |
||||||
|
Shape new_shape = mShape; |
||||||
|
auto it = slices.begin(); |
||||||
|
for (uint32_t i = 0; i < slices.size(); ++i) { |
||||||
|
uint32_t from_index = it->first; |
||||||
|
uint32_t to_index = it->second; |
||||||
|
CT_ERROR_IF(from_index, >, to_index, |
||||||
|
"Slice start cannot be greater than than slice end"); |
||||||
|
CT_ERROR_IF(from_index, >=, shape().dim(i), "Slice start exceeds axis size"); |
||||||
|
CT_ERROR_IF(to_index - 1, >=, shape().dim(i), "Slice end exceeds axis size"); |
||||||
|
|
||||||
|
offset += from_index * shape().stride(i); |
||||||
|
new_shape.mAxisDim[i] = to_index - from_index; |
||||||
|
++it; |
||||||
|
} |
||||||
|
new_shape.mItems = 1; |
||||||
|
for (uint32_t i = 0; i < shape().axes(); ++i) { |
||||||
|
new_shape.mItems *= new_shape.dim(i); |
||||||
|
} |
||||||
|
|
||||||
|
Array<T> arr(*this, new_shape, offset); |
||||||
|
arr.mIsSlice = true; |
||||||
|
return arr; |
||||||
|
}; |
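// Illustrative usage sketch (not part of the original header): slice() takes one
// (start, end) pair per axis, end exclusive, and returns a strided view into the same data.
//
//   CudaTools::Array<float> A({4, 4});
//   auto block = A.slice({{1, 3}, {0, 2}}); // 2 x 2 view of rows 1-2, columns 0-1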
||||||
|
|
||||||
|
/**
|
||||||
|
* Returns this Array with a different Shape. Its self assigning version is reshape. |
||||||
|
* If this Array is a slice of another, then it will perform a deep copy, and return |
||||||
|
* a new non-view array. |
||||||
|
*/ |
||||||
|
HD Array reshaped(const Shape& new_shape) const { |
||||||
|
CT_ERROR_IF(shape().items(), !=, new_shape.items(), |
||||||
|
"New shape cannot have a different number of terms"); |
||||||
|
if (mIsSlice) { |
||||||
|
Array<T> arr = this->copy(); |
||||||
|
return arr.reshaped(new_shape); |
||||||
|
} |
||||||
|
Array<T> arr = view(); |
||||||
|
arr.mShape = new_shape; |
||||||
|
return arr; |
||||||
|
}; |
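// Illustrative usage sketch (not part of the original header): reshaped() returns a view with
// the new Shape when possible, and falls back to a deep copy when called on a slice.
//
//   CudaTools::Array<float> A({2, 6});
//   auto B = A.reshaped({3, 4}); // same 12 items viewed as 3 x 4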
||||||
|
|
||||||
|
HD void reshape(const Shape& new_shape) { |
||||||
|
CT_ERROR_IF(shape().items(), !=, new_shape.items(), |
||||||
|
"New shape cannot have a different number of terms"); |
||||||
|
CT_ERROR(mIsSlice, "Cannot reshape slice, a new array must be made. (Try reshaped instead)") |
||||||
|
mShape = new_shape; |
||||||
|
}; |
||||||
|
|
||||||
|
/**
|
||||||
|
* Gets a view that has at least two dimensions. Useful for promoting
||||||
|
* single vectors to their 2D counterparts. |
||||||
|
*/ |
||||||
|
HD Array atLeast2D() const { |
||||||
|
return (shape().axes() == 1) ? Array(*this, {shape().length(), 1}) : view(); |
||||||
|
}; |
||||||
|
|
||||||
|
/**
|
||||||
|
* Flattens the Array into one dimension. |
||||||
|
*/ |
||||||
|
HD Array flatten() const { return reshaped({mShape.mItems}); };
||||||
|
|
||||||
|
/**
|
||||||
|
* Returns the Eigen::Map of this Array. |
||||||
|
*/ |
||||||
|
typename EigenAdaptConst<T>::type eigenMap() const { |
||||||
|
uint32_t total_dim = mShape.mAxes; |
||||||
|
CT_ERROR(mIsSlice, "Mapping to an Eigen array cannot occur on slices") |
||||||
|
CT_ERROR_IF(total_dim, !=, 2, |
||||||
|
"Mapping to an Eigen array can only occur on two-dimensional arrays"); |
||||||
|
return typename EigenAdaptConst<T>::type(POINTER, mShape.rows(), mShape.cols()); |
||||||
|
}; |
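// Illustrative usage sketch (not part of the original header): eigenMap() exposes the host
// data as an Eigen::Map, so Eigen expressions read and write through the Array directly.
//
//   CudaTools::Array<double> A({3, 3});
//   A.eigenMap().setIdentity(); // A now holds the 3 x 3 identity on the host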
||||||
|
|
||||||
|
/**
|
||||||
|
* Gets the Shape of the Array. |
||||||
|
*/ |
||||||
|
HD Shape shape() const { return mShape; }; |
||||||
|
|
||||||
|
/**
|
||||||
|
* Gets the pointer to this array, depending on host or device. |
||||||
|
*/ |
||||||
|
HD T* data() const { return POINTER; }; |
||||||
|
|
||||||
|
/**
|
||||||
|
* Returns the device pointer regardless of host or device. |
||||||
|
*/ |
||||||
|
HD T* dataDevice() const { return pDevice; }; |
||||||
|
|
||||||
|
HD bool isView() const { return mIsView; }; /**< Gets whether this Array is a view. */ |
||||||
|
HD bool isSlice() const { return mIsSlice; }; /**< Gets whether this Array is a slice. */ |
||||||
|
|
||||||
|
/**
|
||||||
|
* Gets a view of this Array. |
||||||
|
*/ |
||||||
|
HD Array view() const { return Array(*this, mShape); } |
||||||
|
|
||||||
|
/**
|
||||||
|
* Copies this Array and returns a new Array that owns its own copy of the data.
||||||
|
*/ |
||||||
|
HD Array copy() const { |
||||||
|
Array<T> arr(mShape, (pDevice == nullptr)); |
||||||
|
|
||||||
|
auto arr_it = arr.begin(); |
||||||
|
for (auto it = begin(); it != end(); ++it) { |
||||||
|
*arr_it = *it; |
||||||
|
++arr_it; |
||||||
|
} |
||||||
|
#ifndef DEVICE |
||||||
|
if (pDevice != nullptr) { |
||||||
|
CudaTools::deviceCopy(pDevice, arr.dataDevice(), mShape.items() * sizeof(T)).wait(); |
||||||
|
} |
||||||
|
#endif |
||||||
|
return arr; |
||||||
|
}; |
||||||
|
|
||||||
|
/**
|
||||||
|
* Gets the iterator to the beginning of this Array. |
||||||
|
*/ |
||||||
|
HD ArrayIterator<T> begin() const { return ArrayIterator<T>(POINTER, mShape); }; |
||||||
|
|
||||||
|
/**
|
||||||
|
* Gets the iterator to the end of this Array. |
||||||
|
*/ |
||||||
|
HD ArrayIterator<T> end() const { return ArrayIterator<T>(POINTER + mEndOffset, mShape); }; |
||||||
|
|
||||||
|
/**
|
||||||
|
* Sets the values of the entire Array to a constant. This is restricted to numerical types. |
||||||
|
*/ |
||||||
|
HD void setConstant(const T value) const { |
||||||
|
static_assert(IS_NUM(T), "Function only available on numeric types."); |
||||||
|
for (auto it = begin(); it != end(); ++it) { |
||||||
|
*it = value; |
||||||
|
} |
||||||
|
}; |
||||||
|
|
||||||
|
/**
|
||||||
|
* Sets the Array values with uniform random values in a specified range. This is restricted to |
||||||
|
* numerical types. |
||||||
|
* \brief Host only |
||||||
|
*/ |
||||||
|
void setRandom(const T min, const T max) const { |
||||||
|
static_assert(IS_NUM(T), "Function only available on numeric types."); |
||||||
|
CT_ERROR_IF(max, <, min, "Upper bound of range cannot be smaller than lower bound");
||||||
|
std::random_device rd; |
||||||
|
std::mt19937 mt(rd()); |
||||||
|
if constexpr (IS_INT(T)) { |
||||||
|
std::uniform_int_distribution<T> dist(min, max); |
||||||
|
for (auto it = begin(); it != end(); ++it) { |
||||||
|
*it = dist(mt); |
||||||
|
} |
||||||
|
} else if constexpr (IS_FLOAT(T)) { |
||||||
|
std::uniform_real_distribution<T> dist(min, max); |
||||||
|
for (auto it = begin(); it != end(); ++it) { |
||||||
|
*it = dist(mt); |
||||||
|
} |
||||||
|
} |
||||||
|
}; |
||||||
|
|
||||||
|
/**
|
||||||
|
* Sets the Array values to start from a value and increment by a specified step. This is |
||||||
|
* restricted to numerical types. |
||||||
|
*/ |
||||||
|
HD void setRange(T min, const T step = 1) const { |
||||||
|
static_assert(IS_NUM(T), "Function only available on numeric types."); |
||||||
|
for (auto it = begin(); it != end(); ++it) { |
||||||
|
*it = min; |
||||||
|
min += step; |
||||||
|
} |
||||||
|
} |
||||||
|
/**
|
||||||
|
* Sets the Array values to be evenly spaced numbers over a given interval. This is restricted |
||||||
|
* to floating point types. |
||||||
|
*/ |
||||||
|
HD void setLinspace(const T min, const T max) const { |
||||||
|
static_assert(IS_FLOAT(T), "Function only available on numeric floating types."); |
||||||
|
CT_ERROR_IF(max, <, min, "Upper bound of range cannot be smaller than lower bound");
||||||
|
T i = 0; |
||||||
|
T d = max - min; |
||||||
|
T items = (T)(shape().items() - 1); |
||||||
|
for (auto it = begin(); it != end(); ++it) { |
||||||
|
*it = min + d * (i / items); |
||||||
|
i += 1; |
||||||
|
} |
||||||
|
}; |
||||||
|
|
||||||
|
/**
|
||||||
|
* Returns array of given shape with constant values. This is restricted to numerical types. |
||||||
|
* \brief Host only |
||||||
|
*/ |
||||||
|
static Array constant(const Shape& shape, const T value) { |
||||||
|
static_assert(IS_NUM(T), "Function only available on numeric types."); |
||||||
|
Array<T> arr(shape); |
||||||
|
arr.setConstant(value); |
||||||
|
return arr; |
||||||
|
}; |
||||||
|
|
||||||
|
/**
|
||||||
|
* Returns array of given shape with random values in given interval. This is restricted to |
||||||
|
* numerical types. |
||||||
|
* \brief Host only |
||||||
|
*/ |
||||||
|
static Array random(const Shape& shape, const T min, const T max) { |
||||||
|
static_assert(IS_NUM(T), "Function only available on numeric types."); |
||||||
|
Array<T> arr(shape); |
||||||
|
arr.setRandom(min, max); |
||||||
|
return arr; |
||||||
|
}; |
||||||
|
|
||||||
|
/**
|
||||||
|
* Returns evenly spaced values within a given interval. This is restricted to numerical types. |
||||||
|
* \brief Host only |
||||||
|
*/ |
||||||
|
static Array range(const T min, const T max, const T step = 1) { |
||||||
|
static_assert(IS_NUM(T), "Function only available on numeric types."); |
||||||
|
CT_ERROR_IF(max, <, min, "Upper bound of range cannot be smaller than lower bound");
||||||
|
Array<T> arr({(uint32_t)((max - min) / step)}); |
||||||
|
arr.setRange(min, step); |
||||||
|
return arr; |
||||||
|
} |
||||||
|
|
||||||
|
/**
|
||||||
|
* Returns evenly spaced values within a given interval. This is restricted to floating point |
||||||
|
* types. |
||||||
|
* \brief Host only |
||||||
|
*/ |
||||||
|
static Array linspace(const T min, const T max, const uint32_t size) { |
||||||
|
static_assert(IS_FLOAT(T), "Function only available on numeric floating types."); |
||||||
|
Array<T> arr({size}); |
||||||
|
arr.setLinspace(min, max); |
||||||
|
return arr; |
||||||
|
} |
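// Illustrative usage sketch (not part of the original header): the static factories build and
// fill an owning Array in one call.
//
//   auto r = CudaTools::Array<int>::range(0, 10);             // 0, 1, ..., 9
//   auto l = CudaTools::Array<double>::linspace(0.0, 1.0, 5); // 5 evenly spaced values
//   auto u = CudaTools::Array<float>::random({2, 2}, -1.0f, 1.0f);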
||||||
|
|
||||||
|
/**
|
||||||
|
* Transposes the internal data and returns the corresponding new Array. |
||||||
|
* Its self assigning version is transpose. This is restricted to numerical types. |
||||||
|
* \brief Host only |
||||||
|
*/ |
||||||
|
Array transposed() const { |
||||||
|
static_assert(IS_NUM(T), "Function only available on numeric types."); |
||||||
|
CT_ERROR_IF(shape().axes(), !=, 2, "Tranpose can only occur on two-dimensional arrays"); |
||||||
|
Array<T> new_arr({mShape.cols(), mShape.rows()});
||||||
|
new_arr.eigenMap() = this->eigenMap().transpose().eval(); |
||||||
|
return new_arr; |
||||||
|
}; |
||||||
|
|
||||||
|
/**
|
||||||
|
* Transposes the internal data in place. Its non-mutating version is transposed.
||||||
|
* This is restricted to numerical types. |
||||||
|
* \brief Host only |
||||||
|
*/ |
||||||
|
void transpose() { |
||||||
|
static_assert(IS_NUM(T), "Function only available on numeric types."); |
||||||
|
CT_ERROR_IF(shape().axes(), !=, 2, "Tranpose can only occur on two-dimensional arrays"); |
||||||
|
Array<T> new_arr(*this, {mShape.cols(), mShape.rows()}); |
||||||
|
new_arr.eigenMap() = this->eigenMap().transpose().eval(); |
||||||
|
mShape = Shape({mShape.cols(), mShape.rows()}); |
||||||
|
}; |
||||||
|
|
||||||
|
/**
* Returns the inverse of this Array, computed with Eigen on the host. This is restricted
* to floating numeric types.
* \brief Host only
*/
Array<T> inverse() const {
||||||
|
static_assert(IS_FLOAT(T), "Function only available on floating numeric types."); |
||||||
|
CT_ERROR_IF(shape().axes(), !=, 2, "Inverse can only occur on two-dimensional arrays"); |
||||||
|
CT_ERROR_IF(shape().rows(), !=, shape().cols(), |
||||||
|
"Inverse can only occur on square matrices"); |
||||||
|
Array<T> inv(shape()); |
||||||
|
inv.eigenMap() = this->eigenMap().inverse();
return inv;
};
||||||
|
|
||||||
|
/**
|
||||||
|
* Pins the memory (page locks) for faster memory transfer in concurrent |
||||||
|
* transfers. |
||||||
|
* \brief Host only |
||||||
|
*/ |
||||||
|
void pinMemory() const { CudaTools::pin(pHost, mShape.items() * sizeof(T)); }; |
||||||
|
|
||||||
|
/**
|
||||||
|
* Updates the host copy by copying the device data back to the host. |
||||||
|
* \brief Host only |
||||||
|
*/ |
||||||
|
StreamID updateHost(const StreamID& stream = DEF_MEM_STREAM) const { |
||||||
|
CT_ERROR(mIsView, "Cannot update host on a view"); |
||||||
|
CudaTools::pull(pHost, pDevice, mShape.items() * sizeof(T), stream); |
||||||
|
return stream; |
||||||
|
}; |
||||||
|
|
||||||
|
/**
|
||||||
|
* Updates the device copy by copying the host data to the device. |
||||||
|
* \brief Host only |
||||||
|
*/ |
||||||
|
StreamID updateDevice(const StreamID& stream = DEF_MEM_STREAM) const { |
||||||
|
CT_ERROR(mIsView, "Cannot update device on a view"); |
||||||
|
CudaTools::push(pHost, pDevice, mShape.items() * sizeof(T), stream); |
||||||
|
return stream; |
||||||
|
}; |
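// Illustrative usage sketch (not part of the original header): host and device copies are kept
// in sync explicitly; updateDevice()/updateHost() return the StreamID so the transfer can be
// waited on.
//
//   CudaTools::Array<float> A({256});
//   A.setConstant(1.0f);
//   A.updateDevice().wait(); // push host data to the device
//   // ... run device code that modifies A ...
//   A.updateHost().wait();   // pull the results back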
||||||
|
}; |
||||||
|
|
||||||
|
template <typename T> |
||||||
|
void printAxis(std::ostream& out, const Array<T>& arr, const uint32_t axis, size_t width) { |
||||||
|
std::string space = std::string(2 * axis, ' '); |
||||||
|
if (arr.shape().axes() == 1) { |
||||||
|
out << "["; |
||||||
|
for (uint32_t i = 0; i < arr.shape().items(); ++i) { |
||||||
|
if constexpr (std::is_floating_point<T>::value) { |
||||||
|
out << std::scientific << std::setprecision(6); |
||||||
|
} |
||||||
|
if (width == 0) { |
||||||
|
out << ((i == 0) ? "" : " "); |
||||||
|
} else { |
||||||
|
out << std::setw((i == 0) ? width - 1 : width); |
||||||
|
} |
||||||
|
out << (T)arr[i] << ((i == arr.shape().items() - 1) ? "]" : ","); |
||||||
|
} |
||||||
|
} else if (arr.shape().axes() == 2) { |
||||||
|
for (uint32_t i = 0; i < arr.shape().dim(0); ++i) { |
||||||
|
out << space << ((i == 0) ? "[" : " "); |
||||||
|
printAxis(out, arr[i], axis + 1, width); |
||||||
|
out << ((i == arr.shape().dim(0) - 1) ? "]" : ",\n"); |
||||||
|
} |
||||||
|
} else { |
||||||
|
out << space << "[\n"; |
||||||
|
for (uint32_t i = 0; i < arr.shape().dim(0); ++i) { |
||||||
|
printAxis(out, arr[i], axis + 1, width); |
||||||
|
out << ((i == arr.shape().dim(0) - 1) ? "\n" : ",\n\n"); |
||||||
|
} |
||||||
|
out << space << "]"; |
||||||
|
} |
||||||
|
} |
||||||
|
|
||||||
|
template <typename T> std::ostream& operator<<(std::ostream& out, const Array<T>& arr) { |
||||||
|
size_t width = 0; |
||||||
|
if constexpr (IS_INT(T)) {
||||||
|
T max_val = 0; |
||||||
|
bool negative = false; |
||||||
|
for (auto it = arr.begin(); it != arr.end(); ++it) { |
||||||
|
if (*it < 0) negative = true; |
||||||
|
max_val = (abs(*it) > max_val) ? abs(*it) : max_val; |
||||||
|
} |
||||||
|
width = std::to_string(max_val).size() + 1; |
||||||
|
width += (negative) ? 1 : 0; |
||||||
|
} else if constexpr (IS_FLOAT(T)) { |
||||||
|
T max_val = 0; |
||||||
|
bool negative = false; |
||||||
|
for (auto it = arr.begin(); it != arr.end(); ++it) { |
||||||
|
if (*it < 0) negative = true; |
||||||
|
int exp = 0; |
||||||
|
frexp(*it, &exp); |
||||||
|
max_val = (exp > max_val) ? exp : max_val; |
||||||
|
} |
||||||
|
width = std::to_string(max_val).size() + 5; |
||||||
|
width += (negative) ? 1 : 0; |
||||||
|
} |
||||||
|
|
||||||
|
printAxis<T>(out, arr, 0, (arr.shape().axes() == 1) ? 0 : width); |
||||||
|
return out; |
||||||
|
} |
||||||
|
|
||||||
|
}; // namespace CudaTools
|
||||||
|
|
||||||
|
#endif // ARRAY_H

@@ -0,0 +1,600 @@
#ifndef BLAS_H |
||||||
|
#define BLAS_H |
||||||
|
|
||||||
|
#include "Array.h" |
||||||
|
#include "Core.h" |
||||||
|
#include "Macros.h" |
||||||
|
|
||||||
|
namespace CudaTools { |
||||||
|
|
||||||
|
namespace BLAS { |
||||||
|
|
||||||
|
struct BatchInfo { |
||||||
|
uint32_t strideA, strideB, strideC; |
||||||
|
uint32_t size; |
||||||
|
}; |
||||||
|
|
||||||
|
template <typename T> struct Check { |
||||||
|
static void isAtLeast2D(const Array<T>& arr, const std::string& name = "Array") { |
||||||
|
CT_ERROR_IF(arr.shape().axes(), <, 2, (name + " needs to be at least 2D").c_str()); |
||||||
|
}; |
||||||
|
|
||||||
|
static void isSquare(const Array<T>& arr, const std::string& name = "Array") { |
||||||
|
isAtLeast2D(arr, name); |
||||||
|
CT_ERROR_IF(arr.shape().rows(), !=, arr.shape().cols(), (name + " is not square").c_str()) |
||||||
|
}; |
||||||
|
|
||||||
|
static void isValidMatmul(const Array<T>& A, const Array<T>& B, const Array<T>& C, |
||||||
|
const std::string& nameA = "A", const std::string& nameB = "B", |
||||||
|
const std::string nameC = "C") { |
||||||
|
isAtLeast2D(A, nameA); |
||||||
|
isAtLeast2D(B, nameB); |
||||||
|
isAtLeast2D(C, nameC);
||||||
|
CT_ERROR_IF(A.shape().cols(), !=, B.shape().rows(), |
||||||
|
(nameA + nameB + " is not a valid matrix multiplication").c_str()); |
||||||
|
|
||||||
|
Shape ABshape({A.shape().rows(), B.shape().cols()}); |
||||||
|
Shape Cshape({C.shape().rows(), C.shape().cols()}); |
||||||
|
|
||||||
|
CT_ERROR_IF( |
||||||
|
ABshape, !=, Cshape, |
||||||
|
("The shape of " + nameA + nameB + " does not match the shape of " + nameC).c_str()); |
||||||
|
}; |
||||||
|
|
||||||
|
static uint32_t getUpperItems(const Array<T>& arr) { |
||||||
|
uint32_t upperItems = 1; |
||||||
|
for (uint32_t iAxis = 0; iAxis < arr.shape().axes() - 2; ++iAxis) { |
||||||
|
upperItems *= arr.shape().dim(iAxis); |
||||||
|
} |
||||||
|
return upperItems; |
||||||
|
}; |
||||||
|
|
||||||
|
static void matchUpperShape(const Array<T>& A, const Array<T>& B, |
||||||
|
const std::string& nameA = "A", const std::string& nameB = "B") { |
||||||
|
CT_ERROR_IF(A.shape().axes(), !=, B.shape().axes(), |
||||||
|
(nameA + " and " + nameB + " shapes do not match for broadcasting").c_str()); |
||||||
|
for (uint32_t iAxis = 0; iAxis < A.shape().axes() - 2; ++iAxis) { |
||||||
|
uint32_t Adim = A.shape().dim(iAxis); |
||||||
|
uint32_t Bdim = B.shape().dim(iAxis); |
||||||
|
CT_ERROR_IF( |
||||||
|
Adim, !=, Bdim, |
||||||
|
(nameA + " and " + nameB + " shapes do not match for broadcasting").c_str()); |
||||||
|
} |
||||||
|
}; |
||||||
|
|
||||||
|
static BatchInfo isBroadcastable(const Array<T>& A, const Array<T>& B, const Array<T>& C, |
||||||
|
const std::string& nameA = "A", const std::string& nameB = "B", |
||||||
|
const std::string nameC = "C") { |
||||||
|
isValidMatmul(A, B, C, nameA, nameB, nameC); |
||||||
|
uint32_t itemsA = getUpperItems(A); |
||||||
|
uint32_t itemsB = getUpperItems(B); |
||||||
|
uint32_t itemsC = getUpperItems(C); |
||||||
|
|
||||||
|
uint32_t Asize = A.shape().rows() * A.shape().cols(); |
||||||
|
uint32_t Bsize = B.shape().rows() * B.shape().cols(); |
||||||
|
uint32_t Csize = C.shape().rows() * C.shape().cols(); |
||||||
|
|
||||||
|
if (itemsA == itemsB) { |
||||||
|
CT_ERROR_IF(itemsA, !=, itemsC, |
||||||
|
("Incorrect dimensions to broadcast to output " + nameC).c_str()); |
||||||
|
matchUpperShape(A, B, nameA, nameB); |
||||||
|
matchUpperShape(A, C, nameA, nameC); |
||||||
|
return BatchInfo{Asize, Bsize, Csize, itemsC}; |
||||||
|
} else if (itemsA > itemsB) { |
||||||
|
CT_ERROR_IF( |
||||||
|
itemsB, !=, 1, |
||||||
|
("Cannot broadcast operation to " + nameB + " with non-matching " + nameA).c_str()); |
||||||
|
CT_ERROR_IF(itemsA, !=, itemsC, |
||||||
|
("Incorrect dimensions to broadcast to output " + nameC).c_str()); |
||||||
|
matchUpperShape(A, C, nameA, nameC); |
||||||
|
return BatchInfo{Asize, 0, Csize, itemsC}; |
||||||
|
} else { |
||||||
|
CT_ERROR_IF( |
||||||
|
itemsA, !=, 1, |
||||||
|
("Cannot broadcast operation to " + nameA + " with non-matching " + nameB).c_str()); |
||||||
|
CT_ERROR_IF(itemsB, !=, itemsC,
||||||
|
("Incorrect dimensions to broadcast to output " + nameC).c_str()); |
||||||
|
matchUpperShape(B, C, nameB, nameC); |
||||||
|
return BatchInfo{0, Bsize, Csize, itemsC}; |
||||||
|
} |
||||||
|
}; |
||||||
|
}; |
||||||
|
|
||||||
|
/**
|
||||||
|
* Represents a Batch of Arrays with the same shape. Mainly used for cuBLAS functions. |
||||||
|
*/ |
||||||
|
template <typename T> class Batch { |
||||||
|
protected: |
||||||
|
Array<T*> mBatch; |
||||||
|
Shape mShape; |
||||||
|
|
||||||
|
uint32_t mCount = 0; |
||||||
|
uint32_t mBatchSize; |
||||||
|
|
||||||
|
public: |
||||||
|
Batch() = delete; |
||||||
|
|
||||||
|
/**
|
||||||
|
* Constructs a batch from a given size. |
||||||
|
*/ |
||||||
|
Batch(const uint32_t size) : mBatchSize(size){}; |
||||||
|
|
||||||
|
/**
|
||||||
|
* Constructs a batch from a non-view Array. |
||||||
|
*/ |
||||||
|
Batch(const Array<T>& arr) { |
||||||
|
CT_ERROR(arr.isView(), "Array cannot be a view"); |
||||||
|
mShape = Shape({arr.shape().rows(), arr.shape().cols()}); |
||||||
|
mBatchSize = mCount = Check<T>::getUpperItems(arr); |
||||||
|
|
||||||
|
mBatch = Array<T*>({mBatchSize}); |
||||||
|
|
||||||
|
Array<T> batch = arr.reshaped({mBatchSize, mShape.rows(), mShape.cols()}); |
||||||
|
for (uint32_t i = 0; i < mBatchSize; ++i) { |
||||||
|
#ifdef CUDA |
||||||
|
mBatch[i] = batch[i].dataDevice(); |
||||||
|
#else |
||||||
|
mBatch[i] = batch[i].data(); |
||||||
|
#endif |
||||||
|
} |
||||||
|
|
||||||
|
mBatch.updateDevice().wait(); |
||||||
|
}; |
||||||
|
|
||||||
|
/**
|
||||||
|
* Adds a matrix to the batch. Array must be a view. |
||||||
|
*/ |
||||||
|
void add(const Array<T>& arr) { |
||||||
|
CT_ERROR(not arr.isView(), "Cannot add non-view Arrays"); |
||||||
|
CT_ERROR_IF(mCount, ==, mBatchSize, "Batch is full, cannot add more arrays"); |
||||||
|
#ifdef CUDA |
||||||
|
mBatch[mCount] = arr.dataDevice(); |
||||||
|
#else |
||||||
|
mBatch[mCount] = arr.data(); |
||||||
|
#endif |
||||||
|
if (mCount == 0) { |
||||||
|
mShape = arr.shape(); |
||||||
|
||||||
|
} else { |
||||||
|
CT_ERROR_IF(arr.shape(), !=, mShape, "Cannot add matrix of different shape to batch"); |
||||||
|
} |
||||||
|
++mCount; |
||||||
|
|
||||||
|
if (mCount == mBatchSize) { |
||||||
|
mBatch.updateDevice().wait(); |
||||||
|
} |
||||||
|
}; |
||||||
|
|
||||||
|
/**
|
||||||
|
* Indexing operator which returns a view of the Array in the Batch at the given index. |
||||||
|
*/ |
||||||
|
Array<T> operator[](const uint32_t index) const { |
||||||
|
CT_ERROR_IF(index, >=, mBatchSize, "Index exceeds batch size"); |
||||||
|
return Array<T>(mBatch[index], {mShape.rows(), mShape.cols()}); |
||||||
|
}; |
||||||
|
|
||||||
|
/**
|
||||||
|
* Returns the batch Array of pointers. |
||||||
|
*/ |
||||||
|
Array<T*> batch() const { return mBatch.view(); }; |
||||||
|
Shape shape() const { return mShape; } /**< Gets the shape of the matrices in the batch. */ |
||||||
|
uint32_t size() const { return mBatchSize; } /**< Gets the batch size.*/ |
||||||
|
bool full() const { return mBatchSize == mCount; }; /**< Gets if the batch is full. */ |
||||||
|
}; |
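// Illustrative usage sketch (not part of the original header): a Batch collects device (or
// host) pointers to equally shaped matrices, here taken from one higher-dimensional Array.
//
//   CudaTools::Array<double> stack({8, 4, 4});   // 8 matrices of shape 4 x 4
//   CudaTools::BLAS::Batch<double> batch(stack); // batch size 8
//   auto first = batch[0];                       // view of the first 4 x 4 matrix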
||||||
|
|
||||||
|
////////////////
|
||||||
|
// cuBLAS API //
|
||||||
|
////////////////
|
||||||
|
|
||||||
|
template <typename T, typename F1, typename F2, typename... Args> |
||||||
|
constexpr void invoke(F1 f1, F2 f2, Args&&... args) { |
||||||
|
if constexpr (std::is_same<T, float>::value) { |
||||||
|
CUBLAS_CHECK(f1(args...)); |
||||||
|
} else if constexpr (std::is_same<T, double>::value) { |
||||||
|
CUBLAS_CHECK(f2(args...)); |
||||||
|
} else { |
||||||
|
CT_ERROR(true, "BLAS functions are not callable with that type"); |
||||||
|
} |
||||||
|
} |
||||||
|
|
||||||
|
/**
|
||||||
|
* Computes the matrix-vector product: \f$ y = \alpha Ax + \beta y \f$. It will automatically |
||||||
|
* broadcast the operation if applicable. |
||||||
|
*/ |
||||||
|
template <typename T> |
||||||
|
StreamID GEMV(const T alpha, const Array<T>& A, const Array<T>& x, const T beta, const Array<T>& y, |
||||||
|
const StreamID& stream = DEF_CUBLAS_STREAM) { |
||||||
|
|
||||||
|
BatchInfo bi = Check<T>::isBroadcastable(A, x, y, "A", "x", "y"); |
||||||
|
CT_ERROR_IF(x.shape().cols(), !=, 1, "x must be a column vector"); |
||||||
|
CT_ERROR_IF(y.shape().cols(), !=, 1, "y must be a column vector");
||||||
|
|
||||||
|
uint32_t rows = A.shape().rows(); |
||||||
|
uint32_t cols = A.shape().cols(); |
||||||
|
T a = alpha, b = beta; |
||||||
|
#ifdef CUDA |
||||||
|
CUBLAS_CHECK( |
||||||
|
cublasSetStream(Manager::get()->cublasHandle(), Manager::get()->stream(stream.id))); |
||||||
|
if (bi.size == 1) { |
||||||
|
invoke<T>(cublasSgemv, cublasDgemv, Manager::get()->cublasHandle(), CUBLAS_OP_N, rows, cols, |
||||||
|
&a, A.dataDevice(), rows, x.dataDevice(), 1, &b, y.dataDevice(), 1); |
||||||
|
|
||||||
|
} else { // Batched case (size > 1), so broadcast.
|
||||||
|
invoke<T>(cublasSgemvStridedBatched, cublasDgemvStridedBatched, |
||||||
|
Manager::get()->cublasHandle(), CUBLAS_OP_N, rows, cols, &a, A.dataDevice(), rows, |
||||||
|
bi.strideA, x.dataDevice(), 1, bi.strideB, &b, y.dataDevice(), 1, bi.strideC, |
||||||
|
bi.size); |
||||||
|
} |
||||||
|
|
||||||
|
#else |
||||||
|
if (bi.size == 1) { |
||||||
|
y.eigenMap() = a * (A.eigenMap() * x.eigenMap()) + b * y.eigenMap(); |
||||||
|
} else { // Batched case (size > 1), so broadcast.
|
||||||
|
#pragma omp parallel for |
||||||
|
for (uint32_t i = 0; i < bi.size; ++i) { |
||||||
|
auto Ai = Array<T>(A, {rows, cols}, i * bi.strideA).eigenMap(); |
||||||
|
auto xi = Array<T>(x, {cols, 1}, i * bi.strideB).eigenMap(); |
||||||
|
auto yi = Array<T>(y, {rows, 1}, i * bi.strideC).eigenMap(); |
||||||
|
yi = a * (Ai * xi) + b * yi; |
||||||
|
} |
||||||
|
} |
||||||
|
#endif |
||||||
|
return StreamID{stream}; |
||||||
|
} |
||||||
|
|
||||||
|
/**
|
||||||
|
* Computes the matrix-matrix product: \f$ C = \alpha AB + \beta C \f$. It will automatically |
||||||
|
* broadcast the operation if applicable. |
||||||
|
*/ |
||||||
|
template <typename T> |
||||||
|
StreamID GEMM(const T alpha, const Array<T>& A, const Array<T>& B, const T beta, const Array<T>& C, |
||||||
|
const StreamID& stream = DEF_CUBLAS_STREAM) { |
||||||
|
|
||||||
|
BatchInfo bi = Check<T>::isBroadcastable(A, B, C, "A", "B", "C"); |
||||||
|
// A is m x k, B is k x n.
|
||||||
|
uint32_t m = A.shape().rows(); |
||||||
|
uint32_t k = A.shape().cols(); |
||||||
|
uint32_t n = B.shape().cols(); |
||||||
|
|
||||||
|
T a = alpha, b = beta; |
||||||
|
#ifdef CUDA |
||||||
|
CUBLAS_CHECK( |
||||||
|
cublasSetStream(Manager::get()->cublasHandle(), Manager::get()->stream(stream.id))); |
||||||
|
if (bi.size == 1) { |
||||||
|
invoke<T>(cublasSgemm, cublasDgemm, Manager::get()->cublasHandle(), CUBLAS_OP_N, |
||||||
|
CUBLAS_OP_N, m, n, k, &a, A.dataDevice(), m, B.dataDevice(), k, &b, |
||||||
|
C.dataDevice(), m); |
||||||
|
|
||||||
|
} else { // Batched case (size > 1), so broadcast.
|
||||||
|
invoke<T>(cublasSgemmStridedBatched, cublasDgemmStridedBatched, |
||||||
|
Manager::get()->cublasHandle(), CUBLAS_OP_N, CUBLAS_OP_N, m, n, k, &a, |
||||||
|
A.dataDevice(), m, bi.strideA, B.dataDevice(), k, bi.strideB, &b, C.dataDevice(), |
||||||
|
m, bi.strideC, bi.size); |
||||||
|
} |
||||||
|
|
||||||
|
#else |
||||||
|
if (bi.size == 1) { |
||||||
|
C.eigenMap() = a * (A.eigenMap() * B.eigenMap()) + b * C.eigenMap(); |
||||||
|
} else { // Batched case (size > 1), so broadcast.
|
||||||
|
#pragma omp parallel for |
||||||
|
for (uint32_t i = 0; i < bi.size; ++i) { |
||||||
|
auto Ai = Array<T>(A, {m, k}, i * bi.strideA).eigenMap(); |
||||||
|
auto Bi = Array<T>(B, {k, n}, i * bi.strideB).eigenMap(); |
||||||
|
auto Ci = Array<T>(C, {m, n}, i * bi.strideC).eigenMap(); |
||||||
|
Ci = a * (Ai * Bi) + b * Ci; |
||||||
|
} |
||||||
|
} |
||||||
|
#endif |
||||||
|
return StreamID{stream}; |
||||||
|
} |
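// Illustrative usage sketch (not part of the original header): GEMM computes
// C = alpha * A * B + beta * C and dispatches to cuBLAS under CUDA or to Eigen/OpenMP on the
// CPU; with CUDA the device copies would need to be updated before the call.
//
//   CudaTools::Array<float> A({2, 3}), B({3, 4}), C({2, 4});
//   A.setRandom(-1.0f, 1.0f); B.setRandom(-1.0f, 1.0f); C.setConstant(0.0f);
//   CudaTools::BLAS::GEMM(1.0f, A, B, 0.0f, C).wait();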
||||||
|
|
||||||
|
/**
|
||||||
|
* Computes the diagonal matrix multiplication: \f$ C = A\mathrm{diag}(X) \f$, or \f$ C = |
||||||
|
* \mathrm{diag}(X)A \f$ if left = true. |
||||||
|
*/ |
||||||
|
template <typename T> |
||||||
|
StreamID DGMM(const Array<T>& A, const Array<T>& X, const Array<T>& C, const bool left = false, |
||||||
|
const StreamID& stream = DEF_CUBLAS_STREAM) { |
||||||
|
CT_ERROR_IF(X.shape().cols(), !=, 1, "'x' must be a column vector."); |
||||||
|
if (left) { |
||||||
|
CT_ERROR_IF(A.shape().rows(), !=, X.shape().rows(), |
||||||
|
"Rows of 'A' and length of 'x' need to match."); |
||||||
|
} else { |
||||||
|
CT_ERROR_IF(A.shape().cols(), !=, X.shape().rows(), |
||||||
|
"Columns of 'A' and length of 'x' need to match."); |
||||||
|
} |
||||||
|
CT_ERROR_IF(A.shape().rows(), !=, C.shape().rows(), |
||||||
|
"Rows of 'A' and rows() of 'C' need to match."); |
||||||
|
CT_ERROR_IF(A.shape().cols(), !=, C.shape().cols(), |
||||||
|
"Rows of 'A' and columns of 'C' need to match."); |
||||||
|
|
||||||
|
#ifdef CUDA |
||||||
|
uint32_t m = C.shape().rows(); |
||||||
|
uint32_t n = C.shape().cols(); |
||||||
|
auto mode = (left) ? CUBLAS_SIDE_LEFT : CUBLAS_SIDE_RIGHT; |
||||||
|
CUBLAS_CHECK( |
||||||
|
cublasSetStream(Manager::get()->cublasHandle(), Manager::get()->stream(stream.id))); |
||||||
|
invoke<T>(cublasSdgmm, cublasDdgmm, Manager::get()->cublasHandle(), m, n, A.dataDevice(), |
||||||
|
A.shape().rows(), X.dataDevice(), 1, C.dataDevice(), m); |
||||||
|
#else |
||||||
|
if (left) { |
||||||
|
C.eigenMap() = X.eigenMap().asDiagonal() * A.eigenMap(); |
||||||
|
} else { |
||||||
|
C.eigenMap() = A.eigenMap() * X.eigenMap().asDiagonal(); |
||||||
|
} |
||||||
|
#endif |
||||||
|
return StreamID{stream}; |
||||||
|
} |
||||||
|
|
||||||
|
//////////////////////////////
|
||||||
|
// PLUArray Related Objects //
|
||||||
|
//////////////////////////////
|
||||||
|
|
||||||
|
///////////////////////////
|
||||||
|
// PartialPivLU Wrapper //
|
||||||
|
///////////////////////////
|
||||||
|
|
||||||
|
// This class is just a workaround to use Eigen's internals directly.
|
||||||
|
template <typename T> class PartialPivLU; |
||||||
|
namespace internal { |
||||||
|
template <typename T> static Array<T> empty({1, 1}); |
||||||
|
template <typename T> static EigenMapMat<T> empty_map = empty<T>.eigenMap(); |
||||||
|
}; // namespace internal
|
||||||
|
|
||||||
|
template <typename T, ENABLE_IF(IS_FLOAT(T)) = true> class PLUArray; |
||||||
|
// This is a wrapper class for Eigen's class so we have more controlled access to
|
||||||
|
// the underlying data.
|
||||||
|
template <typename T> class PartialPivLU : public Eigen::PartialPivLU<Eigen::Ref<EigenMat<T>>> { |
||||||
|
private: |
||||||
|
using Base = Eigen::PartialPivLU<Eigen::Ref<EigenMat<T>>>; |
||||||
|
template <typename U, ENABLE_IF(IS_FLOAT(U))> friend class PLUArray; |
||||||
|
|
||||||
|
EigenMapMat<T> mMapLU; |
||||||
|
EigenMapMat<int32_t> mMapPivots; |
||||||
|
|
||||||
|
public: |
||||||
|
PartialPivLU() |
||||||
|
: Base(internal::empty_map<T>), mMapLU(internal::empty_map<T>), |
||||||
|
mMapPivots(internal::empty_map<int32_t>){}; |
||||||
|
|
||||||
|
void make(const Array<T>& lu, const Array<int32_t>& pivots) { |
||||||
|
|
||||||
|
new (&mMapLU) EigenMapMat<T>(lu.eigenMap()); |
||||||
|
new (&mMapPivots) EigenMapMat<int32_t>(pivots.atLeast2D().eigenMap()); |
||||||
|
|
||||||
|
new (&this->m_lu) decltype(Base::m_lu)(mMapLU.derived()); |
||||||
|
new (&this->m_p) decltype(Base::m_p)(mMapPivots.derived()); |
||||||
|
|
||||||
|
// new (&this->m_rowsTranspositions) decltype(Base::m_rowsTranspositions)(
|
||||||
|
// mMapPivots.derived());
|
||||||
|
|
||||||
|
this->m_l1_norm = 0; |
||||||
|
this->m_det_p = 0; |
||||||
|
this->m_isInitialized = true; |
||||||
|
}; |
||||||
|
}; |
||||||
|
|
||||||
|
namespace internal { |
||||||
|
// We only create one and copy-construct to avoid the re-initialization.
|
||||||
|
template <typename T> static PartialPivLU<T> BlankPPLU = PartialPivLU<T>(); |
||||||
|
}; // namespace internal
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Class for storing the PLU decomposition of an Array. This is restricted to floating point types.
||||||
|
*/ |
||||||
|
template <typename T, ENABLE_IF(IS_FLOAT(T))> class PLUArray { |
||||||
|
private: |
||||||
|
Array<T> mLU; |
||||||
|
Array<int32_t> mPivots; |
||||||
|
PartialPivLU<T> mPPLU = internal::BlankPPLU<T>; |
||||||
|
|
||||||
|
public: |
||||||
|
PLUArray() = delete; |
||||||
|
|
||||||
|
/**
|
||||||
|
* Constructor for a PLUArray given the matrix dimension. |
||||||
|
*/ |
||||||
|
PLUArray(const uint32_t n) : mLU({n, n}), mPivots({n}) { mPPLU.make(mLU, mPivots); }; |
||||||
|
|
||||||
|
/**
|
||||||
|
* Constructor for a PLUArray given an existing array. |
||||||
|
*/ |
||||||
|
PLUArray(const Array<T>& arr) |
||||||
|
: mLU((arr.isView()) ? arr.view() : arr), mPivots({arr.shape().rows()}) { |
||||||
|
CT_ERROR_IF(mLU.shape().axes(), !=, 2, "Array must be a 2D matrix"); |
||||||
|
CT_ERROR_IF(mLU.shape().rows(), !=, mLU.shape().cols(), "Matrix must be square"); |
||||||
|
mPPLU.make(mLU, mPivots); |
||||||
|
}; |
||||||
|
|
||||||
|
/**
|
||||||
|
* Constructor for a PLUArray given an existing location in memory for both the matrix and |
||||||
|
* the pivots. |
||||||
|
*/ |
||||||
|
PLUArray(const Array<T>& arr, const Array<int32_t> pivots) |
||||||
|
: mLU(arr.view()), mPivots(pivots.view()) { |
||||||
|
CT_ERROR_IF(mLU.shape().axes(), !=, 2, "Array must be a 2D matrix"); |
||||||
|
CT_ERROR_IF(mLU.shape().rows(), !=, mLU.shape().cols(), "Matrix must be square"); |
||||||
|
mPPLU.make(mLU, mPivots); |
||||||
|
}; |
||||||
|
|
||||||
|
uint32_t rank() { return mLU.shape().rows(); }; /**< Gets the dimension of the square LU matrix. */
||||||
|
Array<T> LU() const { return mLU.view(); }; /**< Gets the LU matrix. */ |
||||||
|
Array<int32_t> pivots() const { return mPivots.view(); }; /**< Gets the pivots. */
||||||
|
|
||||||
|
/**
|
||||||
|
* Computes the in-place LU factorization for this array on the CPU.
||||||
|
*/ |
||||||
|
void computeLU() { |
||||||
|
mPPLU.compute(); |
||||||
|
mPPLU.mMapPivots = mPPLU.permutationP().indices(); |
||||||
|
}; |
||||||
|
|
||||||
|
/**
|
||||||
|
* Solves the system \f$ LUx = b \f$ and returns \f$x\f$. |
||||||
|
*/ |
||||||
|
Array<T> solve(const Array<T>& b) { |
||||||
|
Array<T> x(b.shape()); |
||||||
|
x.eigenMap() = mPPLU.solve(b.eigenMap()); |
||||||
|
return x; |
||||||
|
}; |
||||||
|
}; |
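// Illustrative usage sketch (not part of the original header): a PLUArray factorizes the
// wrapped square matrix on the CPU and then solves linear systems against it.
//
//   CudaTools::Array<double> A({3, 3}), b({3, 1});
//   A.setRandom(-1.0, 1.0); b.setConstant(1.0);
//   CudaTools::BLAS::PLUArray<double> plu(A);
//   plu.computeLU();
//   auto x = plu.solve(b); // A * x ~= b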
||||||
|
|
||||||
|
/**
|
||||||
|
* This is a batch version of PLUArray, to enable usage of the cuBLAS API. This is restricted to |
||||||
|
* floating point types. |
||||||
|
*/ |
||||||
|
template <typename T, std::enable_if_t<std::is_floating_point<T>::value, bool> = true> |
||||||
|
class PLUBatch : public Batch<T> { |
||||||
|
private: |
||||||
|
Array<int32_t> mPivotsBatch; |
||||||
|
Array<int32_t> mInfoLU; |
||||||
|
int32_t mInfoSolve; |
||||||
|
|
||||||
|
bool mInitialized = false; |
||||||
|
|
||||||
|
public: |
||||||
|
/**
|
||||||
|
* Constructor of a PLUBatch from a given batch size. |
||||||
|
*/ |
||||||
|
PLUBatch(const uint32_t size) : Batch<T>(size), mInfoLU({size}){}; |
||||||
|
|
||||||
|
/**
|
||||||
|
* Constructor of a PLUBatch from a multi-dimensional array, batched across upper dimensions. |
||||||
|
*/ |
||||||
|
PLUBatch(const Array<T>& arr) : Batch<T>(arr) { |
||||||
|
Check<T>::isSquare(arr, "LU Array"); |
||||||
|
|
||||||
|
mPivotsBatch = Array<int32_t>({this->mBatchSize * this->mShape.rows()}); |
||||||
|
mInfoLU = Array<int32_t>({this->mBatchSize}); |
||||||
|
}; |
||||||
|
|
||||||
|
/**
|
||||||
|
* Indexing operator which returns the PLUArray in the PLUBatch at the given index. |
||||||
|
*/ |
||||||
|
PLUArray<T> operator[](const uint32_t index) const { |
||||||
|
CT_ERROR_IF(index, >=, this->mBatchSize, "Index exceeds batch size"); |
||||||
|
Array<T> lu(this->mBatch[index], {this->mShape.rows(), this->mShape.cols()}); |
||||||
|
Array<int32_t> pivots(mPivotsBatch.data() + index * this->mShape.rows(), |
||||||
|
{this->mShape.rows()}); |
||||||
|
return PLUArray<T>(lu, pivots); |
||||||
|
}; |
||||||
|
|
||||||
|
/**
|
||||||
|
* Computes the in-place PLU decomposition of the batch of arrays.
||||||
|
*/ |
||||||
|
StreamID computeLU(const StreamID& stream = DEF_CUBLAS_STREAM) { |
||||||
|
#ifdef CUDA |
||||||
|
uint32_t n = this->mShape.rows(); |
||||||
|
CUBLAS_CHECK( |
||||||
|
cublasSetStream(Manager::get()->cublasHandle(), Manager::get()->stream(stream.id))); |
||||||
|
invoke<T>(cublasSgetrfBatched, cublasDgetrfBatched, Manager::get()->cublasHandle(), n, |
||||||
|
this->mBatch.dataDevice(), n, mPivotsBatch.dataDevice(), mInfoLU.dataDevice(), |
||||||
|
this->mBatchSize); |
||||||
|
|
||||||
|
#else |
||||||
|
#pragma omp parallel for |
||||||
|
for (uint32_t i = 0; i < this->mBatchSize; ++i) { |
||||||
|
(*this)[i].computeLU(); |
||||||
|
} |
||||||
|
#endif |
||||||
|
mInitialized = true; |
||||||
|
return stream; |
||||||
|
}; |
||||||
|
|
||||||
|
/**
|
||||||
|
* Solves the batched system \f$LUx = b\f$ inplace. The solution \f$x\f$ is written back into |
||||||
|
* \f$b\f$. |
||||||
|
*/ |
||||||
|
StreamID solve(const Batch<T>& b, const StreamID& stream = DEF_CUBLAS_STREAM) { |
||||||
|
CT_ERROR(not mInitialized, |
||||||
|
"Cannot solve system if PLUBatch has not yet computed its LU decomposition"); |
||||||
|
CT_ERROR_IF(b.size(), !=, this->mBatchSize, |
||||||
|
"Upper dimensions of b do not match batch size"); |
||||||
|
CT_ERROR_IF(b.shape().rows(), !=, this->mShape.rows(), |
||||||
|
"The length of each column of b must match the matrix rank"); |
||||||
|
|
||||||
|
#ifdef CUDA |
||||||
|
uint32_t n = b.shape().rows(); |
||||||
|
uint32_t nrhs = b.shape().cols(); |
||||||
|
CUBLAS_CHECK( |
||||||
|
cublasSetStream(Manager::get()->cublasHandle(), Manager::get()->stream(stream.id))); |
||||||
|
invoke<T>(cublasSgetrsBatched, cublasDgetrsBatched, Manager::get()->cublasHandle(), |
||||||
|
CUBLAS_OP_N, n, nrhs, this->mBatch.dataDevice(), n, mPivotsBatch.dataDevice(), |
||||||
|
b.batch().dataDevice(), n, &mInfoSolve, this->mBatchSize); |
||||||
|
|
||||||
|
#else |
||||||
|
#pragma omp parallel for |
||||||
|
for (uint32_t i = 0; i < this->mBatchSize; ++i) { |
||||||
|
b[i] = (*this)[i].solve(b[i]); |
||||||
|
} |
||||||
|
#endif |
||||||
|
return stream; |
||||||
|
}; |
||||||
|
|
||||||
|
/**
|
||||||
|
* Gets the pivots data from the device to the host. Does nothing for CPU. |
||||||
|
*/ |
||||||
|
StreamID getPivots(const StreamID& stream = DEF_MEM_STREAM) const { |
||||||
|
mPivotsBatch.updateHost(stream); |
||||||
|
return stream; |
||||||
|
}; |
||||||
|
|
||||||
|
/**
|
||||||
|
* Gets the info array for the LU decomposition from the device to the host. Does not
||||||
|
* return useful information for CPU. |
||||||
|
*/ |
||||||
|
Array<int32_t> getLUInfo() const { |
||||||
|
mInfoLU.updateHost().wait(); |
||||||
|
return mInfoLU; |
||||||
|
}; |
||||||
|
|
||||||
|
/**
|
||||||
|
* Checks validity of the solve operation. Does not return useful information for CPU. |
||||||
|
*/ |
||||||
|
int32_t validSolve() const { return mInfoSolve == 0; } |
||||||
|
}; |
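// Illustrative usage sketch (not part of the original header): PLUBatch runs the batched
// getrf/getrs path under CUDA and falls back to per-matrix Eigen solves on the CPU; with CUDA
// the device copies would need to be updated before factorizing.
//
//   CudaTools::Array<double> As({16, 4, 4}), bs({16, 4, 1});
//   CudaTools::BLAS::PLUBatch<double> plu(As); // batched over the leading axis
//   plu.computeLU().wait();
//   CudaTools::BLAS::Batch<double> rhs(bs);
//   plu.solve(rhs).wait();                     // solutions written back into bs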
||||||
|
|
||||||
|
// /**
|
||||||
|
// * Gets the inverse of each A[i], using an already PLU factorized A[i].
|
||||||
|
// * Only available if compiling with CUDA.
|
||||||
|
// */
|
||||||
|
// template <typename T>
|
||||||
|
// void inverseBatch(const Array<T*>& batchA, const Array<T*>& batchC, const Array<int>&
|
||||||
|
// pivots,
|
||||||
|
// const Array<int>& info, const Shape shapeA, const Shape shapeC,
|
||||||
|
// const uint stream = 0) {
|
||||||
|
// #ifdef CUDA
|
||||||
|
// CT_ERROR_IF(shapeA.rows(), !=, shapeA.cols(),
|
||||||
|
// "'A' needs to be square, rows() and column need to match.");
|
||||||
|
// CT_ERROR_IF(shapeA.rows(), !=, shapeC.cols(), "'A' needs to be the same shape as
|
||||||
|
// 'C'."); CT_ERROR_IF(shapeA.rows(), !=, shapeC.rows(), "'A' needs to be the same shape
|
||||||
|
// as 'C'.");
|
||||||
|
|
||||||
|
// CT_ERROR_IF(shapeA.rows(), !=, pivots.shape().rows(),
|
||||||
|
// "Rows()/columns of 'A' and rows() of pivots need to match.");
|
||||||
|
// CT_ERROR_IF(batchA.shape().rows(), !=, pivots.shape().cols(),
|
||||||
|
// "Batch size and columns of pivots need to match.");
|
||||||
|
// CT_ERROR_IF(info.shape().cols(), !=, 1, "Info needs to be a column vector.")
|
||||||
|
// CT_ERROR_IF(batchA.shape().rows(), !=, info.shape().rows(),
|
||||||
|
// "Batch size and length of info need to match.");
|
||||||
|
// CT_ERROR_IF(batchA.shape().rows(), !=, batchC.shape().rows(),
|
||||||
|
// "Batches 'A[i]' and 'C[i]' need to match.");
|
||||||
|
|
||||||
|
// std::string s = "cublas" + std::to_string(stream);
|
||||||
|
// CUBLAS_CHECK(
|
||||||
|
// cublasSetStream(Manager::get()->cublasHandle(),
|
||||||
|
// Manager::get()->stream(s)));
|
||||||
|
// invoke<T>(cublasSgetriBatched, cublasDgetriBatched,
|
||||||
|
// Manager::get()->cublasHandle(),
|
||||||
|
// shapeA.rows(), batchA.dataDevice(), shapeA.rows(), pivots.dataDevice(),
|
||||||
|
// batchC.dataDevice(), shapeC.rows(), info.dataDevice(),
|
||||||
|
// batchA.shape().rows());
|
||||||
|
// #else
|
||||||
|
// CT_ERROR_IF(true, ==, true, "inverseBatch is not callable without CUDA.");
|
||||||
|
// #endif
|
||||||
|
// }
|
||||||
|
|
||||||
|
}; // namespace BLAS
|
||||||
|
}; // namespace CudaTools
|
||||||
|
|
||||||
|
#endif // BLAS_H

@@ -0,0 +1,544 @@
#ifndef CUDATOOLS_H |
||||||
|
#define CUDATOOLS_H |
||||||
|
|
||||||
|
#include "Macros.h" |
||||||
|
#include <iostream> |
||||||
|
#include <string> |
||||||
|
#include <unordered_map> |
||||||
|
#include <vector> |
||||||
|
|
||||||
|
namespace CudaTools { |
||||||
|
|
||||||
|
/**
|
||||||
|
* Simple wrapper for the name of a stream. Its purpose is to allow
* 'streams' to be passed around in host code, and allows for simple syntax
||||||
|
* for waiting. |
||||||
|
*/ |
||||||
|
struct StreamID { |
||||||
|
public: |
||||||
|
std::string id; |
||||||
|
StreamID() : id(""){}; |
||||||
|
/**
|
||||||
|
* The constructor for a StreamID. |
||||||
|
*/ |
||||||
|
StreamID(const std::string& id_) : id(id_){}; |
||||||
|
StreamID(const char* id_) : id(id_){}; |
||||||
|
|
||||||
|
/**
|
||||||
|
* Waits for the stream with this stream ID. |
||||||
|
*/ |
||||||
|
void wait() const; |
||||||
|
}; |
||||||
|
|
||||||
|
static const StreamID DEF_MEM_STREAM = StreamID{"defaultMemory"}; |
||||||
|
static const StreamID DEF_CUBLAS_STREAM = StreamID{"defaultCublas"}; |
||||||
|
static const StreamID DEF_KERNEL_STREAM = StreamID{"defaultKernel"}; |
||||||
|
|
||||||
|
/**
|
||||||
|
* Allocates memory on the device. |
||||||
|
*/ |
||||||
|
void* malloc(const size_t size); |
||||||
|
|
||||||
|
/**
|
||||||
|
* Pins memory on the host. |
||||||
|
*/ |
||||||
|
void pin(void* const pHost, const size_t size); |
||||||
|
|
||||||
|
/**
|
||||||
|
 * Pushes memory from the host to the device. |
||||||
|
*/ |
||||||
|
StreamID push(void* const pHost, void* const pDevice, const size_t size, |
||||||
|
const StreamID& stream = DEF_MEM_STREAM); |
||||||
|
/**
|
||||||
|
* Pulls memory from the device back to the host. |
||||||
|
*/ |
||||||
|
StreamID pull(void* const pHost, void* const pDevice, const size_t size, |
||||||
|
const StreamID& stream = DEF_MEM_STREAM); |
||||||
|
/**
|
||||||
|
* Copies memory on the device to another location on the device. |
||||||
|
*/ |
||||||
|
StreamID deviceCopy(void* const pSrc, void* const pDest, const size_t size, |
||||||
|
const StreamID& stream = DEF_MEM_STREAM); |
||||||
|
|
||||||
|
/**
|
||||||
|
* Frees memory on the device. |
||||||
|
*/ |
||||||
|
void free(void* const pDevice); |
||||||
|
|
||||||
|
#ifdef CUDACC |
||||||
|
cudaDeviceProp getDeviceProp(); |
||||||
|
static cudaDeviceProp DeviceProperties = getDeviceProp(); |
||||||
|
const char* cublasGetErrorString(cublasStatus_t status); |
||||||
|
#endif |
||||||
|
|
||||||
|
/**
|
||||||
|
* A class that manages various CUDA Runtime components, such as |
||||||
|
* streams, events, and handles. |
||||||
|
*/ |
||||||
|
class Manager { |
||||||
|
private: |
||||||
|
static Manager mManagerInstance; |
||||||
|
Manager(const std::vector<std::string>& names); |
||||||
|
~Manager(); |
||||||
|
#ifdef CUDACC |
||||||
|
std::unordered_map<std::string, cudaStream_t> mStreams; |
||||||
|
cublasHandle_t mCublas; |
||||||
|
#endif |
||||||
|
public: |
||||||
|
/**
|
||||||
|
* Used to get the global CudaTools::Manager instance. |
||||||
|
*/ |
||||||
|
static Manager* get() { return &mManagerInstance; }; |
||||||
|
|
||||||
|
void waitFor(const StreamID& stream) const; /**< Waits for the stream provided. */ |
||||||
|
void sync() const; /**< Waits until all device code has finished. */ |
||||||
|
void addStream(const std::string& name); /**< Creates a stream with the given name. */ |
||||||
|
#ifdef CUDACC |
||||||
|
cudaStream_t stream(const StreamID& stream) const; |
||||||
|
cublasHandle_t cublasHandle() const; |
||||||
|
#endif |
||||||
|
}; |
||||||
|
|
||||||
|
namespace Kernel { |
||||||
|
|
||||||
|
/**
|
||||||
|
* A struct that contains the kernel launch parameters. |
||||||
|
*/ |
||||||
|
struct Settings { |
||||||
|
public: |
||||||
|
#ifdef CUDACC |
||||||
|
dim3 blockGrid; |
||||||
|
dim3 threadBlock; |
||||||
|
size_t sharedMemoryBytes = 0; |
||||||
|
#else |
||||||
|
size_t threads; |
||||||
|
#endif |
||||||
|
StreamID stream; |
||||||
|
|
||||||
|
Settings() = default; |
||||||
|
|
||||||
|
void setGridDim(const size_t x); /**< Sets the Grid dimensions. */ |
||||||
|
void setGridDim(const size_t x, const size_t y); /**< Sets the Grid dimensions. */ |
||||||
|
void setGridDim(const size_t x, const size_t y, |
||||||
|
const size_t z); /**< Sets the Grid dimensions. */ |
||||||
|
void setBlockDim(const size_t x); /**< Sets the Thread Block dimensions. */ |
||||||
|
void setBlockDim(const size_t x, const size_t y); /**< Sets the Thread Block dimensions. */ |
||||||
|
void setBlockDim(const size_t x, const size_t y, |
||||||
|
const size_t z); /**< Sets the Thread Block dimensions. */ |
||||||
|
|
||||||
|
void setSharedMemSize(const size_t bytes); /**< Sets the static shared memory size. */ |
||||||
|
void setStream(const StreamID& stream); /**< Sets the stream. */ |
||||||
|
}; |
||||||
|
|
||||||
|
/**
|
||||||
|
 * Returns kernel launch parameters based on the number of threads, and optionally |
||||||
|
 * a stream. Should only be used for 'embarrassingly parallel' situations, or where |
||||||
|
 * each thread corresponds to some sort of index. |
||||||
|
*/ |
||||||
|
Settings basic(const size_t threads, const StreamID& stream = DEF_KERNEL_STREAM); |
||||||
|
|
||||||
|
}; // namespace Kernel
|
||||||
|
|
||||||
|
template <typename T> class Array; |
||||||
|
|
||||||
|
/**
|
||||||
|
* A class that holds information about an Array. |
||||||
|
*/ |
||||||
|
class Shape { |
||||||
|
private: |
||||||
|
template <typename T> friend class Array; |
||||||
|
uint32_t mAxes; |
||||||
|
uint32_t mItems; |
||||||
|
uint32_t mAxisDim[CUDATOOLS_ARRAY_MAX_AXES] = {0}; |
||||||
|
uint32_t mStride[CUDATOOLS_ARRAY_MAX_AXES] = {0}; |
||||||
|
|
||||||
|
public: |
||||||
|
HD Shape() : mAxes(0), mItems(1){}; |
||||||
|
/**
|
||||||
|
* The constructor for a Shape. |
||||||
|
* \param dims an initializer list of the dimensions. |
||||||
|
*/ |
||||||
|
HD Shape(const std::initializer_list<uint32_t> dims); |
||||||
|
|
||||||
|
HD uint32_t axes() const; /**< Gets the number of axes. */ |
||||||
|
HD uint32_t items() const; /**< Gets the total number of items. */ |
||||||
|
|
||||||
|
HD uint32_t length() const; /**< For 1D shapes, gets the length. In general, gets the dimension
|
||||||
|
of the last axis. */ |
||||||
|
HD uint32_t rows() const; /**< For 2D shapes, gets the number of rows. In general, gets the
|
||||||
|
dimension of the second to last axis. */ |
||||||
|
HD uint32_t cols() const; /**< For 2D shapes, gets the number of columns. In general, gets the
|
||||||
|
dimension of the last axis. */ |
||||||
|
|
||||||
|
HD uint32_t |
||||||
|
dim(const uint32_t axis) const; /**< Gets the dimension size of the specified axis. */ |
||||||
|
HD uint32_t stride(const uint32_t axis) const; /**< Gets the stride of the specified axis. */ |
||||||
|
|
||||||
|
/**
|
||||||
|
* Gets the shape at a specific axis of this shape. |
||||||
|
 * \param axis the axis where the new shape starts. |
||||||
|
*/ |
||||||
|
HD Shape subshape(const uint32_t axis) const; |
||||||
|
|
||||||
|
HD bool operator==(const Shape& s) const; /**< Equals operator. */ |
||||||
|
HD bool operator!=(const Shape& s) const; /**< Not equals operator. */ |
||||||
|
}; |
||||||
|
|
||||||
|
std::ostream& operator<<(std::ostream& out, const Shape& s); |
||||||
|
|
||||||
|
}; // namespace CudaTools
|
||||||
|
|
||||||
|
#ifdef CUDATOOLS_IMPLEMENTATION |
||||||
|
|
||||||
|
namespace CudaTools { |
||||||
|
|
||||||
|
template <typename T, typename... Args> |
||||||
|
StreamID runKernel(T func, const Kernel::Settings& sett, Args... args) { |
||||||
|
#ifdef CUDA |
||||||
|
func<<<sett.blockGrid, sett.threadBlock, sett.sharedMemoryBytes, |
||||||
|
Manager::get()->stream(sett.stream.id)>>>(args...); |
||||||
|
#else |
||||||
|
func(args...); |
||||||
|
#endif |
||||||
|
return sett.stream; |
||||||
|
} |
||||||
|
|
||||||
|
////////////////////
|
||||||
|
// Memory Methods //
|
||||||
|
////////////////////
|
||||||
|
|
||||||
|
void StreamID::wait() const { Manager::get()->waitFor(id); } |
||||||
|
|
||||||
|
void* malloc(const size_t size) { |
||||||
|
#ifdef CUDACC |
||||||
|
void* pDevice; |
||||||
|
CUDA_CHECK(cudaMalloc(&pDevice, size)); |
||||||
|
return pDevice; |
||||||
|
#else |
||||||
|
return nullptr; |
||||||
|
#endif |
||||||
|
} |
||||||
|
|
||||||
|
void free(void* const pDevice) { |
||||||
|
#ifdef CUDACC |
||||||
|
if (pDevice != nullptr) CUDA_CHECK(cudaFree(pDevice)); |
||||||
|
#endif |
||||||
|
} |
||||||
|
|
||||||
|
StreamID push(void* const pHost, void* const pDevice, const size_t size, const StreamID& stream) { |
||||||
|
#ifdef CUDACC |
||||||
|
CUDA_CHECK(cudaMemcpyAsync(pDevice, pHost, size, cudaMemcpyHostToDevice, |
||||||
|
Manager::get()->stream(stream.id))); |
||||||
|
#endif |
||||||
|
return stream; |
||||||
|
} |
||||||
|
|
||||||
|
StreamID pull(void* const pHost, void* const pDevice, const size_t size, const StreamID& stream) { |
||||||
|
#ifdef CUDACC |
||||||
|
CUDA_CHECK(cudaMemcpyAsync(pHost, pDevice, size, cudaMemcpyDeviceToHost, |
||||||
|
Manager::get()->stream(stream.id))); |
||||||
|
#endif |
||||||
|
return stream; |
||||||
|
} |
||||||
|
|
||||||
|
StreamID deviceCopy(void* const pSrc, void* const pDest, const size_t size, |
||||||
|
const StreamID& stream) { |
||||||
|
#ifdef CUDACC |
||||||
|
CUDA_CHECK(cudaMemcpyAsync(pDest, pSrc, size, cudaMemcpyDeviceToDevice, |
||||||
|
Manager::get()->stream(stream.id))); |
||||||
|
#endif |
||||||
|
return stream; |
||||||
|
} |
||||||
|
|
||||||
|
void pin(void* const pHost, const size_t size) { |
||||||
|
#ifdef CUDACC |
||||||
|
CUDA_CHECK(cudaHostRegister(pHost, size, cudaHostRegisterDefault)); |
||||||
|
#endif |
||||||
|
} |
||||||
|
|
||||||
|
#ifdef CUDACC |
||||||
|
cudaDeviceProp getDeviceProp() { |
||||||
|
cudaSetDevice(0); |
||||||
|
cudaDeviceProp deviceProp; |
||||||
|
cudaGetDeviceProperties(&deviceProp, 0); |
||||||
|
return deviceProp; |
||||||
|
} |
||||||
|
#endif |
||||||
|
|
||||||
|
/////////////////////
|
||||||
|
// Manager Methods //
|
||||||
|
/////////////////////
|
||||||
|
|
||||||
|
Manager::Manager(const std::vector<std::string>& names) { |
||||||
|
#ifdef CUDACC |
||||||
|
for (auto name : names) { |
||||||
|
addStream(name); |
||||||
|
} |
||||||
|
CUBLAS_CHECK(cublasCreate(&mCublas)); |
||||||
|
#endif |
||||||
|
} |
||||||
|
|
||||||
|
Manager::~Manager() { |
||||||
|
#ifdef CUDACC |
||||||
|
for (auto& it : mStreams) { |
||||||
|
CUDA_CHECK(cudaStreamDestroy(it.second)); |
||||||
|
} |
||||||
|
CUBLAS_CHECK(cublasDestroy(mCublas)); |
||||||
|
#endif |
||||||
|
} |
||||||
|
|
||||||
|
void Manager::waitFor(const StreamID& stream) const { |
||||||
|
#ifdef CUDACC |
||||||
|
auto it = mStreams.find(stream.id); |
||||||
|
if (it != mStreams.end()) { |
||||||
|
CUDA_CHECK(cudaStreamSynchronize(it->second)); |
||||||
|
} else { |
||||||
|
CT_ERROR(true, ("Invalid stream " + stream.id).c_str()); |
||||||
|
} |
||||||
|
#endif |
||||||
|
} |
||||||
|
|
||||||
|
void Manager::sync() const { |
||||||
|
#ifdef CUDACC |
||||||
|
CUDA_CHECK(cudaDeviceSynchronize()); |
||||||
|
#endif |
||||||
|
} |
||||||
|
|
||||||
|
void Manager::addStream(const std::string& name) { |
||||||
|
#ifdef CUDACC |
||||||
|
cudaStream_t s; |
||||||
|
CUDA_CHECK(cudaStreamCreate(&s)); |
||||||
|
mStreams[name] = s; |
||||||
|
#endif |
||||||
|
} |
||||||
|
|
||||||
|
#ifdef CUDACC |
||||||
|
cudaStream_t Manager::stream(const StreamID& stream) const { |
||||||
|
auto it = mStreams.find(stream.id); |
||||||
|
if (it != mStreams.end()) { |
||||||
|
return it->second; |
||||||
|
} else { |
||||||
|
CT_ERROR(true, ("Invalid stream " + stream.id).c_str()); |
||||||
|
} |
||||||
|
} |
||||||
|
|
||||||
|
cublasHandle_t Manager::cublasHandle() const { return mCublas; }; |
||||||
|
|
||||||
|
Manager Manager::mManagerInstance = Manager({"defaultMemory", "defaultCublas", "defaultKernel"}); |
||||||
|
#else |
||||||
|
Manager Manager::mManagerInstance = Manager({""}); |
||||||
|
#endif |
||||||
|
|
||||||
|
////////////////////
|
||||||
|
// Kernel Methods //
|
||||||
|
////////////////////
|
||||||
|
|
||||||
|
namespace Kernel { |
||||||
|
|
||||||
|
void Settings::setGridDim(const size_t x) { |
||||||
|
#ifdef CUDACC |
||||||
|
CT_ERROR_IF(x, >, DeviceProperties.maxGridSize[0], "Total grid size too large."); |
||||||
|
blockGrid.x = x; |
||||||
|
blockGrid.y = 1; |
||||||
|
blockGrid.z = 1; |
||||||
|
#endif |
||||||
|
} |
||||||
|
|
||||||
|
void Settings::setGridDim(const size_t x, const size_t y) { |
||||||
|
#ifdef CUDACC |
||||||
|
CT_ERROR_IF(x * y, >, DeviceProperties.maxGridSize[0], "Total grid size too large."); |
||||||
|
CT_ERROR_IF(x, >, DeviceProperties.maxGridSize[0], "Grid dimension 'x' too large."); |
||||||
|
CT_ERROR_IF(y, >, DeviceProperties.maxGridSize[1], "Grid dimension 'y' too large."); |
||||||
|
blockGrid.x = x; |
||||||
|
blockGrid.y = y; |
||||||
|
blockGrid.z = 1; |
||||||
|
#endif |
||||||
|
} |
||||||
|
|
||||||
|
void Settings::setGridDim(const size_t x, const size_t y, const size_t z) { |
||||||
|
#ifdef CUDACC |
||||||
|
CT_ERROR_IF(x * y * z, >, DeviceProperties.maxGridSize[0], "Total grid size too large."); |
||||||
|
CT_ERROR_IF(x, >, DeviceProperties.maxGridSize[0], "Grid dimension 'x' too large."); |
||||||
|
CT_ERROR_IF(y, >, DeviceProperties.maxGridSize[1], "Grid dimension 'y' too large."); |
||||||
|
CT_ERROR_IF(z, >, DeviceProperties.maxGridSize[2], "Grid dimension 'z' too large."); |
||||||
|
blockGrid.x = x; |
||||||
|
blockGrid.y = y; |
||||||
|
blockGrid.z = z; |
||||||
|
#endif |
||||||
|
} |
||||||
|
|
||||||
|
void Settings::setBlockDim(const size_t x) { |
||||||
|
#ifdef CUDACC |
||||||
|
CT_ERROR_IF(x, >, DeviceProperties.maxThreadsDim[0], "Total block size too large."); |
||||||
|
threadBlock.x = x; |
||||||
|
threadBlock.y = 1; |
||||||
|
threadBlock.z = 1; |
||||||
|
#endif |
||||||
|
} |
||||||
|
|
||||||
|
void Settings::setBlockDim(const size_t x, const size_t y) { |
||||||
|
#ifdef CUDACC |
||||||
|
CT_ERROR_IF(x * y, >, DeviceProperties.maxThreadsDim[0], "Total block size too large."); |
||||||
|
CT_ERROR_IF(x, >, DeviceProperties.maxThreadsDim[0], "Block dimension 'x' too large."); |
||||||
|
CT_ERROR_IF(y, >, DeviceProperties.maxThreadsDim[1], "Block dimension 'y' too large."); |
||||||
|
threadBlock.x = x; |
||||||
|
threadBlock.y = y; |
||||||
|
threadBlock.z = 1; |
||||||
|
#endif |
||||||
|
} |
||||||
|
|
||||||
|
void Settings::setBlockDim(const size_t x, const size_t y, const size_t z) { |
||||||
|
#ifdef CUDACC |
||||||
|
CT_ERROR_IF(x * y * z, >, DeviceProperties.maxThreadsDim[0], "Total block size too large."); |
||||||
|
CT_ERROR_IF(x, >, DeviceProperties.maxThreadsDim[0], "Block dimension 'x' too large."); |
||||||
|
CT_ERROR_IF(y, >, DeviceProperties.maxThreadsDim[1], "Block dimension 'y' too large."); |
||||||
|
CT_ERROR_IF(z, >, DeviceProperties.maxThreadsDim[2], "Block dimension 'z' too large."); |
||||||
|
threadBlock.x = x; |
||||||
|
threadBlock.y = y; |
||||||
|
threadBlock.z = z; |
||||||
|
#endif |
||||||
|
} |
||||||
|
|
||||||
|
void Settings::setSharedMemSize(const size_t bytes) { |
||||||
|
#ifdef CUDACC |
||||||
|
sharedMemoryBytes = bytes; |
||||||
|
#endif |
||||||
|
} |
||||||
|
|
||||||
|
void Settings::setStream(const StreamID& stream_) { |
||||||
|
#ifdef CUDACC |
||||||
|
stream.id = stream_.id; |
||||||
|
#endif |
||||||
|
} |
||||||
|
|
||||||
|
Settings basic(const size_t threads, const StreamID& stream) { |
||||||
|
Settings sett; |
||||||
|
#ifdef CUDACC |
||||||
|
auto max_threads = DeviceProperties.maxThreadsPerBlock; |
||||||
|
size_t grid_blocks = (threads + max_threads - 1) / max_threads; // ceil(threads / max_threads)
|
||||||
|
size_t block_threads = (threads + grid_blocks - 1) / grid_blocks; // ceil(threads / grid_blocks)
|
||||||
|
sett.setGridDim(grid_blocks); |
||||||
|
sett.setBlockDim(block_threads); |
||||||
|
sett.setStream(stream); |
||||||
|
#else |
||||||
|
sett.threads = threads; |
||||||
|
#endif |
||||||
|
return sett; |
||||||
|
} |
||||||
|
} // namespace Kernel
|
||||||
|
|
||||||
|
/////////////////////
|
||||||
|
// Shape Functions //
|
||||||
|
/////////////////////
|
||||||
|
|
||||||
|
HD Shape::Shape(const std::initializer_list<uint32_t> dims) : mAxes(dims.size()), mItems(1) { |
||||||
|
CT_ERROR_IF(dims.size(), >, CUDATOOLS_ARRAY_MAX_AXES, "Number of axes exceeds max axes"); |
||||||
|
mAxes = dims.size(); |
||||||
|
if (mAxes == 0) return; |
||||||
|
|
||||||
|
auto it = dims.end() - 1; |
||||||
|
mItems = 1; |
||||||
|
for (uint32_t iAxis = mAxes - 1; iAxis < mAxes; --iAxis) { |
||||||
|
uint32_t dim = *it; |
||||||
|
CT_ERROR_IF(dim, ==, 0, "Axis dimension cannot be 0"); |
||||||
|
|
||||||
|
mAxisDim[iAxis] = dim; |
||||||
|
mStride[iAxis] = mItems; |
||||||
|
mItems *= dim; |
||||||
|
--it; |
||||||
|
} |
||||||
|
|
||||||
|
if (mAxes == 1) return; |
||||||
|
// Swap last two, for column major storage.
|
||||||
|
mStride[mAxes - 2] = 1; |
||||||
|
mStride[mAxes - 1] = mAxisDim[mAxes - 2]; |
||||||
|
} |
||||||
|
|
||||||
|
HD uint32_t Shape::axes() const { return mAxes; }; |
||||||
|
HD uint32_t Shape::items() const { return mItems; }; |
||||||
|
HD uint32_t Shape::length() const { return mAxisDim[mAxes - 1]; } |
||||||
|
|
||||||
|
HD uint32_t Shape::rows() const { return mAxisDim[mAxes - 2]; } |
||||||
|
|
||||||
|
HD uint32_t Shape::cols() const { return mAxisDim[mAxes - 1]; } |
||||||
|
|
||||||
|
HD uint32_t Shape::dim(const uint32_t axis) const { return mAxisDim[axis]; } |
||||||
|
HD uint32_t Shape::stride(const uint32_t axis) const { return mStride[axis]; } |
||||||
|
|
||||||
|
HD bool Shape::operator==(const Shape& s) const { |
||||||
|
if (mAxes != s.mAxes) { |
||||||
|
return false; |
||||||
|
} |
||||||
|
for (uint32_t iAxis = 0; iAxis < mAxes; ++iAxis) { |
||||||
|
if (mAxisDim[iAxis] != s.mAxisDim[iAxis]) { |
||||||
|
return false; |
||||||
|
} |
||||||
|
} |
||||||
|
return true; |
||||||
|
} |
||||||
|
|
||||||
|
HD bool Shape::operator!=(const Shape& s) const { return not(*this == s); } |
||||||
|
|
||||||
|
HD Shape Shape::subshape(const uint32_t axis) const { |
||||||
|
CT_ERROR_IF(axis, >, mAxes, "Axis number exceeds number of axes."); |
||||||
|
if (axis == mAxes) return Shape({1}); |
||||||
|
|
||||||
|
Shape new_shape({}); |
||||||
|
new_shape.mAxes = mAxes - axis; |
||||||
|
new_shape.mItems = mItems; |
||||||
|
|
||||||
|
for (uint32_t iAxis = 0; iAxis < axis; iAxis++) { |
||||||
|
new_shape.mItems /= mAxisDim[iAxis]; |
||||||
|
} |
||||||
|
for (uint32_t iAxis = axis; iAxis < mAxes; iAxis++) { |
||||||
|
new_shape.mAxisDim[iAxis - axis] = mAxisDim[iAxis]; |
||||||
|
new_shape.mStride[iAxis - axis] = mStride[iAxis]; |
||||||
|
} |
||||||
|
return new_shape; |
||||||
|
} |
||||||
|
|
||||||
|
std::ostream& operator<<(std::ostream& out, const Shape& s) { |
||||||
|
out << "("; |
||||||
|
if (s.axes() == 0) return out << ")"; |
||||||
|
for (uint32_t iAxis = 0; iAxis < s.axes() - 1; ++iAxis) { |
||||||
|
out << s.dim(iAxis) << ", "; |
||||||
|
} |
||||||
|
return out << s.dim(s.axes() - 1) << ")"; |
||||||
|
} |
||||||
|
|
||||||
|
#ifdef CUDACC |
||||||
|
const char* cublasGetErrorString(cublasStatus_t error) { |
||||||
|
switch (error) { |
||||||
|
case CUBLAS_STATUS_SUCCESS: |
||||||
|
return "CUBLAS_STATUS_SUCCESS"; |
||||||
|
|
||||||
|
case CUBLAS_STATUS_NOT_INITIALIZED: |
||||||
|
return "CUBLAS_STATUS_NOT_INITIALIZED"; |
||||||
|
|
||||||
|
case CUBLAS_STATUS_ALLOC_FAILED: |
||||||
|
return "CUBLAS_STATUS_ALLOC_FAILED"; |
||||||
|
|
||||||
|
case CUBLAS_STATUS_INVALID_VALUE: |
||||||
|
return "CUBLAS_STATUS_INVALID_VALUE"; |
||||||
|
|
||||||
|
case CUBLAS_STATUS_ARCH_MISMATCH: |
||||||
|
return "CUBLAS_STATUS_ARCH_MISMATCH"; |
||||||
|
|
||||||
|
case CUBLAS_STATUS_MAPPING_ERROR: |
||||||
|
return "CUBLAS_STATUS_MAPPING_ERROR"; |
||||||
|
|
||||||
|
case CUBLAS_STATUS_EXECUTION_FAILED: |
||||||
|
return "CUBLAS_STATUS_EXECUTION_FAILED"; |
||||||
|
|
||||||
|
case CUBLAS_STATUS_INTERNAL_ERROR: |
||||||
|
return "CUBLAS_STATUS_INTERNAL_ERROR"; |
||||||
|
} |
||||||
|
|
||||||
|
return "<unknown>"; |
||||||
|
} |
||||||
|
#endif |
||||||
|
|
||||||
|
}; // namespace CudaTools
|
||||||
|
#endif // CUDATOOLS_IMPLEMENTATION
|
||||||
|
|
||||||
|
#endif // CUDATOOLS_H
|
@ -0,0 +1,297 @@ |
|||||||
|
#ifndef MACROS_H |
||||||
|
#define MACROS_H |
||||||
|
|
||||||
|
#include <exception> |
||||||
|
#include <sstream> |
||||||
|
#include <stdarg.h> |
||||||
|
|
||||||
|
#if defined(CUDA) && defined(__CUDACC__) |
||||||
|
#define CUDACC |
||||||
|
#endif |
||||||
|
|
||||||
|
#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ > 0) |
||||||
|
#define DEVICE |
||||||
|
#endif |
||||||
|
|
||||||
|
#ifdef CUDATOOLS_DOXYGEN |
||||||
|
/**
|
||||||
|
* \def CUDACC |
||||||
|
* This macro is defined when this code is being compiled by nvcc and the CUDA compilation |
||||||
|
* flag is set. This should be used to enclose code where CUDA specific libraries and syntax are |
||||||
|
* being used. |
||||||
|
*/ |
||||||
|
#define CUDACC |
||||||
|
|
||||||
|
/**
|
||||||
|
* \def DEVICE |
||||||
|
* This macro is defined when this code is being compiled for the device. The difference between |
||||||
|
 * this and CUDACC is that this should exclusively be used to decide if code is being compiled |
||||||
|
 * to execute on the device. CUDACC only determines which compiler is being used. |
||||||
|
*/ |
||||||
|
#define DEVICE |
||||||
|
|
||||||
|
/**
|
||||||
|
* \def HD |
||||||
|
* Mark a function in front with this if it needs to be callable on both the |
||||||
|
* CPU and CUDA device. |
||||||
|
*/ |
||||||
|
#define HD |
||||||
|
|
||||||
|
/**
|
||||||
|
* \def SHARED |
||||||
|
* Mark a variable as static shared memory. |
||||||
|
*/ |
||||||
|
#define SHARED |
||||||
|
|
||||||
|
/**
|
||||||
|
* \def DECLARE_KERNEL(call, ...) |
||||||
|
* Used to declare (in header) a CUDA kernel. |
||||||
|
* \param call the name of the kernel |
||||||
|
* \param ... the arguments of the kernel |
||||||
|
*/ |
||||||
|
#define DECLARE_KERNEL(call, ...) |
||||||
|
|
||||||
|
/**
|
||||||
|
* \def DEFINE_KERNEL(call, ...) |
||||||
|
* Used to define (in implementation) a CUDA kernel. |
||||||
|
* \param call the name of the kernel |
||||||
|
* \param ... the arguments of the kernel |
||||||
|
*/ |
||||||
|
#define DEFINE_KERNEL(call, ...) |
||||||
|
|
||||||
|
/**
|
||||||
|
* \def KERNEL(call, settings, ...) |
||||||
|
* Used to call a CUDA kernel. |
||||||
|
* \param call the name of the kernel |
||||||
|
* \param settings the associated CudaTools::Kernel::Settings to initialize the kernel with |
||||||
|
* \param ... the arguments of the kernel |
||||||
|
*/ |
||||||
|
#define KERNEL(call, settings, ...) |
||||||
|
|
||||||
|
/**
|
||||||
|
* \def BASIC_LOOP(N) |
||||||
|
 * Can be used in conjunction with CudaTools::Kernel::basic, which is mainly used for embarrassingly |
||||||
|
* parallel situations. Exposes the loop/thread number as iThread. |
||||||
|
* \param N number of iterations |
||||||
|
*/ |
||||||
|
#define BASIC_LOOP(N) |
||||||
|
|
||||||
|
/**
|
||||||
|
* \def DEVICE_CLASS(name) |
||||||
|
* Can be used inside a class declaration (header) which generates boilerplate code to allow this |
||||||
|
* class to be used on the device. |
||||||
|
* |
||||||
|
* This macro creates a few functions:\n |
||||||
|
* name* that(): returns the pointer to this instance on the device. |
||||||
|
* |
||||||
|
* void allocateDevice(): allocates the memory on the device for this class instance. |
||||||
|
* |
||||||
|
* CudaTools::StreamID updateHost(const CudaTools::StreamID& stream): updates the host instance |
||||||
|
* of the class. |
||||||
|
* |
||||||
|
* CudaTools::StreamID updateDevice(const CudaTools::StreamID& stream): updates |
||||||
|
* the device instance of the class. |
||||||
|
* \param name the name of the class |
||||||
|
*/ |
||||||
|
#define DEVICE_CLASS(name) |
||||||
|
|
||||||
|
/**
|
||||||
|
* \def CT_ERROR_IF(a, op, b, msg) |
||||||
|
* Used for throwing runtime errors given a condition with an operator. |
||||||
|
*/ |
||||||
|
#define CT_ERROR_IF(a, op, b, msg) |
||||||
|
|
||||||
|
/**
|
||||||
|
* \def CT_ERROR(a, msg) |
||||||
|
* Used for throwing runtime errors given a bool. |
||||||
|
*/ |
||||||
|
#define CT_ERROR(a, msg) |
||||||
|
|
||||||
|
/**
|
||||||
|
* \def CUDA_CHECK(call) |
||||||
|
* Gets the error generated by a CUDA function call if there is one. |
||||||
|
* \param call CUDA function to check if there are errors when running. |
||||||
|
*/ |
||||||
|
#define CUDA_CHECK(call) |
||||||
|
|
||||||
|
/**
|
||||||
|
* \def CUBLAS_CHECK(call) |
||||||
|
* Gets the error generated by a cuBLAS function call if there is one. |
||||||
|
* \param call cuBLAS function to check if there are errors when running. |
||||||
|
*/ |
||||||
|
#define CUBLAS_CHECK(call) |
||||||
|
|
||||||
|
/**
|
||||||
|
 * \def CUDA_MEM(call, name) |
||||||
|
 * Reports the GPU memory used by a function call. |
||||||
|
* \param call function to measure memory usage. |
||||||
|
* \param name an identifier to use as a variable and when printing. Must satisfy variable naming. |
||||||
|
*/ |
||||||
|
#define CUDA_MEM(call, name) |
||||||
|
#endif |
||||||
|
|
||||||
|
///////////////////
|
||||||
|
// KERNEL MACROS //
|
||||||
|
///////////////////
|
||||||
|
|
||||||
|
#ifdef CUDACC |
||||||
|
|
||||||
|
#include <cublas_v2.h> |
||||||
|
#include <cuda_runtime.h> |
||||||
|
|
||||||
|
#define HD __host__ __device__ |
||||||
|
#define SHARED __shared__ |
||||||
|
|
||||||
|
#define DECLARE_KERNEL(call, ...) __global__ void call(__VA_ARGS__) |
||||||
|
|
||||||
|
#define DEFINE_KERNEL(call, ...) \ |
||||||
|
template CudaTools::StreamID CudaTools::runKernel( \
|
||||||
|
void (*)(__VA_ARGS__), const CudaTools::Kernel::Settings&, __VA_ARGS__); \
|
||||||
|
__global__ void call(__VA_ARGS__) |
||||||
|
|
||||||
|
#else |
||||||
|
#define HD |
||||||
|
#define SHARED |
||||||
|
|
||||||
|
#define DECLARE_KERNEL(call, ...) void call(__VA_ARGS__) |
||||||
|
|
||||||
|
#define DEFINE_KERNEL(call, ...) \ |
||||||
|
template CudaTools::StreamID CudaTools::runKernel( \
|
||||||
|
void (*)(__VA_ARGS__), const CudaTools::Kernel::Settings&, __VA_ARGS__); \
|
||||||
|
void call(__VA_ARGS__) |
||||||
|
|
||||||
|
#endif // CUDACC
|
||||||
|
|
||||||
|
#define KERNEL(call, settings, ...) CudaTools::runKernel(call, settings, __VA_ARGS__) |
||||||
|
|
||||||
|
///////////////////
|
||||||
|
// DEVICE MACROS //
|
||||||
|
///////////////////
|
||||||
|
|
||||||
|
#ifdef DEVICE |
||||||
|
|
||||||
|
#define BASIC_LOOP(N) \ |
||||||
|
uint32_t iThread = blockIdx.x * blockDim.x + threadIdx.x; \
|
||||||
|
if (iThread < N) |
||||||
|
#else |
||||||
|
#define BASIC_LOOP(N) _Pragma("omp parallel for") for (uint32_t iThread = 0; iThread < N; ++iThread) |
||||||
|
|
||||||
|
#endif |
||||||
|
|
||||||
|
//////////////////
|
||||||
|
// CLASS MACROS //
|
||||||
|
//////////////////
|
||||||
|
|
||||||
|
#define UPDATE_FUNC(name) \ |
||||||
|
inline CudaTools::StreamID updateHost(const CudaTools::StreamID& stream = \
|
||||||
|
CudaTools::DEF_MEM_STREAM) { \
|
||||||
|
return CudaTools::pull(this, that(), sizeof(name), stream); \
|
||||||
|
}; \
|
||||||
|
inline CudaTools::StreamID updateDevice(const CudaTools::StreamID& stream = \
|
||||||
|
CudaTools::DEF_MEM_STREAM) { \
|
||||||
|
return CudaTools::push(this, that(), sizeof(name), stream); \
|
||||||
|
} |
||||||
|
|
||||||
|
#ifdef CUDA |
||||||
|
|
||||||
|
#define DEVICE_CLASS(name) \ |
||||||
|
private: \
|
||||||
|
name* __deviceInstance__ = nullptr; \
|
||||||
|
\
|
||||||
|
public: \
|
||||||
|
inline name* that() { return __deviceInstance__; } \
|
||||||
|
inline void allocateDevice() { __deviceInstance__ = (name*)CudaTools::malloc(sizeof(name)); }; \
|
||||||
|
UPDATE_FUNC(name) |
||||||
|
|
||||||
|
#else |
||||||
|
#define DEVICE_CLASS(name) \ |
||||||
|
public: \
|
||||||
|
inline name* that() { return this; }; \
|
||||||
|
inline void allocateDevice(){}; \
|
||||||
|
UPDATE_FUNC(name) |
||||||
|
|
||||||
|
#endif |
||||||
|
|
||||||
|
#ifndef CUDATOOLS_ARRAY_MAX_AXES |
||||||
|
/**
|
||||||
|
* \def CUDATOOLS_ARRAY_MAX_AXES |
||||||
|
 * The maximum number of axes/dimensions a CudaTools::Array can have. The default is |
||||||
|
 * set to 4, but can be manually set to fit the program's needs. |
||||||
|
*/ |
||||||
|
#define CUDATOOLS_ARRAY_MAX_AXES 4 |
||||||
|
#endif |
||||||
|
|
||||||
|
////////////////////
|
||||||
|
// Error Checking //
|
||||||
|
////////////////////
|
||||||
|
|
||||||
|
#ifndef NO_DIMENSION_CHECK |
||||||
|
#ifdef DEVICE |
||||||
|
#define CT_ERROR_IF(a, op, b, msg) \ |
||||||
|
if (a op b) { \
|
||||||
|
printf("[ERROR] %s:%d\n | %s: (" #a ") " #op " (" #b ").\n", __FILE__, __LINE__, msg); \
|
||||||
|
} |
||||||
|
|
||||||
|
#define CT_ERROR(a, msg) \ |
||||||
|
if (a) { \
|
||||||
|
printf("[ERROR] %s:%d\n | %s: " #a ".\n", __FILE__, __LINE__, msg); \
|
||||||
|
} |
||||||
|
#else |
||||||
|
|
||||||
|
#define CT_ERROR_IF(a, op, b, msg) \ |
||||||
|
if (a op b) { \
|
||||||
|
std::ostringstream os_a; \
|
||||||
|
std::ostringstream os_b; \
|
||||||
|
os_a << a; \
|
||||||
|
os_b << b; \
|
||||||
|
printf("[ERROR] %s:%d\n | %s: (" #a ")%s " #op " (" #b ")%s.\n", __FILE__, __LINE__, msg, \
|
||||||
|
os_a.str().c_str(), os_b.str().c_str()); \
|
||||||
|
throw std::exception(); \
|
||||||
|
} |
||||||
|
|
||||||
|
#define CT_ERROR(a, msg) \ |
||||||
|
if (a) { \
|
||||||
|
printf("[ERROR] %s:%d\n | %s: " #a ".\n", __FILE__, __LINE__, msg); \
|
||||||
|
throw std::exception(); \
|
||||||
|
} |
||||||
|
#endif |
||||||
|
|
||||||
|
#endif // NO_DIMENSION_CHECK
|
||||||
|
|
||||||
|
#if defined(CUDACC) && !defined(NO_CUDA_CHECK) |
||||||
|
|
||||||
|
#define CUDA_CHECK(call) \ |
||||||
|
do { \
|
||||||
|
cudaError_t err = (call); \
|
||||||
|
if (err != cudaSuccess) { \
|
||||||
|
printf("[CUDA] %s:%d\n | %s\n", __FILE__, __LINE__, cudaGetErrorString(err)); \
|
||||||
|
throw std::exception(); \
|
||||||
|
} \
|
||||||
|
} while (0) |
||||||
|
|
||||||
|
#define CUBLAS_CHECK(call) \ |
||||||
|
do { \
|
||||||
|
cublasStatus_t err = (call); \
|
||||||
|
if (err != CUBLAS_STATUS_SUCCESS) { \
|
||||||
|
printf("[cuBLAS] %s:%d\n | %s\n", __FILE__, __LINE__, \
|
||||||
|
CudaTools::cublasGetErrorString(err)); \
|
||||||
|
throw std::exception(); \
|
||||||
|
} \
|
||||||
|
} while (0) |
||||||
|
|
||||||
|
#define CUDA_MEM(call, name) \ |
||||||
|
size_t free_bef_##name, free_aft_##name; \
|
||||||
|
cudaMemGetInfo(&free_bef_##name, NULL); \
|
||||||
|
call; \
|
||||||
|
CudaTools::Manager::get()->sync(); \
|
||||||
|
cudaMemGetInfo(&free_aft_##name, NULL); \
|
||||||
|
printf("[%s] GPU Memory Usage: %iMiB\n", #name, \
|
||||||
|
(free_bef_##name - free_aft_##name) / (1024 * 1024)); |
||||||
|
|
||||||
|
#else |
||||||
|
#define CUDA_CHECK(call) (call) |
||||||
|
#define CUBLAS_CHECK(call) (call) |
||||||
|
#define CUDA_MEM(call, name) (call) |
||||||
|
#endif |
||||||
|
|
||||||
|
#endif // MACROS_H
|
@ -0,0 +1,95 @@ |
|||||||
|
CC := g++-10
|
||||||
|
NVCC := nvcc
|
||||||
|
CFLAGS := -Wall -std=c++17 -fopenmp -MMD
|
||||||
|
NVCC_FLAGS := -MMD -w -Xcompiler
|
||||||
|
|
||||||
|
INCLUDE :=
|
||||||
|
LIBS_DIR :=
|
||||||
|
LIBS_DIR_GPU := /usr/local/cuda/lib64
|
||||||
|
LIBS :=
|
||||||
|
LIBS_GPU := cuda cudart cublas
|
||||||
|
|
||||||
|
TARGET = tests
|
||||||
|
SRC_DIR = .
|
||||||
|
BUILD_DIR = build
|
||||||
|
|
||||||
|
# Should not need to modify below.
|
||||||
|
|
||||||
|
CPU_BUILD_DIR = $(BUILD_DIR)/cpu
|
||||||
|
GPU_BUILD_DIR = $(BUILD_DIR)/gpu
|
||||||
|
|
||||||
|
SRC = $(wildcard $(SRC_DIR)/*/*.cpp) $(wildcard $(SRC_DIR)/*.cpp)
|
||||||
|
|
||||||
|
# Get source files and object files.
|
||||||
|
GCC_SRC = $(filter-out %.cu.cpp ,$(SRC))
|
||||||
|
NVCC_SRC = $(filter %.cu.cpp, $(SRC))
|
||||||
|
GCC_OBJ = $(GCC_SRC:$(SRC_DIR)/%.cpp=%.o)
|
||||||
|
NVCC_OBJ = $(NVCC_SRC:$(SRC_DIR)/%.cpp=%.o)
|
||||||
|
|
||||||
|
# If compiling for CPU, all go to GCC. Otherwise, they are split.
|
||||||
|
CPU_OBJ = $(addprefix $(CPU_BUILD_DIR)/,$(GCC_OBJ)) $(addprefix $(CPU_BUILD_DIR)/,$(NVCC_OBJ))
|
||||||
|
GPU_GCC_OBJ = $(addprefix $(GPU_BUILD_DIR)/,$(GCC_OBJ))
|
||||||
|
GPU_NVCC_OBJ = $(addprefix $(GPU_BUILD_DIR)/,$(NVCC_OBJ))
|
||||||
|
|
||||||
|
# $(info $$GCC_SRC is [${GCC_SRC}])
|
||||||
|
# $(info $$NVCC_SRC is [${NVCC_SRC}])
|
||||||
|
# $(info $$GCC_OBJ is [${GCC_OBJ}])
|
||||||
|
# $(info $$NVCC_OBJ is [${NVCC_OBJ}])
|
||||||
|
|
||||||
|
# $(info $$CPU_OBJ is [${CPU_OBJ}])
|
||||||
|
# $(info $$GPU_GCC_OBJ is [${GPU_GCC_OBJ}])
|
||||||
|
# $(info $$GPU_NVCC_OBJ is [${GPU_NVCC_OBJ}])
|
||||||
|
|
||||||
|
HEADER = $(wildcard $(SRC_DIR)/*/*.h) $(wildcard $(SRC_DIR)/*.h)
|
||||||
|
CPU_DEPS = $(wildcard $(CPU_BUILD_DIR)/*.d)
|
||||||
|
GPU_DEPS = $(wildcard $(GPU_BUILD_DIR)/*.d)
|
||||||
|
|
||||||
|
INC := $(INCLUDE:%=-I%)
|
||||||
|
LIB := $(LIBS_DIR:%=-L%)
|
||||||
|
LIB_GPU := $(LIBS_DIR_GPU:%=-L%)
|
||||||
|
LD := $(LIBS:%=-l%)
|
||||||
|
LD_GPU := $(LIBS_GPU:%=-l%)
|
||||||
|
|
||||||
|
# Reminder:
|
||||||
|
# $< = first prerequisite
|
||||||
|
# $@ = the target which matched the rule
|
||||||
|
# $^ = all prerequisites
|
||||||
|
|
||||||
|
.PHONY: all clean |
||||||
|
|
||||||
|
all : cpu gpu |
||||||
|
|
||||||
|
cpu: $(TARGET)CPU |
||||||
|
gpu: $(TARGET)GPU |
||||||
|
|
||||||
|
$(TARGET)CPU: $(CPU_OBJ) |
||||||
|
$(CC) $(CFLAGS) $^ -o $@ $(INC) $(LIB) $(LDFLAGS)
|
||||||
|
|
||||||
|
$(CPU_BUILD_DIR)/%.o $(CPU_BUILD_DIR)/%.cu.o: $(SRC_DIR)/%.cpp | $(CPU_BUILD_DIR) |
||||||
|
$(CC) $(CFLAGS) -c -o $@ $< $(INC)
|
||||||
|
|
||||||
|
# For GPU, we need to build the NVCC objects, the NVCC linked object, and the
|
||||||
|
# regular ones. Then, we link them all together.
|
||||||
|
$(TARGET)GPU: $(GPU_BUILD_DIR)/link.o $(GPU_GCC_OBJ) | $(GPU_BUILD_DIR) |
||||||
|
$(CC) -g -DCUDA $(CFLAGS) $(GPU_NVCC_OBJ) $^ -o $@ $(INC) $(LIB) $(LIB_GPU) $(LD) $(LD_GPU)
|
||||||
|
|
||||||
|
$(GPU_BUILD_DIR)/link.o: $(GPU_NVCC_OBJ) | $(GPU_BUILD_DIR) |
||||||
|
$(NVCC) --device-link $^ -o $@
|
||||||
|
|
||||||
|
$(GPU_BUILD_DIR)/%.cu.o: $(SRC_DIR)/%.cu.cpp | $(GPU_BUILD_DIR) |
||||||
|
$(NVCC) $(NVCC_FLAGS) -DCUDA -x cu --device-c -o $@ $< $(INC)
|
||||||
|
|
||||||
|
$(GPU_BUILD_DIR)/%.o: $(SRC_DIR)/%.cpp | $(GPU_BUILD_DIR) |
||||||
|
$(CC) $(CFLAGS) -g -DCUDA -c -o $@ $< $(INC)
|
||||||
|
|
||||||
|
-include $(CPU_DEPS) |
||||||
|
-include $(GPU_DEPS) |
||||||
|
|
||||||
|
$(CPU_BUILD_DIR): |
||||||
|
mkdir -p $@
|
||||||
|
|
||||||
|
$(GPU_BUILD_DIR): |
||||||
|
mkdir -p $@
|
||||||
|
|
||||||
|
clean: |
||||||
|
rm -Rf $(BUILD_DIR) $(TARGET)CPU $(TARGET)GPU
|
@ -0,0 +1,40 @@ |
|||||||
|
========= |
||||||
|
CudaTools |
||||||
|
========= |
||||||
|
This is the documentation for CudaTools, a header-only library and framework |
||||||
|
for the development of CPU-CUDA compatible applications. Using CudaTools enables |
||||||
|
the creation of a single unified codebase that has both CPU and CUDA compilation targets, with minimal need to |
||||||
|
introduce ``#ifdef`` statements when code is essentially identical between the targets. |
||||||
|
|
||||||
|
For information on the library itself and its usage, view the `documentation <https://acem.ece.illinois.edu/code/CudaTools>`__. The small code snippets and samples |
||||||
|
seen in the documentation are in the folder ``samples``. |
||||||
|
|
||||||
|
Dependencies |
||||||
|
============ |
||||||
|
- Eigen |
||||||
|
|
||||||
|
In the future, we will make this dependency optional, but still provide support |
||||||
|
for it. As of now, it is necessary. |
||||||
|
|
||||||
|
Building the Documentation |
||||||
|
========================== |
||||||
|
The documentation is built with `Doxygen <https://doxygen.nl/>`__ and `Sphinx <https://www.sphinx-doc.org/en>`__. |
||||||
|
First, make sure you have Doxygen installed on your system and that it is added |
||||||
|
to your system path. Then, you will have to create a Python virtual environment |
||||||
|
in the repository folder |
||||||
|
|
||||||
|
.. code-block:: bash |
||||||
|
|
||||||
|
$ python3 -m venv .venv |
||||||
|
|
||||||
|
After activating the environment (e.g. with ``source .venv/bin/activate`` in a POSIX shell) and installing the required Python packages |
||||||
|
|
||||||
|
.. code-block:: bash |
||||||
|
|
||||||
|
$ pip install -r requirements |
||||||
|
|
||||||
|
you can now run the script |
||||||
|
|
||||||
|
.. code-block:: bash |
||||||
|
|
||||||
|
$ ./build_docs |
@ -0,0 +1,2 @@ |
|||||||
|
doxygen docs/Doxyfile |
||||||
|
sphinx-build -b html docs/source docs/build/html |
File diff suppressed because it is too large
@ -0,0 +1,20 @@ |
|||||||
|
# Minimal makefile for Sphinx documentation
|
||||||
|
#
|
||||||
|
|
||||||
|
# You can set these variables from the command line, and also
|
||||||
|
# from the environment for the first two.
|
||||||
|
SPHINXOPTS ?=
|
||||||
|
SPHINXBUILD ?= sphinx-build
|
||||||
|
SOURCEDIR = source
|
||||||
|
BUILDDIR = build
|
||||||
|
|
||||||
|
# Put it first so that "make" without argument is like "make help".
|
||||||
|
help: |
||||||
|
@$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
|
||||||
|
|
||||||
|
.PHONY: help Makefile |
||||||
|
|
||||||
|
# Catch-all target: route all unknown targets to Sphinx using the new
|
||||||
|
# "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS).
|
||||||
|
%: Makefile |
||||||
|
@$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
|
@ -0,0 +1,35 @@ |
|||||||
|
@ECHO OFF |
||||||
|
|
||||||
|
pushd %~dp0 |
||||||
|
|
||||||
|
REM Command file for Sphinx documentation |
||||||
|
|
||||||
|
if "%SPHINXBUILD%" == "" ( |
||||||
|
set SPHINXBUILD=sphinx-build |
||||||
|
) |
||||||
|
set SOURCEDIR=source |
||||||
|
set BUILDDIR=build |
||||||
|
|
||||||
|
if "%1" == "" goto help |
||||||
|
|
||||||
|
%SPHINXBUILD% >NUL 2>NUL |
||||||
|
if errorlevel 9009 ( |
||||||
|
echo. |
||||||
|
echo.The 'sphinx-build' command was not found. Make sure you have Sphinx |
||||||
|
echo.installed, then set the SPHINXBUILD environment variable to point |
||||||
|
echo.to the full path of the 'sphinx-build' executable. Alternatively you |
||||||
|
echo.may add the Sphinx directory to PATH. |
||||||
|
echo. |
||||||
|
echo.If you don't have Sphinx installed, grab it from |
||||||
|
echo.http://sphinx-doc.org/ |
||||||
|
exit /b 1 |
||||||
|
) |
||||||
|
|
||||||
|
%SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% |
||||||
|
goto end |
||||||
|
|
||||||
|
:help |
||||||
|
%SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% |
||||||
|
|
||||||
|
:end |
||||||
|
popd |
@ -0,0 +1,39 @@ |
|||||||
|
const sections = { |
||||||
|
"mesh_prep": 1, |
||||||
|
"matrix_assembly": 2, |
||||||
|
"bc_calc": 3, |
||||||
|
"timestep": 4, |
||||||
|
} |
||||||
|
|
||||||
|
|
||||||
|
window.MathJax = { |
||||||
|
loader: {load: ['[tex]/tagformat', '[tex]/ams']}, |
||||||
|
tex: { |
||||||
|
packages: {'[+]': ['tagformat', 'ams']}, |
||||||
|
macros: { |
||||||
|
dd: "{\\, \\mathrm{d}}", |
||||||
|
E: "{\\mathbf{E}}", |
||||||
|
H: "{\\mathbf{H}}", |
||||||
|
J: "{\\mathbf{J}}", |
||||||
|
D: "{\\mathbf{D}}", |
||||||
|
B: "{\\mathbf{B}}", |
||||||
|
M: "{\\mathbf{M}}", |
||||||
|
tbE: "{\\tilde{\\E}}", |
||||||
|
tbH: "{\\tilde{\\H}}", |
||||||
|
tE: "{\\tilde{E}}", |
||||||
|
tH: "{\\tilde{H}}", |
||||||
|
tphi: "{\\tilde{\\phi}}", |
||||||
|
curl: ["{\\nabla \\times {#1}}", 1], |
||||||
|
div: ["{\\nabla \\cdot {#1}}", 1], |
||||||
|
tens: ["{\\bar{\\bar{{#1}}}}", 1], |
||||||
|
}, |
||||||
|
tags: 'ams', |
||||||
|
tagformat: { |
||||||
|
number: (n) => sections[window.location.pathname.split("/").pop().split(".")[0]] + '.' + n, |
||||||
|
}, |
||||||
|
ams: { |
||||||
|
multilineWidth: '100%', |
||||||
|
multilineIndent: '50em' |
||||||
|
} |
||||||
|
}, |
||||||
|
} |
@ -0,0 +1,26 @@ |
|||||||
|
======= |
||||||
|
Array.h |
||||||
|
======= |
||||||
|
|
||||||
|
The ``Array.h`` header file contains the Array class, and its related classes. For this |
||||||
|
file only, assume that every function is callable on both host and device unless |
||||||
|
explicitly mentioned otherwise. |
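
As a small, illustrative sketch of the kinds of objects documented here, a ``CudaTools::Shape``
describes the dimensions of an ``Array``. The snippet below only uses members shown in this
documentation; the values in the comments follow from the column-major layout described for
``Shape``.

.. code-block:: cpp

    CudaTools::Shape s({3, 4}); // A 2D shape with 3 rows and 4 columns.
    s.items();                  // 12 total items.
    s.rows();                   // 3
    s.cols();                   // 4
    s.subshape(1);              // The 1D shape of the last axis, with length 4.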
||||||
|
|
||||||
|
CudaTools::Shape |
||||||
|
---------------- |
||||||
|
.. doxygenclass:: CudaTools::Shape |
||||||
|
:members: |
||||||
|
:allow-dot-graphs: |
||||||
|
|
||||||
|
CudaTools::ArrayIterator<T> |
||||||
|
--------------------------- |
||||||
|
.. doxygenclass:: CudaTools::ArrayIterator |
||||||
|
:members: |
||||||
|
:allow-dot-graphs: |
||||||
|
|
||||||
|
CudaTools::Array<T> |
||||||
|
------------------- |
||||||
|
.. doxygenclass:: CudaTools::Array |
||||||
|
:members: |
||||||
|
:private-members: |
||||||
|
:allow-dot-graphs: |
@ -0,0 +1,45 @@ |
|||||||
|
====== |
||||||
|
BLAS.h |
||||||
|
====== |
||||||
|
|
||||||
|
The ``BLAS.h`` header file contains some BLAS functions, and some related |
||||||
|
classes for those functions. |
||||||
|
|
||||||
|
BLAS Functions |
||||||
|
============== |
||||||
|
Currently, these are the supported BLAS functions. They are inherited mainly |
||||||
|
from the cuBLAS API, and condensed into unified functions. The plan is to |
||||||
|
add more as necessary. |
||||||
|
|
||||||
|
CudaTools::BLAS::GEMV<T> |
||||||
|
------------------------ |
||||||
|
.. doxygenfunction:: CudaTools::BLAS::GEMV |
||||||
|
|
||||||
|
CudaTools::BLAS::GEMM<T> |
||||||
|
------------------------ |
||||||
|
.. doxygenfunction:: CudaTools::BLAS::GEMM |
||||||
|
|
||||||
|
CudaTools::BLAS::DGMM<T> |
||||||
|
------------------------ |
||||||
|
.. doxygenfunction:: CudaTools::BLAS::DGMM |
||||||
|
|
||||||
|
BLAS Classes |
||||||
|
============ |
||||||
|
|
||||||
|
These classes also inherit functions from the cuBLAS API, but are packaged |
||||||
|
into classes that are more intuitive and hide external details. |
||||||
|
|
||||||
|
CudaTools::BLAS::Batch<T> |
||||||
|
------------------------- |
||||||
|
.. doxygenclass:: CudaTools::BLAS::Batch |
||||||
|
:members: |
||||||
|
|
||||||
|
CudaTools::BLAS::PLUArray<T> |
||||||
|
---------------------------- |
||||||
|
.. doxygenclass:: CudaTools::BLAS::PLUArray |
||||||
|
:members: |
||||||
|
|
||||||
|
CudaTools::BLAS::PLUBatch<T> |
||||||
|
---------------------------- |
||||||
|
.. doxygenclass:: CudaTools::BLAS::PLUBatch |
||||||
|
:members: |
@ -0,0 +1,53 @@ |
|||||||
|
# Configuration file for the Sphinx documentation builder. |
||||||
|
|
||||||
|
# -- Project information |
||||||
|
|
||||||
|
project = 'DGEMS' |
||||||
|
copyright = '2022' |
||||||
|
author = 'Kenneth Jao, Qi Jian Lim' |
||||||
|
|
||||||
|
release = '0.1' |
||||||
|
version = '0.1.0' |
||||||
|
|
||||||
|
# -- General configuration |
||||||
|
|
||||||
|
html_static_path = ["_static"] |
||||||
|
html_js_files = ["js/mathjax-config.js"] |
||||||
|
|
||||||
|
extensions = [ |
||||||
|
'sphinx.ext.duration', |
||||||
|
'sphinx.ext.doctest', |
||||||
|
'sphinx.ext.autodoc', |
||||||
|
'sphinx.ext.autosummary', |
||||||
|
'sphinx.ext.autosectionlabel', |
||||||
|
'sphinx.ext.intersphinx', |
||||||
|
'sphinx.ext.mathjax', |
||||||
|
'sphinx.ext.graphviz', |
||||||
|
'sphinxcontrib.bibtex', |
||||||
|
'breathe', |
||||||
|
] |
||||||
|
|
||||||
|
breathe_projects = {"DGEMS": "../build/xml"} |
||||||
|
breathe_default_project = "DGEMS" |
||||||
|
|
||||||
|
bibtex_bibfiles = ['refs.bib'] |
||||||
|
|
||||||
|
mathjax_path = "https://cdn.jsdelivr.net/npm/mathjax@3/es5/tex-mml-chtml.js" |
||||||
|
|
||||||
|
intersphinx_mapping = { |
||||||
|
'python': ('https://docs.python.org/3/', None), |
||||||
|
'sphinx': ('https://www.sphinx-doc.org/en/master/', None), |
||||||
|
} |
||||||
|
intersphinx_disabled_domains = ['std'] |
||||||
|
|
||||||
|
templates_path = ['_templates'] |
||||||
|
|
||||||
|
# -- Options for HTML output |
||||||
|
|
||||||
|
html_theme = 'sphinx_rtd_theme' |
||||||
|
html_theme_options = { |
||||||
|
'collapse_navigation': False, |
||||||
|
} |
||||||
|
|
||||||
|
# -- Options for EPUB output |
||||||
|
epub_show_urls = 'footnote' |
@ -0,0 +1,67 @@ |
|||||||
|
====== |
||||||
|
Core.h |
||||||
|
====== |
||||||
|
|
||||||
|
The ``Core.h`` header file defines several compiler flags and macros along with |
||||||
|
a few core classes. |
||||||
|
|
||||||
|
Flags |
||||||
|
===== |
||||||
|
|
||||||
|
Device Indicators |
||||||
|
----------------- |
||||||
|
.. doxygendefine:: CUDACC |
||||||
|
.. doxygendefine:: DEVICE |
||||||
|
|
||||||
|
Host-Device Automation |
||||||
|
---------------------- |
||||||
|
.. doxygendefine:: HD |
||||||
|
.. doxygendefine:: SHARED |
||||||
|
|
||||||
|
Compilation Options |
||||||
|
------------------- |
||||||
|
.. doxygendefine:: CUDATOOLS_ARRAY_MAX_AXES |
||||||
|
|
||||||
|
Macros |
||||||
|
====== |
||||||
|
|
||||||
|
Kernel |
||||||
|
------ |
||||||
|
.. doxygendefine:: DECLARE_KERNEL |
||||||
|
.. doxygendefine:: DEFINE_KERNEL |
||||||
|
.. doxygendefine:: KERNEL |
||||||
|
|
||||||
|
Device Helpers |
||||||
|
-------------- |
||||||
|
|
||||||
|
.. doxygendefine:: BASIC_LOOP |
||||||
|
|
||||||
|
Device Class |
||||||
|
------------ |
||||||
|
|
||||||
|
.. doxygendefine:: DEVICE_CLASS |
||||||
|
|
||||||
|
|
||||||
|
Classes and Structs |
||||||
|
=================== |
||||||
|
|
||||||
|
CudaTools::StreamID |
||||||
|
------------------- |
||||||
|
|
||||||
|
.. doxygenstruct:: CudaTools::StreamID |
||||||
|
|
||||||
|
CudaTools::Manager |
||||||
|
------------------ |
||||||
|
|
||||||
|
.. doxygenclass:: CudaTools::Manager |
||||||
|
:members: |
||||||
|
|
||||||
|
CudaTools::Kernel::Settings |
||||||
|
--------------------------- |
||||||
|
|
||||||
|
.. doxygenstruct:: CudaTools::Kernel::Settings |
||||||
|
:members: |
||||||
|
|
||||||
|
CudaTools::Kernel::Basic |
||||||
|
------------------------ |
||||||
|
.. doxygenfunction:: CudaTools::Kernel::basic |
@ -0,0 +1,25 @@ |
|||||||
|
========= |
||||||
|
CudaTools |
||||||
|
========= |
||||||
|
This is the documentation for CudaTools, a header-only library and framework |
||||||
|
for the development of CPU-CUDA compatible applications. Using CudaTools enables |
||||||
|
the creation of a single unified codebase that has both CPU and CUDA compilation targets, with minimal need to |
||||||
|
introduce ``#ifdef`` statements when code is essentially identical between the targets. |
||||||
|
|
||||||
|
To get started, please head over to the :doc:`usage` section. For more detail on the |
||||||
|
machinery underneath, please refer to the other sections. |
||||||
|
|
||||||
|
.. note:: |
||||||
|
|
||||||
|
If you would like to contribute, please visit the `git page <https://git.acem.ece.illinois.edu/kjao/CudaTools>`__. |
||||||
|
|
||||||
|
Contents |
||||||
|
======== |
||||||
|
|
||||||
|
.. toctree:: |
||||||
|
:maxdepth: 2 |
||||||
|
|
||||||
|
usage |
||||||
|
core |
||||||
|
array |
||||||
|
blas |
@ -0,0 +1,128 @@ |
|||||||
|
================== |
||||||
|
Usage and Examples |
||||||
|
================== |
||||||
|
|
||||||
|
|
||||||
|
This library is broken up into three main parts, as well as a |
||||||
|
compilation and linking framework: |
||||||
|
|
||||||
|
#. :ref:`Core Examples` |
||||||
|
#. :ref:`Array Examples` |
||||||
|
#. :ref:`BLAS Examples` |
||||||
|
#. :ref:`Compilation and Linking` |
||||||
|
|
||||||
|
The ``Core.h`` header contains the necessary macros, flags and objects for interfacing with |
||||||
|
basic kernel launching and the CUDA Runtime API. The ``Array.h`` header contains the ``CudaTools::Array`` |
||||||
|
class, which provides a device-compatible Array-like class with easy memory management. Next, |
||||||
|
the ``BLAS.h`` header provides BLAS functions through the cuBLAS library for the GPU, |
||||||
|
and Eigen for the CPU. Lastly, a templated Makefile is provided which can be used |
||||||
|
for your own project, after following a few rules. |
||||||
|
|
||||||
|
The usage of this library will be illustrated through examples, and further details |
||||||
|
can be found in the other sections. The examples are given in the `samples <https://git.acem.ece.illinois.edu/kjao/CudaTools/src/branch/main/samples>`__ folder. |
||||||
|
Throughout this documentation, there are a few common terms that may appear. First, we refer to the CPU as the host, and the GPU as the device. So, a host function refers |
||||||
|
to a function runnable on the CPU, and a device function refers to a function that is runnable |
||||||
|
on a device. A kernel is a specific function that the host can call to be run on the device. |
||||||
|
|
||||||
|
Core Examples |
||||||
|
============= |
||||||
|
This file mainly introduces compiler macros and a few classes that are used to improve the |
||||||
|
syntax between host and device code. To define and call a kernel, there are a few |
||||||
|
macros provided. For example, |
||||||
|
|
||||||
|
.. code-block:: cpp |
||||||
|
|
||||||
|
DEFINE_KERNEL(add, int x, int y) { |
||||||
|
printf("Kernel: %i\n", x + y); |
||||||
|
} |
||||||
|
|
||||||
|
int main() { |
||||||
|
KERNEL(add, CudaTools::Kernel::basic(1), 1, 1); // Prints 2. |
||||||
|
return 0; |
||||||
|
} |
||||||
|
|
||||||
|
The ``DEFINE_KERNEL(name, ...)`` macro takes in the function name and its arguments. |
||||||
|
The second argument of the ``KERNEL()`` macro gives the launch parameters for |
||||||
|
the kernel. The launch parameters have several items, but for 'embarrassingly parallel' |
||||||
|
cases, we can simply generate the settings from the number of threads. More detail on |
||||||
|
creating launch parameters can be found :ref:`here <CudaTools::Kernel::Settings>`. In the above example, |
||||||
|
there is only one thread. The rest of the arguments are just the kernel arguments. For more detail, |
||||||
|
see :ref:`here <Macros>`. |
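
For finer control, the launch parameters can also be filled in by hand. The following is a
minimal sketch that reuses the ``add`` kernel from above; the explicit grid and block sizes
are chosen so that it behaves exactly like ``Kernel::basic(1)``, and on the CPU target these
settings are simply ignored and the kernel runs as a regular function call.

.. code-block:: cpp

    CudaTools::Kernel::Settings sett;
    sett.setGridDim(1);  // One block in the grid ...
    sett.setBlockDim(1); // ... with a single thread, equivalent to Kernel::basic(1) here.
    sett.setStream(CudaTools::DEF_KERNEL_STREAM);

    KERNEL(add, sett, 40, 2).wait(); // Prints "Kernel: 42".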
||||||
|
|
||||||
|
.. warning:: |
||||||
|
These kernel definitions must be in a file that will be compiled by ``nvcc``. Also, |
||||||
|
for header files, there is an additional macro ``DECLARE_KERNEL(name, ...)`` to declare it |
||||||
|
and make it available to other files. |
||||||
|
|
||||||
|
Since many applications use classes, a macro is provided to 'convert' a class into |
||||||
|
a device-compatible one. In the same spirit as the previous example, |
||||||
|
|
||||||
|
.. code-block:: cpp |
||||||
|
|
||||||
|
class intPair { |
||||||
|
DEVICE_CLASS(intPair) |
||||||
|
public: |
||||||
|
int x, y; |
||||||
|
|
||||||
|
intPair(const int x_, const int y_) : x(x_), y(y_) { |
||||||
|
allocateDevice(); // Allocates memory for this intPair on the device. |
||||||
|
updateDevice().wait(); // Copies the memory on the host to the device and waits until finished. |
||||||
|
}; |
||||||
|
|
||||||
|
HD void swap() { |
||||||
|
int swap = x; |
||||||
|
x = y; |
||||||
|
y = swap; |
||||||
|
}; |
||||||
|
}; |
||||||
|
|
||||||
|
DEFINE_KERNEL(swap, intPair* const pair) { pair->swap(); } |
||||||
|
|
||||||
|
int main() { |
||||||
|
intPair pair(1, 2); |
||||||
|
printf("Before: %u, %u\n", pair.x, pair.y); // Prints 1, 2. |
||||||
|
|
||||||
|
KERNEL(swap, CudaTools::Kernel::basic(1), pair.that()).wait(); |
||||||
|
pair.updateHost().wait(); // Copies the memory from the device back to the host and waits until finished. |
||||||
|
|
||||||
|
printf("After: %u, %u\n", pair.x, pair.y); // Prints 2, 1. |
||||||
|
return 0; |
||||||
|
} |
||||||
|
|
||||||
|
In this example, we create a class called ``intPair``, which is then made available on the device through |
||||||
|
the ``DEVICE_CLASS(name)`` macro. Specifically, that macro introduces a few functions, like |
||||||
|
``allocateDevice()``, ``updateDevice()``, ``updateHost()``, and ``that()``. That last function |
||||||
|
returns a pointer to the copy on the device. For more details, see :ref:`here <Device Class>`. If we were to pass in the host pointer of the ``intPair`` to the kernel, there would be an illegal memory access. |
||||||
|
|
||||||
|
The kernel argument list **must** consist of pointers to objects, or non-reference objects. |
||||||
|
Otherwise, compilation will fail. In general this is safer, as it forces the programmer to |
||||||
|
acknowledge that the device copy is being passed. For the latter case of a non-reference object, |
||||||
|
you should only do this if there is no issue in creating a copy of the original object. In the above |
||||||
|
example, we could have done this, but for more complicated classes it may result in unwanted behavior. |
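
For instance, since ``intPair`` is small and safe to copy, a by-value variant of the kernel
above could look like the following sketch (the kernel name ``swapByValue`` is made up for
this illustration). Note that the device then operates on its own copy, so the host never
sees the swapped values.

.. code-block:: cpp

    // The intPair argument is copied when the kernel is launched, so the swap
    // only affects that copy and is lost once the kernel finishes.
    DEFINE_KERNEL(swapByValue, intPair pair) { pair.swap(); }

    int main() {
        intPair pair(1, 2);
        KERNEL(swapByValue, CudaTools::Kernel::basic(1), pair).wait();
        printf("After: %i, %i\n", pair.x, pair.y); // Still prints 1, 2.
        return 0;
    }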
||||||
|
|
||||||
|
Lastly, since the point of classes is usually to have member functions, to make them |
||||||
|
available on the device, you must mark them with the compiler macro ``HD`` in front. |
||||||
|
|
||||||
|
We also introduce the ``wait()`` function, which waits for the command to complete before |
||||||
|
continuing. Most calls that involve the device are asynchronous, so without proper blocking, |
||||||
|
operations dependent on a previous command are not guaranteed to run correctly. If the code is |
||||||
|
compiled for CPU, then everything will run synchronously, as per usual. |
||||||
|
|
||||||
|
.. note:: |
||||||
|
Almost all functions that are asynchronous provide an optional 'stream' argument, |
||||||
|
where you can give the name of the stream you wish to use. Different streams run |
||||||
|
asynchronously, but operations on the same stream are FIFO. To define a stream to use |
||||||
|
later, you must call ``CudaTools::Manager::get()->addStream("myStream")`` at some point |
||||||
|
before you use it. For more details, see :ref:`here <CudaTools::Manager>`. |
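
As a rough sketch of how a named stream might be used with the pieces shown above (the
stream name ``"myStream"`` is arbitrary):

.. code-block:: cpp

    CudaTools::Manager::get()->addStream("myStream"); // Create the stream once, up front.

    intPair pair(1, 2);
    // The kernel launch and the copy back are queued on the same stream, so they
    // run in order; only the final wait() blocks the host.
    KERNEL(swap, CudaTools::Kernel::basic(1, "myStream"), pair.that());
    pair.updateHost("myStream").wait();
    printf("After: %i, %i\n", pair.x, pair.y); // Prints 2, 1.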
||||||
|
|
||||||
|
|
||||||
|
Array Examples |
||||||
|
============== |
||||||
|
|
||||||
|
|
||||||
|
BLAS Examples |
||||||
|
============= |
||||||
|
|
||||||
|
|
||||||
|
Compilation and Linking |
||||||
|
======================= |
@ -0,0 +1,4 @@ |
|||||||
|
Sphinx>=5.1.1 |
||||||
|
sphinx-rtd-theme>=1.0.0 |
||||||
|
sphinxcontrib-bibtex>=2.5.0 |
||||||
|
breathe>=4.34.0 |
@ -0,0 +1,95 @@ |
|||||||
|
CC := g++-10
|
||||||
|
NVCC := nvcc
|
||||||
|
CFLAGS := -Wall -std=c++17 -fopenmp -MMD
|
||||||
|
NVCC_FLAGS := -MMD -w -Xcompiler
|
||||||
|
|
||||||
|
INCLUDE := ../../
|
||||||
|
LIBS_DIR :=
|
||||||
|
LIBS_DIR_GPU := /usr/local/cuda/lib64
|
||||||
|
LIBS :=
|
||||||
|
LIBS_GPU := cuda cudart cublas
|
||||||
|
|
||||||
|
TARGET = coreKernel
|
||||||
|
SRC_DIR = .
|
||||||
|
BUILD_DIR = build
|
||||||
|
|
||||||
|
# Should not need to modify below.
|
||||||
|
|
||||||
|
CPU_BUILD_DIR = $(BUILD_DIR)/cpu
|
||||||
|
GPU_BUILD_DIR = $(BUILD_DIR)/gpu
|
||||||
|
|
||||||
|
SRC = $(wildcard $(SRC_DIR)/*/*.cpp) $(wildcard $(SRC_DIR)/*.cpp)
|
||||||
|
|
||||||
|
# Get source files and object files.
|
||||||
|
GCC_SRC = $(filter-out %.cu.cpp ,$(SRC))
|
||||||
|
NVCC_SRC = $(filter %.cu.cpp, $(SRC))
|
||||||
|
GCC_OBJ = $(GCC_SRC:$(SRC_DIR)/%.cpp=%.o)
|
||||||
|
NVCC_OBJ = $(NVCC_SRC:$(SRC_DIR)/%.cpp=%.o)
|
||||||
|
|
||||||
|
# If compiling for CPU, all go to GCC. Otherwise, they are split.
|
||||||
|
CPU_OBJ = $(addprefix $(CPU_BUILD_DIR)/,$(GCC_OBJ)) $(addprefix $(CPU_BUILD_DIR)/,$(NVCC_OBJ))
|
||||||
|
GPU_GCC_OBJ = $(addprefix $(GPU_BUILD_DIR)/,$(GCC_OBJ))
|
||||||
|
GPU_NVCC_OBJ = $(addprefix $(GPU_BUILD_DIR)/,$(NVCC_OBJ))
|
||||||
|
|
||||||
|
# $(info $$GCC_SRC is [${GCC_SRC}])
|
||||||
|
# $(info $$NVCC_SRC is [${NVCC_SRC}])
|
||||||
|
# $(info $$GCC_OBJ is [${GCC_OBJ}])
|
||||||
|
# $(info $$NVCC_OBJ is [${NVCC_OBJ}])
|
||||||
|
|
||||||
|
# $(info $$CPU_OBJ is [${CPU_OBJ}])
|
||||||
|
# $(info $$GPU_GCC_OBJ is [${GPU_GCC_OBJ}])
|
||||||
|
# $(info $$GPU_NVCC_OBJ is [${GPU_NVCC_OBJ}])
|
||||||
|
|
||||||
|
HEADER = $(wildcard $(SRC_DIR)/*/*.h) $(wildcard $(SRC_DIR)/*.h)
|
||||||
|
CPU_DEPS = $(wildcard $(CPU_BUILD_DIR)/*.d)
|
||||||
|
GPU_DEPS = $(wildcard $(GPU_BUILD_DIR)/*.d)
|
||||||
|
|
||||||
|
INC := $(INCLUDE:%=-I%)
|
||||||
|
LIB := $(LIBS_DIR:%=-L%)
|
||||||
|
LIB_GPU := $(LIBS_DIR_GPU:%=-L%)
|
||||||
|
LD := $(LIBS:%=-l%)
|
||||||
|
LD_GPU := $(LIBS_GPU:%=-l%)
|
||||||
|
|
||||||
|
# Reminder:
|
||||||
|
# $< = first prerequisite
|
||||||
|
# $@ = the target which matched the rule
|
||||||
|
# $^ = all prerequisites
|
||||||
|
|
||||||
|
.PHONY: all clean |
||||||
|
|
||||||
|
all : cpu gpu |
||||||
|
|
||||||
|
cpu: $(TARGET)CPU |
||||||
|
gpu: $(TARGET)GPU |
||||||
|
|
||||||
|
$(TARGET)CPU: $(CPU_OBJ) |
||||||
|
$(CC) $(CFLAGS) $^ -o $@ $(INC) $(LIB) $(LDFLAGS)
|
||||||
|
|
||||||
|
$(CPU_BUILD_DIR)/%.o $(CPU_BUILD_DIR)/%.cu.o: $(SRC_DIR)/%.cpp | $(CPU_BUILD_DIR) |
||||||
|
$(CC) $(CFLAGS) -c -o $@ $< $(INC)
|
||||||
|
|
||||||
|
# For GPU, we need to build the NVCC objects, the NVCC linked object, and the
|
||||||
|
# regular ones. Then, we link them all together.
|
||||||
|
$(TARGET)GPU: $(GPU_BUILD_DIR)/link.o $(GPU_GCC_OBJ) | $(GPU_BUILD_DIR) |
||||||
|
$(CC) -g -DCUDA $(CFLAGS) $(GPU_NVCC_OBJ) $^ -o $@ $(INC) $(LIB) $(LIB_GPU) $(LD) $(LD_GPU)
|
||||||
|
|
||||||
|
$(GPU_BUILD_DIR)/link.o: $(GPU_NVCC_OBJ) | $(GPU_BUILD_DIR) |
||||||
|
$(NVCC) --device-link $^ -o $@
|
||||||
|
|
||||||
|
$(GPU_BUILD_DIR)/%.cu.o: $(SRC_DIR)/%.cu.cpp | $(GPU_BUILD_DIR) |
||||||
|
$(NVCC) $(NVCC_FLAGS) -DCUDA -x cu --device-c -o $@ $< $(INC)
|
||||||
|
|
||||||
|
$(GPU_BUILD_DIR)/%.o: $(SRC_DIR)/%.cpp | $(GPU_BUILD_DIR) |
||||||
|
$(CC) $(CFLAGS) -g -DCUDA -c -o $@ $< $(INC)
|
||||||
|
|
||||||
|
-include $(CPU_DEPS) |
||||||
|
-include $(GPU_DEPS) |
||||||
|
|
||||||
|
$(CPU_BUILD_DIR): |
||||||
|
mkdir -p $@
|
||||||
|
|
||||||
|
$(GPU_BUILD_DIR): |
||||||
|
mkdir -p $@
|
||||||
|
|
||||||
|
clean: |
||||||
|
rm -Rf $(BUILD_DIR) $(TARGET)CPU $(TARGET)GPU
|
@ -0,0 +1,12 @@
#define CUDATOOLS_IMPLEMENTATION
#include <Core.h>

DEFINE_KERNEL(add, int x, int y) {
    printf("Kernel: %i\n", x + y);
}

int main() {
    KERNEL(add, CudaTools::Kernel::basic(1), 1, 1); // Prints 2.
    return 0;
}
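
// A slightly larger launch, as a sketch of the same macros: basic(8) should request
// eight threads, so each one would run the kernel body, and .wait() (used in the
// other examples here) blocks until the device work has finished.
// KERNEL(add, CudaTools::Kernel::basic(8), 2, 3).wait(); // Each thread prints 5.
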
@ -0,0 +1,95 @@
CC := g++-10
NVCC := nvcc
CFLAGS := -Wall -std=c++17 -fopenmp -MMD
NVCC_FLAGS := -MMD -w -Xcompiler

INCLUDE := ../../
LIBS_DIR :=
LIBS_DIR_GPU := /usr/local/cuda/lib64
LIBS :=
LIBS_GPU := cuda cudart cublas

TARGET = coreClass
SRC_DIR = .
BUILD_DIR = build

# Should not need to modify below.

CPU_BUILD_DIR = $(BUILD_DIR)/cpu
GPU_BUILD_DIR = $(BUILD_DIR)/gpu

SRC = $(wildcard $(SRC_DIR)/*/*.cpp) $(wildcard $(SRC_DIR)/*.cpp)

# Get source files and object files.
GCC_SRC = $(filter-out %.cu.cpp ,$(SRC))
NVCC_SRC = $(filter %.cu.cpp, $(SRC))
GCC_OBJ = $(GCC_SRC:$(SRC_DIR)/%.cpp=%.o)
NVCC_OBJ = $(NVCC_SRC:$(SRC_DIR)/%.cpp=%.o)

# If compiling for CPU, all go to GCC. Otherwise, they are split.
CPU_OBJ = $(addprefix $(CPU_BUILD_DIR)/,$(GCC_OBJ)) $(addprefix $(CPU_BUILD_DIR)/,$(NVCC_OBJ))
GPU_GCC_OBJ = $(addprefix $(GPU_BUILD_DIR)/,$(GCC_OBJ))
GPU_NVCC_OBJ = $(addprefix $(GPU_BUILD_DIR)/,$(NVCC_OBJ))

# $(info $$GCC_SRC is [${GCC_SRC}])
# $(info $$NVCC_SRC is [${NVCC_SRC}])
# $(info $$GCC_OBJ is [${GCC_OBJ}])
# $(info $$NVCC_OBJ is [${NVCC_OBJ}])

# $(info $$CPU_OBJ is [${CPU_OBJ}])
# $(info $$GPU_GCC_OBJ is [${GPU_GCC_OBJ}])
# $(info $$GPU_NVCC_OBJ is [${GPU_NVCC_OBJ}])

HEADER = $(wildcard $(SRC_DIR)/*/*.h) $(wildcard $(SRC_DIR)/*.h)
CPU_DEPS = $(wildcard $(CPU_BUILD_DIR)/*.d)
GPU_DEPS = $(wildcard $(GPU_BUILD_DIR)/*.d)

INC := $(INCLUDE:%=-I%)
LIB := $(LIBS_DIR:%=-L%)
LIB_GPU := $(LIBS_DIR_GPU:%=-L%)
LD := $(LIBS:%=-l%)
LD_GPU := $(LIBS_GPU:%=-l%)

# Reminder:
# $< = first prerequisite
# $@ = the target which matched the rule
# $^ = all prerequisites

.PHONY: all clean

all: cpu gpu

cpu: $(TARGET)CPU
gpu: $(TARGET)GPU

$(TARGET)CPU: $(CPU_OBJ)
	$(CC) $(CFLAGS) $^ -o $@ $(INC) $(LIB) $(LDFLAGS)

$(CPU_BUILD_DIR)/%.o $(CPU_BUILD_DIR)/%.cu.o: $(SRC_DIR)/%.cpp | $(CPU_BUILD_DIR)
	$(CC) $(CFLAGS) -c -o $@ $< $(INC)

# For GPU, we need to build the NVCC objects, the NVCC linked object, and the
# regular ones. Then, we link them all together.
$(TARGET)GPU: $(GPU_BUILD_DIR)/link.o $(GPU_GCC_OBJ) | $(GPU_BUILD_DIR)
	$(CC) -g -DCUDA $(CFLAGS) $(GPU_NVCC_OBJ) $^ -o $@ $(INC) $(LIB) $(LIB_GPU) $(LD) $(LD_GPU)

$(GPU_BUILD_DIR)/link.o: $(GPU_NVCC_OBJ) | $(GPU_BUILD_DIR)
	$(NVCC) --device-link $^ -o $@

$(GPU_BUILD_DIR)/%.cu.o: $(SRC_DIR)/%.cu.cpp | $(GPU_BUILD_DIR)
	$(NVCC) $(NVCC_FLAGS) -DCUDA -x cu --device-c -o $@ $< $(INC)

$(GPU_BUILD_DIR)/%.o: $(SRC_DIR)/%.cpp | $(GPU_BUILD_DIR)
	$(CC) $(CFLAGS) -g -DCUDA -c -o $@ $< $(INC)

-include $(CPU_DEPS)
-include $(GPU_DEPS)

$(CPU_BUILD_DIR):
	mkdir -p $@

$(GPU_BUILD_DIR):
	mkdir -p $@

clean:
	rm -Rf $(BUILD_DIR) $(TARGET)CPU $(TARGET)GPU
@ -0,0 +1,34 @@
#define CUDATOOLS_IMPLEMENTATION
#include <Core.h>

class intPair {
    DEVICE_CLASS(intPair)
  public:
    int x, y;

    intPair(const int x_, const int y_) : x(x_), y(y_) {
        allocateDevice();      // Allocates memory for this intPair on the device.
        updateDevice().wait(); // Copies the memory on the host to the device and waits until finished.
    };

    HD void swap() {
        int swap = x;
        x = y;
        y = swap;
    };
};

DEFINE_KERNEL(swap, intPair* const pair) { pair->swap(); }
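
// Note: pair.that() in main() below hands the kernel the device-side copy of the
// object (presumably generated by DEVICE_CLASS alongside allocateDevice/updateDevice);
// after the kernel has modified that copy, updateHost() copies the values back.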

int main() {
    intPair pair(1, 2);
    printf("Before: %i, %i\n", pair.x, pair.y); // Prints 1, 2.

    KERNEL(swap, CudaTools::Kernel::basic(1), pair.that()).wait();
    pair.updateHost().wait(); // Copies the memory from the device back to the host and waits until finished.

    printf("After: %i, %i\n", pair.x, pair.y); // Prints 2, 1.
    return 0;
}
@ -0,0 +1,494 @@
#define CUDATOOLS_IMPLEMENTATION
#define CUDATOOLS_ARRAY_MAX_AXES 8
#include "Array.h"
#include "BLAS.h"
#include "Core.h"

#include <Eigen/Core>
#include <chrono>
#include <complex>

namespace CT = CudaTools;

/////////////
// Helpers //
/////////////

#define TIME_START(name) auto begin_##name = std::chrono::steady_clock::now()

#define TIME_END(name) \
    auto end_##name = std::chrono::steady_clock::now(); \
    auto time_ms_##name = \
        std::chrono::duration_cast<std::chrono::milliseconds>(end_##name - begin_##name).count(); \
    auto time_mus_##name = \
        std::chrono::duration_cast<std::chrono::microseconds>(end_##name - begin_##name).count(); \
    if (time_ms_##name == 0) { \
        printf("[%s] Time Elapsed: %ld[µs]\n", #name, time_mus_##name); \
    } else { \
        printf("[%s] Time Elapsed: %ld[ms]\n", #name, time_ms_##name); \
    }

#define TIME(call, name) \
    TIME_START(name); \
    call; \
    TIME_END(name);
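
// Usage sketch for the timing macro above, with a hypothetical label:
//   TIME(doArrayTests<double>(), ArrayDouble);
// expands to TIME_START/TIME_END around the call and prints something like
// "[ArrayDouble] Time Elapsed: 12[ms]".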
#define TEST(predicate, name, msg) \
    failed += (predicate) ? 0 : 1; \
    printf("[%s] ", (predicate) ? "\033[1;32mPASS\033[0m" : "\033[1;31mFAIL\033[0m"); \
    printf("%s | %s.\n", name, msg);

template <typename T> struct Type;

#define REGISTER_PARSE_TYPE(X) \
    template <> struct Type<X> { static const std::string name; }; \
    const std::string Type<X>::name = #X

REGISTER_PARSE_TYPE(uint8_t);
REGISTER_PARSE_TYPE(int16_t);
REGISTER_PARSE_TYPE(int32_t);
REGISTER_PARSE_TYPE(float);
REGISTER_PARSE_TYPE(double);

std::string box(std::string str) {
    std::string tops(str.size() + 6, '#');
    return tops + "\n## " + str + " ##\n" + tops;
}

std::string box2(std::string str) {
    std::string tops(str.size() - 5, '-');
    return tops + "\n|| " + str + " ||\n" + tops;
}

std::string boxSmall(std::string str) {
    std::string tops(6, '-');
    return tops + "[ " + str + " ]" + tops;
}

std::string separator() {
    std::string line(40, '=');
    return "\n" + line + "\n";
}

template <typename T> std::string type() { return "\033[1;96m" + Type<T>::name + "\033[0m"; }

CT::Shape makeRandom2DShape() {
    std::random_device rd;
    std::mt19937 mt(rd());
    std::uniform_int_distribution<uint32_t> dist(1, 15);
    return CT::Shape({dist(mt), dist(mt)});
}

///////////
// Tests //
///////////

class TestClass {
    DEVICE_CLASS(TestClass);

  public:
    int x;
    TestClass(const int x) : x(x) {
        allocateDevice();
        updateDevice().wait();
    };
};

DEFINE_KERNEL(times, const CT::Array<int> arr) {
    BASIC_LOOP(arr.shape().length()) { arr[iThread] *= 2; }
}
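
// BASIC_LOOP(n) appears to come from the CudaTools macros: it provides the
// per-thread index iThread and guards it against n, so the kernel above doubles
// each of the n array entries exactly once.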

DEFINE_KERNEL(classTest, TestClass* const test) { test->x = 100; }

struct MacroTests {
    static uint32_t Kernel() {
        uint32_t failed = 0;
        CT::Array<int> A = CT::Array<int>::constant({10}, 1);
        A.updateDevice().wait();
        KERNEL(times, CT::Kernel::basic(A.shape().items()), A.view()).wait();
        A.updateHost().wait();

        uint32_t errors = 0;
        for (auto it = A.begin(); it != A.end(); ++it) {
            if (*it != 2) ++errors;
        }

        std::ostringstream msg;
        msg << "Errors: " << errors;
        TEST(errors == 0, "Kernel", msg.str().c_str());
        return failed;
    };

    static uint32_t Class() {
        uint32_t failed = 0;
        TestClass test(1);
        KERNEL(classTest, CT::Kernel::basic(1), test.that()).wait();
        test.updateHost().wait();

        TEST(test.x == 100, "Class", "Errors: 0");
        return failed;
    }
};

template <typename T> struct ArrayTests {
    static uint32_t Indexing() {
        uint32_t failed = 0;
        CT::Array<T> A = CT::Array<T>::range(0, 240);
        A.reshape({5, 3, 1, 4, 2, 1, 1, 2});

        uint32_t errors = 0;
        for (uint32_t i = 0; i < 5; ++i) {
            for (uint32_t j = 0; j < 3; ++j) {
                for (uint32_t k = 0; k < 4; ++k) {
                    for (uint32_t l = 0; l < 2; ++l) {
                        for (uint32_t m = 0; m < 2; ++m) {
                            if ((T)A[i][j][0][k][l][0][0][m] != (T)A[{i, j, 0, k, l, 0, 0, m}]) {
                                ++errors;
                            }
                        }
                    }
                }
            }
        }

        std::ostringstream msg;
        msg << "Errors: " << errors;
        TEST(errors == 0, "Element", msg.str().c_str());

        errors = 0;
        CT::Array<T> ApartGroup_1 = A[{2, 2}];
        CT::Array<T> ApartIndiv_1 = A[2][2];
        for (uint32_t k = 0; k < 4; ++k) {
            for (uint32_t l = 0; l < 2; ++l) {
                for (uint32_t m = 0; m < 2; ++m) {
                    if ((T)ApartIndiv_1[0][k][l][0][0][m] != (T)ApartGroup_1[{0, k, l, 0, 0, m}]) {
                        ++errors;
                    }
                }
            }
        }

        msg.str("");
        msg << "Errors: " << errors;
        TEST(errors == 0, "Axis (1/2)", msg.str().c_str());

        errors = 0;
        CT::Array<T> ApartGroup_2 = A[{3, 2, 0, 3}];
        CT::Array<T> ApartIndiv_2 = A[3][2][0][3];

        for (uint32_t l = 0; l < 2; ++l) {
            for (uint32_t m = 0; m < 2; ++m) {
                if ((T)ApartIndiv_2[l][0][0][m] != (T)ApartGroup_2[{l, 0, 0, m}]) {
                    ++errors;
                }
            }
        }

        msg.str("");
        msg << "Errors: " << errors;
        TEST(errors == 0, "Axis (2/2)", msg.str().c_str());
        return failed;
    };

    static uint32_t Slicing() {
        uint32_t failed = 0;
        CT::Array<T> A = CT::Array<T>::constant({4, 5, 5}, 0);

        CT::Array<T> Aslice = A.slice({{0, 3}, {1, 4}, {1, 4}});
        T num = (T)1;
        for (auto it = Aslice.begin(); it != Aslice.end(); ++it) {
            *it = num;
            ++num;
        }

        CT::Array<T> Aslice2 = A[3].slice({{0, 5}, {0, 1}});
        num = (T)-1;
        for (auto it = Aslice2.begin(); it != Aslice2.end(); ++it) {
            *it = num;
            --num;
        }

        uint32_t errors = 0;
        for (int i = 0; i < 3; ++i) {
            for (int j = 0; j < 3; ++j) {
                for (int k = 0; k < 3; ++k) {
                    if ((T)A[i][1 + j][1 + k] != (T)(9 * i + 3 * j + k + 1)) {
                        ++errors;
                    }
                }
            }
        }
        std::ostringstream msg;
        msg << "Errors: " << errors;
        TEST(errors == 0, "Block", msg.str().c_str());

        errors = 0;
        for (int i = 0; i < 5; ++i) {
            if ((T)A[3][i][0] != (T)(-(i + 1))) {
                ++errors;
            }
        }

        msg.str("");
        msg << "Errors: " << errors;
        TEST(errors == 0, "Column", msg.str().c_str());
        return failed;
    }
};

template <typename T> struct BLASTests {
    static double thres;
    static uint32_t GEMV(int attempts) {
        uint32_t failed = 0;
        for (int i = 0; i < attempts; i++) {
            CT::Shape Ashape = makeRandom2DShape();
            CT::Shape xshape = CT::Shape({Ashape.cols(), 1});
            CT::Shape yshape = CT::Shape({Ashape.rows(), 1});

            CT::Array<T> A(Ashape);
            CT::Array<T> x(xshape);
            CT::Array<T> y(yshape);

            A.setRandom(-100, 100);
            x.setRandom(-100, 100);

            A.updateDevice();
            x.updateDevice().wait();

            CT::BLAS::GEMV<T>(1.0, A, x, 0.0, y).wait();
            y.updateHost().wait();

            CT::Array<T> yTest(yshape, true);
            yTest.eigenMap() = A.eigenMap() * x.eigenMap();

            double norm = (yTest.eigenMap() - y.eigenMap()).norm();

            std::ostringstream name;
            name << "GEMV (" << i + 1 << "/" << attempts << ")";
            std::ostringstream msg;
            msg << "Matrix Shape: " << Ashape << ", "
                << "Residual: " << norm;
            TEST(norm < thres, name.str().c_str(), msg.str().c_str());
        }
        return failed;
    };

    static uint32_t GEMVBroadcast() {
        uint32_t failed = 0;
        CT::Shape Ashape = makeRandom2DShape();
        CT::Shape xshape = CT::Shape({Ashape.cols(), 1});
        CT::Shape yshape = CT::Shape({Ashape.rows(), 1});

        CT::Array<T> A({2, 3, Ashape.rows(), Ashape.cols()});
        CT::Array<T> x({2, 3, xshape.rows(), xshape.cols()});
        CT::Array<T> y({2, 3, yshape.rows(), yshape.cols()});

        A.setRandom(-100, 100);
        x.setRandom(-100, 100);

        A.updateDevice();
        x.updateDevice().wait();

        CT::BLAS::GEMV<T>(1.0, A, x, 0.0, y).wait();
        y.updateHost().wait();

        double norm = 0;
        CT::Array<T> yTest(yshape, true);
        for (int i = 0; i < 2; ++i) {
            for (int j = 0; j < 3; ++j) {
                yTest.eigenMap() = A[i][j].eigenMap() * x[i][j].eigenMap();
                norm += (yTest.eigenMap() - y[i][j].eigenMap()).norm();
            }
        }

        std::ostringstream msg;
        msg << "Matrix Shape: " << Ashape << ", "
            << "Residual: " << norm;
        TEST(norm < thres, "GEMV Broadcast", msg.str().c_str());
        return failed;
    };

    static uint32_t GEMM(int attempts) {
        uint32_t failed = 0;
        for (int i = 0; i < attempts; i++) {
            CT::Shape Ashape = makeRandom2DShape();
            CT::Shape Bshape = makeRandom2DShape();
            Bshape = CT::Shape({Ashape.cols(), Bshape.cols()});

            CT::Shape Cshape = CT::Shape({Ashape.rows(), Bshape.cols()});

            CT::Array<T> A(Ashape);
            CT::Array<T> B(Bshape);
            CT::Array<T> C(Cshape);

            A.setRandom(-100, 100);
            B.setRandom(-100, 100);
            C.setRandom(-100, 100);

            A.updateDevice();
            B.updateDevice();
            C.updateDevice().wait();

            CT::BLAS::GEMM<T>(1.0, A, B, 0.0, C).wait();
            C.updateHost().wait();

            CT::Array<T> CTest(Cshape, true);
            CTest.eigenMap() = A.eigenMap() * B.eigenMap();

            double norm = (CTest.eigenMap() - C.eigenMap()).norm();

            std::ostringstream name;
            name << "GEMM (" << i + 1 << "/" << attempts << ")";
            std::ostringstream msg;
            msg << "Matrix Shapes: " << Ashape << Bshape << ", "
                << "Residual: " << norm;
            TEST(norm < thres, name.str().c_str(), msg.str().c_str());
        }
        return failed;
    };

    static uint32_t GEMMBroadcast() {
        uint32_t failed = 0;
        CT::Shape Ashape = makeRandom2DShape();
        CT::Shape Bshape = makeRandom2DShape();
        Bshape = CT::Shape({Ashape.cols(), Bshape.cols()});

        CT::Shape Cshape = CT::Shape({Ashape.rows(), Bshape.cols()});

        CT::Array<T> A({2, 3, Ashape.rows(), Ashape.cols()});
        CT::Array<T> B({2, 3, Bshape.rows(), Bshape.cols()});
        CT::Array<T> C({2, 3, Cshape.rows(), Cshape.cols()});

        A.setRandom(-100, 100);
        B.setRandom(-100, 100);

        A.updateDevice();
        B.updateDevice();
        C.updateDevice().wait();

        CT::BLAS::GEMM<T>(1.0, A, B, 0.0, C).wait();
        C.updateHost().wait();

        double norm = 0;
        CT::Array<T> CTest(Cshape, true);
        for (int i = 0; i < 2; ++i) {
            for (int j = 0; j < 3; ++j) {
                CTest.eigenMap() = A[i][j].eigenMap() * B[i][j].eigenMap();
                norm += (CTest.eigenMap() - C[i][j].eigenMap()).norm();
            }
        }

        std::ostringstream msg;
        msg << "Matrix Shapes: " << Ashape << Bshape << ", "
            << "Residual: " << norm;
        TEST(norm < thres, "GEMM Broadcast", msg.str().c_str());
        return failed;
    };

    static uint32_t PLU() {
        uint32_t failed = 0;
        CT::Shape Ashape = makeRandom2DShape();
        CT::Shape xshape = makeRandom2DShape();
        Ashape = CT::Shape({Ashape.rows(), Ashape.rows()});
        xshape = CT::Shape({Ashape.rows(), xshape.cols()});

        CT::Array<T> A({2, 3, Ashape.rows(), Ashape.rows()});
        CT::Array<T> x({2, 3, xshape.rows(), xshape.cols()});
        CT::Array<T> b({2, 3, xshape.rows(), xshape.cols()});
        CT::Array<T> Ax({2, 3, xshape.rows(), xshape.cols()});

        A.setRandom(-100, 100);
        b.setRandom(-100, 100);

        CT::Array<T> LU(A.copy());
        x = b;

        A.updateDevice();
        LU.updateDevice();
        x.updateDevice().wait();

        CT::BLAS::PLUBatch<T> luBatch(LU);
        CT::BLAS::Batch<T> xBatch(x);
        luBatch.computeLU().wait();
        luBatch.solve(xBatch).wait();

        // Compute Ax and compare difference.
        CT::BLAS::GEMM<T>(1.0, A, x, 0.0, Ax).wait();
        Ax.updateHost().wait();

        double norm = 0;
        for (int i = 0; i < 2; ++i) {
            for (int j = 0; j < 3; ++j) {
                norm += (Ax[i][j].eigenMap() - b[i][j].eigenMap()).norm();
            }
        }

        std::ostringstream msg;
        msg << "Matrix Shape: " << Ashape << xshape << ", "
            << "Residual: " << norm;
        TEST(norm < thres, "PLU/Solve", msg.str().c_str());
        return failed;
    }
};

template <> double BLASTests<float>::thres = 10e-1;
template <> double BLASTests<double>::thres = 10e-8;
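
// The float tolerance is much looser than the double one: the test matrices hold
// values up to 100 with inner dimensions up to 15, so single-precision GEMM/GEMV
// accumulation error can plausibly reach the 1e-1 range, while the double-precision
// residual stays orders of magnitude smaller.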

uint32_t doMacroTests() {
    uint32_t failed = 0;
    failed += MacroTests::Kernel();
    failed += MacroTests::Class();
    std::cout << "\n";
    return failed;
}

template <typename T> uint32_t doArrayTests() {
    uint32_t failed = 0;
    std::cout << boxSmall("Index Tests : " + type<T>()) << "\n";
    failed += ArrayTests<T>::Indexing();
    std::cout << "\n" << boxSmall("Slice Tests : " + type<T>()) << "\n";
    failed += ArrayTests<T>::Slicing();
    std::cout << "\n";
    return failed;
}

template <typename T> uint32_t doBLASTests() {
    uint32_t failed = 0;
    std::cout << boxSmall("GEMV Tests : " + type<T>()) << "\n";
    failed += BLASTests<T>::GEMV(5);
    failed += BLASTests<T>::GEMVBroadcast();

    std::cout << "\n" << boxSmall("GEMM Tests : " + type<T>()) << "\n";
    failed += BLASTests<T>::GEMM(5);
    failed += BLASTests<T>::GEMMBroadcast();

    std::cout << "\n" << boxSmall("PLU Tests : " + type<T>()) << "\n";
    failed += BLASTests<T>::PLU();
    std::cout << "\n";
    return failed;
}

int main() {
    uint32_t failed = 0;
    std::cout << box("Macro Tests") << "\n";
    failed += doMacroTests();

    std::cout << box("Array Tests") << "\n";
    // Test different sizes.
    failed += doArrayTests<uint8_t>();
    failed += doArrayTests<int16_t>();
    failed += doArrayTests<int32_t>();
    failed += doArrayTests<double>();

    std::cout << box("BLAS Tests") << "\n";
    failed += doBLASTests<float>();
    failed += doBLASTests<double>();

    constexpr uint32_t tests = 2 + 4 * 5 + 13 * 2;
    std::ostringstream msg;
    msg << ((failed == 0) ? "\033[1;32mPASS \033[0m(" : "\033[1;31mFAIL \033[0m(")
        << (tests - failed) << "/" << tests << ")";
    std::cout << box2(msg.str()) << "\n";

    return 0;
}