commit
						b4e4a49d44
					
				
				 26 changed files with 6120 additions and 0 deletions
			
			
		@ -0,0 +1,10 @@ | 
				
			||||
--- | 
				
			||||
BasedOnStyle: LLVM | 
				
			||||
IndentWidth: 4 | 
				
			||||
ColumnLimit: 100 | 
				
			||||
AllowShortIfStatementsOnASingleLine: true | 
				
			||||
--- | 
				
			||||
Language: Cpp | 
				
			||||
DerivePointerAlignment: false | 
				
			||||
PointerAlignment: Left | 
				
			||||
--- | 
				
			||||
@ -0,0 +1,4 @@ | 
				
			||||
build | 
				
			||||
*CPU | 
				
			||||
*GPU | 
				
			||||
.venv | 
				
			||||
@ -0,0 +1,777 @@ | 
				
			||||
#ifndef ARRAY_H | 
				
			||||
#define ARRAY_H | 
				
			||||
 | 
				
			||||
#include "Core.h" | 
				
			||||
#include "Macros.h" | 
				
			||||
#include <Eigen/Dense> | 
				
			||||
#include <iomanip> | 
				
			||||
#include <math.h> | 
				
			||||
#include <random> | 
				
			||||
#include <type_traits> | 
				
			||||
 | 
				
			||||
#ifdef DEVICE | 
				
			||||
#define POINTER pDevice | 
				
			||||
#else | 
				
			||||
#define POINTER pHost | 
				
			||||
#endif | 
				
			||||
 | 
				
			||||
namespace CudaTools { | 
				
			||||
 | 
				
			||||
template <typename T> | 
				
			||||
using EigenMat = Eigen::Matrix<T, Eigen::Dynamic, Eigen::Dynamic, Eigen::ColMajor>; | 
				
			||||
template <typename T> using EigenMapMat = Eigen::Map<EigenMat<T>>; | 
				
			||||
template <typename T> using ConstEigenMapMat = Eigen::Map<const EigenMat<T>>; | 
				
			||||
 | 
				
			||||
template <typename T> struct EigenAdaptConst { typedef EigenMapMat<T> type; }; | 
				
			||||
template <typename T> struct EigenAdaptConst<const T> { typedef ConstEigenMapMat<T> type; }; | 
				
			||||
 | 
				
			||||
// Type-trait helper macros used to constrain templates on the element type.
// Each expansion is wrapped in parentheses so that it behaves as a single operand
// when combined with other operators, e.g. `not IS_NUM(T)` negates the whole
// "integral or floating point" test rather than only its first half.
#define ENABLE_IF(X) std::enable_if_t<(X), bool>
#define IS_INT(T) (std::is_integral<T>::value)
#define IS_FLOAT(T) (std::is_floating_point<T>::value)
#define IS_NUM(T) (IS_INT(T) or IS_FLOAT(T))
				
			||||
 | 
				
			||||
template <typename T> class Array; | 
				
			||||
using Slice = std::pair<uint32_t, uint32_t>; | 
				
			||||
 | 
				
			||||
template <typename T> class ArrayIterator { | 
				
			||||
  private: | 
				
			||||
    template <typename U> | 
				
			||||
    friend std::ostream& operator<<(std::ostream& out, const ArrayIterator<U>& it); | 
				
			||||
    T* pData; | 
				
			||||
    Shape mShape; | 
				
			||||
    uint32_t mIndices[CUDATOOLS_ARRAY_MAX_AXES] = {0}; | 
				
			||||
 | 
				
			||||
  public: | 
				
			||||
    HD ArrayIterator(T* p, const Shape& shape) : pData(p), mShape(shape){}; | 
				
			||||
 | 
				
			||||
    /**
 | 
				
			||||
     * Moves the iterator to the next value. | 
				
			||||
     */ | 
				
			||||
    HD void next() { | 
				
			||||
        bool carry = false; | 
				
			||||
        uint32_t offset = 0; | 
				
			||||
        for (uint32_t iAxis = mShape.axes() - 1; iAxis < mShape.axes(); --iAxis) { | 
				
			||||
            if (mIndices[iAxis] == mShape.dim(iAxis) - 1) { | 
				
			||||
                mIndices[iAxis] = 0; | 
				
			||||
                offset += mShape.stride(iAxis) * (mShape.dim(iAxis) - 1); | 
				
			||||
                carry = true; | 
				
			||||
            } else { | 
				
			||||
                pData += mShape.stride(iAxis); | 
				
			||||
                mIndices[iAxis] += 1; | 
				
			||||
                carry = false; | 
				
			||||
            } | 
				
			||||
 | 
				
			||||
            if (not carry) { | 
				
			||||
                pData -= offset; | 
				
			||||
                return; | 
				
			||||
            } | 
				
			||||
        } | 
				
			||||
        pData += 1; // "Overflow" occured, so we reached end of array.
 | 
				
			||||
    } | 
				
			||||
 | 
				
			||||
    /**
 | 
				
			||||
     * Moves the iterator to the previous value. | 
				
			||||
     */ | 
				
			||||
    HD void prev() { | 
				
			||||
        bool carry = false; | 
				
			||||
        uint32_t offset = 0; | 
				
			||||
        for (uint32_t iAxis = mShape.axes() - 1; iAxis < mShape.axes(); --iAxis) { | 
				
			||||
            if (mIndices[iAxis] == 0) { | 
				
			||||
                mIndices[iAxis] = mShape.dim(iAxis) - 1; | 
				
			||||
                offset += mShape.stride(iAxis) * (mShape.dim(iAxis) - 1); | 
				
			||||
                carry = true; | 
				
			||||
            } else { | 
				
			||||
                pData -= mShape.stride(iAxis); | 
				
			||||
                mIndices[iAxis] += 1; | 
				
			||||
                carry = false; | 
				
			||||
            } | 
				
			||||
            if (not carry) { | 
				
			||||
                pData += offset; | 
				
			||||
                return; | 
				
			||||
            } | 
				
			||||
        } | 
				
			||||
        pData -= 1; | 
				
			||||
    } | 
				
			||||
 | 
				
			||||
    /**
 | 
				
			||||
     * Moves the iterator a specified value away. | 
				
			||||
     * \param amount the amount to advance by | 
				
			||||
     */ | 
				
			||||
    HD void advance(const int32_t amount) { | 
				
			||||
        if (amount < 0) { | 
				
			||||
            for (uint32_t i = 0; i < abs(amount); ++i) { | 
				
			||||
                prev(); | 
				
			||||
            } | 
				
			||||
        } else { | 
				
			||||
            for (uint32_t i = 0; i < abs(amount); ++i) { | 
				
			||||
                next(); | 
				
			||||
            } | 
				
			||||
        } | 
				
			||||
    } | 
				
			||||
 | 
				
			||||
    HD void operator++() { next(); }; /**< Prefix increment operator. */ | 
				
			||||
    HD void operator--() { prev(); }; /**< Prefix decrement operator. */ | 
				
			||||
 | 
				
			||||
    /**< Addition operator. */ | 
				
			||||
    HD ArrayIterator<T> operator+(const int32_t v) const { | 
				
			||||
        ArrayIterator<T> it = *this; | 
				
			||||
        it.advance(v); | 
				
			||||
        return it; | 
				
			||||
    }; | 
				
			||||
 | 
				
			||||
    /** Subtraction operator.*/ | 
				
			||||
    HD ArrayIterator<T> operator-(const int32_t v) const { | 
				
			||||
        ArrayIterator<T> it = *this; | 
				
			||||
        it.advance(-v); | 
				
			||||
        return it; | 
				
			||||
    }; | 
				
			||||
    HD void operator+=(const int32_t v) { advance(v); }; | 
				
			||||
    HD void operator-=(const int32_t v) { advance(-v); }; | 
				
			||||
 | 
				
			||||
    HD T& operator*() { return *pData; };             /**< Dereference operator. */ | 
				
			||||
    HD const T& operator*() const { return *pData; }; /**< Const dereference operator. */ | 
				
			||||
 | 
				
			||||
    /**
 | 
				
			||||
     * Equals operator. | 
				
			||||
     */ | 
				
			||||
    HD bool operator==(const ArrayIterator<T>& it) { return pData == it.pData; } | 
				
			||||
 | 
				
			||||
    /**
 | 
				
			||||
     * Not equals operator. | 
				
			||||
     */ | 
				
			||||
    HD bool operator!=(const ArrayIterator<T>& it) { return pData != it.pData; } | 
				
			||||
}; | 
				
			||||
 | 
				
			||||
template <typename T> std::ostream& operator<<(std::ostream& out, const ArrayIterator<T>& it) { | 
				
			||||
    return out << it.pData; | 
				
			||||
} | 
				
			||||
 | 
				
			||||
template <typename T> class ArrayLoader { | 
				
			||||
  private: | 
				
			||||
    ArrayIterator<T> mIterator; | 
				
			||||
    ArrayIterator<T> mIteratorEnd; | 
				
			||||
 | 
				
			||||
  public: | 
				
			||||
    HD ArrayLoader(const ArrayIterator<T>& it, const ArrayIterator<T>& it_end) | 
				
			||||
        : mIterator(it), mIteratorEnd(it_end){}; | 
				
			||||
    HD ArrayLoader &operator,(const T value) { | 
				
			||||
        CT_ERROR_IF(mIterator, ==, mIteratorEnd, "Cannot assign more values than Array size"); | 
				
			||||
        *mIterator = value; | 
				
			||||
        ++mIterator; | 
				
			||||
        return *this; | 
				
			||||
    } | 
				
			||||
}; | 
				
			||||
 | 
				
			||||
/**
 | 
				
			||||
 * A container that holds a N-dimensional array, stored column major. To set the | 
				
			||||
 * maximum N, there is a compiler macro CUDATOOLS_ARRAY_MAX_AXES whose default value is 4. | 
				
			||||
 * It adapts to operations between host and device to ease memory management. | 
				
			||||
 */ | 
				
			||||
template <typename T> class Array { | 
				
			||||
  private: | 
				
			||||
    template <typename U> friend std::ostream& operator<<(std::ostream&, const Array<U>&); | 
				
			||||
 | 
				
			||||
    Shape mShape; | 
				
			||||
    T* pHost = nullptr; | 
				
			||||
    T* pDevice = nullptr; | 
				
			||||
 | 
				
			||||
    bool mIsView = false; | 
				
			||||
    bool mIsSlice = false; | 
				
			||||
 | 
				
			||||
    uint32_t mEndOffset = 0; | 
				
			||||
 | 
				
			||||
    void freeArrays() { | 
				
			||||
#ifndef DEVICE | 
				
			||||
        if (not mIsView) { | 
				
			||||
            if (pDevice != nullptr) CudaTools::free(pDevice); | 
				
			||||
            if (pHost != nullptr) delete[] pHost; | 
				
			||||
        } | 
				
			||||
#endif | 
				
			||||
    }; | 
				
			||||
 | 
				
			||||
    /**
     * Recomputes mEndOffset from the current shape: the offset of the element with
     * the highest index on every axis, plus one.
     */
    HD void calcEnd() {
        uint32_t lastElement = 0;
        for (uint32_t axis = 0; axis < mShape.axes(); ++axis) {
            lastElement += (mShape.dim(axis) - 1) * mShape.stride(axis);
        }
        mEndOffset = lastElement + 1;
    }
				
			||||
 | 
				
			||||
  public: | 
				
			||||
    HD Array() = default; | 
				
			||||
 | 
				
			||||
    /**
 | 
				
			||||
     * Constructor for an Array that creates an allocates an array with | 
				
			||||
     * the specified Shape. Construction in this format is disabled on the device. | 
				
			||||
     * \brief Host only | 
				
			||||
     * \param shape the shape of the array | 
				
			||||
     * \param noDevice whether to initialize the array on the device | 
				
			||||
     */ | 
				
			||||
    Array(const Shape& shape, const bool noDevice = false) : mShape(shape), mIsView(false) { | 
				
			||||
        pHost = new T[shape.items()]; | 
				
			||||
        calcEnd(); | 
				
			||||
        if (noDevice) return; | 
				
			||||
        pDevice = (T*)CudaTools::malloc(shape.items() * sizeof(T)); | 
				
			||||
    }; | 
				
			||||
 | 
				
			||||
    /**
 | 
				
			||||
     * Constructor for an Array from an existing (preallocated) pointer. | 
				
			||||
     * \param pointer the pointer to use | 
				
			||||
     * \param shape the shape of the array | 
				
			||||
     * \param noDevice whether to initialize the array on the device | 
				
			||||
     */ | 
				
			||||
    HD Array(T* const pointer, const Shape& shape, const bool noDevice = false) | 
				
			||||
        : mShape(shape), mIsView(true), mIsSlice(false) { | 
				
			||||
        POINTER = pointer; | 
				
			||||
        calcEnd(); | 
				
			||||
#ifndef DEVICE | 
				
			||||
        if (noDevice) return; | 
				
			||||
        pDevice = (T*)CudaTools::malloc(shape.items() * sizeof(T)); | 
				
			||||
#endif | 
				
			||||
    }; | 
				
			||||
 | 
				
			||||
    /**
 | 
				
			||||
     * Constructor for making a Array view from another Array, | 
				
			||||
     * given an offset and shape. | 
				
			||||
     * \param arr the original Array | 
				
			||||
     * \param shape the shape of the new array | 
				
			||||
     * \param offset the index where to start the a view of the array | 
				
			||||
     */ | 
				
			||||
    HD Array(const Array& arr, const Shape& shape, const uint32_t offset = 0) | 
				
			||||
        : mShape(shape), pHost(arr.pHost), pDevice(arr.pDevice), mIsView(true), | 
				
			||||
          mIsSlice(arr.mIsSlice) { | 
				
			||||
        calcEnd(); | 
				
			||||
        if (pHost != nullptr) pHost += offset; | 
				
			||||
        if (pDevice != nullptr) pDevice += offset; | 
				
			||||
    }; | 
				
			||||
 | 
				
			||||
    /**
 | 
				
			||||
     * The copy-constructor for a Array. If this is not a view, a deep copy | 
				
			||||
     * of the data will be performed on both host and device. On the device, it is always | 
				
			||||
     * treated like a view. | 
				
			||||
     */ | 
				
			||||
    HD Array(const Array& arr) : mShape(arr.mShape), mIsView(arr.mIsView), mIsSlice(arr.mIsSlice) { | 
				
			||||
        calcEnd(); | 
				
			||||
        if (mIsView) { // If the other array was a view (and now this one), just assign.
 | 
				
			||||
            pHost = arr.pHost; | 
				
			||||
            pDevice = arr.pDevice; | 
				
			||||
            return; | 
				
			||||
        } | 
				
			||||
 | 
				
			||||
        // Otherwise, we assume this is needs to own data.
 | 
				
			||||
        pHost = new T[mShape.items()]; | 
				
			||||
        auto arr_it = arr.begin(); | 
				
			||||
        for (auto it = begin(); it != end(); ++it) { | 
				
			||||
            *it = *arr_it; | 
				
			||||
            ++arr_it; | 
				
			||||
        } | 
				
			||||
 | 
				
			||||
#ifndef DEVICE | 
				
			||||
        if (arr.pDevice != nullptr) { | 
				
			||||
            pDevice = (T*)CudaTools::malloc(mShape.items() * sizeof(T)); | 
				
			||||
        } | 
				
			||||
#endif | 
				
			||||
    }; | 
				
			||||
 | 
				
			||||
    /**
 | 
				
			||||
     *  The move-constructor for a Array. | 
				
			||||
     */ | 
				
			||||
    HD Array(Array&& arr) | 
				
			||||
        : mShape(arr.mShape), pHost(arr.pHost), pDevice(arr.pDevice), mIsView(arr.mIsView), | 
				
			||||
          mIsSlice(arr.mIsSlice) { | 
				
			||||
        calcEnd(); | 
				
			||||
        // Make other object empty.
 | 
				
			||||
        arr.pHost = nullptr; | 
				
			||||
        arr.pDevice = nullptr; | 
				
			||||
        arr.mIsView = true; | 
				
			||||
    }; | 
				
			||||
 | 
				
			||||
    HD ~Array() { freeArrays(); }; | 
				
			||||
 | 
				
			||||
    /**
 | 
				
			||||
     * The copy-assignment operator for a Array. If this is not a view, | 
				
			||||
     * then the currently owned data will be freed, and a deep copy of the data will | 
				
			||||
     * be performed on both host and device. On the device, it is always treated like a view. | 
				
			||||
     */ | 
				
			||||
    HD Array& operator=(const Array& arr) { | 
				
			||||
        if (this == &arr) return *this; | 
				
			||||
 | 
				
			||||
        if (mIsView) { // If this array is a view, we assign data from the right-hand side.
 | 
				
			||||
            auto arr_it = arr.begin(); | 
				
			||||
            for (auto it = begin(); it != end() and arr_it != arr.end(); ++it) { | 
				
			||||
                *it = *arr_it; | 
				
			||||
                ++arr_it; | 
				
			||||
            } | 
				
			||||
            return *this; | 
				
			||||
        } | 
				
			||||
 | 
				
			||||
        // Otherwise, it is implied to be object reassignment.
 | 
				
			||||
        mShape = arr.mShape; | 
				
			||||
        mIsView = arr.mIsView; | 
				
			||||
        mIsSlice = arr.mIsSlice; | 
				
			||||
        calcEnd(); | 
				
			||||
 | 
				
			||||
        // Regardless if the right-hand side is a view, we create a new copy.
 | 
				
			||||
        // In case that the right-hand side is a view of this array, we
 | 
				
			||||
        // allocate memory to copy first. Keep in mind that the right-hand side
 | 
				
			||||
        // array will then become undefined.
 | 
				
			||||
 | 
				
			||||
        // We can only do this on the host.
 | 
				
			||||
#ifndef DEVICE | 
				
			||||
        T* new_pDevice = nullptr; | 
				
			||||
        if (pDevice != nullptr) { | 
				
			||||
            new_pDevice = (T*)CudaTools::malloc(mShape.items() * sizeof(T)); | 
				
			||||
        } | 
				
			||||
 | 
				
			||||
        T* new_pHost = new T[mShape.items()]; | 
				
			||||
        memcpy(new_pHost, arr.pHost, mShape.items() * sizeof(T)); | 
				
			||||
 | 
				
			||||
        freeArrays(); | 
				
			||||
        pHost = new_pHost; | 
				
			||||
        pDevice = new_pDevice; | 
				
			||||
#else | 
				
			||||
        pHost = arr.pHost; | 
				
			||||
        pDevice = arr.pDevice; | 
				
			||||
#endif | 
				
			||||
        return *this; | 
				
			||||
    }; | 
				
			||||
 | 
				
			||||
    /**
 | 
				
			||||
     * The move-assignment operator for a Array. | 
				
			||||
     */ | 
				
			||||
    HD Array& operator=(Array&& arr) { | 
				
			||||
        if (this == &arr) return *this; | 
				
			||||
 | 
				
			||||
        if (mIsView) { // If this array is a view, we assign data from the right-hand side.
 | 
				
			||||
            auto arr_it = arr.begin(); | 
				
			||||
            for (auto it = begin(); it != end() and arr_it != arr.end(); ++it) { | 
				
			||||
                *it = *arr_it; | 
				
			||||
                ++arr_it; | 
				
			||||
            } | 
				
			||||
            return *this; | 
				
			||||
        } | 
				
			||||
 | 
				
			||||
        CT_ERROR(arr.mIsView, | 
				
			||||
                 "Cannot move-assign view to a non-view (owner). This would lead to undefined " | 
				
			||||
                 "behavior."); | 
				
			||||
 | 
				
			||||
        // Otherwise, it is implied to be object reassignment.
 | 
				
			||||
        freeArrays(); | 
				
			||||
        mShape = arr.mShape; | 
				
			||||
        pHost = arr.pHost; | 
				
			||||
        pDevice = arr.pDevice; | 
				
			||||
        mIsView = arr.mIsView; | 
				
			||||
        mIsSlice = arr.mIsSlice; | 
				
			||||
        calcEnd(); | 
				
			||||
 | 
				
			||||
        // Make other array empty.
 | 
				
			||||
        arr.pHost = nullptr; | 
				
			||||
        arr.pDevice = nullptr; | 
				
			||||
        arr.mIsView = true; | 
				
			||||
        return *this; | 
				
			||||
    }; | 
				
			||||
 | 
				
			||||
    /**
 | 
				
			||||
     * Used for indexing the Array. | 
				
			||||
     * \param index index of the first dimension | 
				
			||||
     */ | 
				
			||||
    HD Array operator[](const uint32_t index) const { | 
				
			||||
        CT_ERROR_IF(index, >=, shape().dim(0), "Index exceeds axis size"); | 
				
			||||
        return Array(*this, shape().subshape(1), index * shape().stride(0)); | 
				
			||||
    }; | 
				
			||||
 | 
				
			||||
    /**
 | 
				
			||||
     * Used for indexing the Array. | 
				
			||||
     * \param indices a list of indices to index the Array | 
				
			||||
     */ | 
				
			||||
    HD Array operator[](const std::initializer_list<uint32_t> indices) const { | 
				
			||||
        CT_ERROR_IF(indices.size(), >, shape().axes(), | 
				
			||||
                    "Number of indices cannot exceed number of axes"); | 
				
			||||
        auto it = indices.begin(); | 
				
			||||
        uint offset = 0; | 
				
			||||
        for (uint32_t i = 0; i < indices.size(); ++i) { | 
				
			||||
            uint32_t index = *it; | 
				
			||||
            CT_ERROR_IF(index, >=, shape().dim(i), "Index exceeds axis size"); | 
				
			||||
            offset += index * shape().stride(i); | 
				
			||||
            ++it; | 
				
			||||
        } | 
				
			||||
        return Array(*this, shape().subshape(indices.size()), offset); | 
				
			||||
    }; | 
				
			||||
 | 
				
			||||
    /**
     * Starts a comma-separated fill of the Array: writes value to the first element
     * and returns an ArrayLoader positioned at the second one.
     */
    HD ArrayLoader<T> operator<<(const T value) {
        auto cursor = begin();
        *cursor = value;
        cursor.next();
        return ArrayLoader<T>(cursor, end());
    }
				
			||||
 | 
				
			||||
    /** Assigns a value to the first element and returns the assigned value. */
    HD T operator=(const T& value) { return *POINTER = value; }
    /** Converts to a reference to the first element. */
    HD operator T&() { return *POINTER; }
    /** Converts to a const reference to the first element. */
    HD operator const T&() const { return *POINTER; }
				
			||||
 | 
				
			||||
    /**
 | 
				
			||||
     * Used to create slices of the Array. | 
				
			||||
     * \param slices a list of slices to slice the Array | 
				
			||||
     */ | 
				
			||||
    HD Array slice(const std::initializer_list<Slice> slices) const { | 
				
			||||
        CT_ERROR_IF(slices.size(), >, shape().axes(), | 
				
			||||
                    "Number of slices cannot exceed number of axes"); | 
				
			||||
 | 
				
			||||
        uint offset = 0; | 
				
			||||
        Shape new_shape = mShape; | 
				
			||||
        auto it = slices.begin(); | 
				
			||||
        for (uint32_t i = 0; i < slices.size(); ++i) { | 
				
			||||
            uint32_t from_index = it->first; | 
				
			||||
            uint32_t to_index = it->second; | 
				
			||||
            CT_ERROR_IF(from_index, >, to_index, | 
				
			||||
                        "Slice start cannot be greater than than slice end"); | 
				
			||||
            CT_ERROR_IF(from_index, >=, shape().dim(i), "Slice start exceeds axis size"); | 
				
			||||
            CT_ERROR_IF(to_index - 1, >=, shape().dim(i), "Slice end exceeds axis size"); | 
				
			||||
 | 
				
			||||
            offset += from_index * shape().stride(i); | 
				
			||||
            new_shape.mAxisDim[i] = to_index - from_index; | 
				
			||||
            ++it; | 
				
			||||
        } | 
				
			||||
        new_shape.mItems = 1; | 
				
			||||
        for (uint32_t i = 0; i < shape().axes(); ++i) { | 
				
			||||
            new_shape.mItems *= new_shape.dim(i); | 
				
			||||
        } | 
				
			||||
 | 
				
			||||
        Array<T> arr(*this, new_shape, offset); | 
				
			||||
        arr.mIsSlice = true; | 
				
			||||
        return arr; | 
				
			||||
    }; | 
				
			||||
 | 
				
			||||
    /**
 | 
				
			||||
     * Returns this Array with a different Shape. Its self assigning version is reshape. | 
				
			||||
     * If this Array is a slice of another, then it will perform a deep copy, and return | 
				
			||||
     * a new non-view array. | 
				
			||||
     */ | 
				
			||||
    HD Array reshaped(const Shape& new_shape) const { | 
				
			||||
        CT_ERROR_IF(shape().items(), !=, new_shape.items(), | 
				
			||||
                    "New shape cannot have a different number of terms"); | 
				
			||||
        if (mIsSlice) { | 
				
			||||
            Array<T> arr = this->copy(); | 
				
			||||
            return arr.reshaped(new_shape); | 
				
			||||
        } | 
				
			||||
        Array<T> arr = view(); | 
				
			||||
        arr.mShape = new_shape; | 
				
			||||
        return arr; | 
				
			||||
    }; | 
				
			||||
 | 
				
			||||
    /**
     * Replaces the Shape of this Array in place. The new shape must have the same
     * number of items, and the Array must not be a slice.
     * \param new_shape the shape to adopt
     */
    HD void reshape(const Shape& new_shape) {
        CT_ERROR_IF(shape().items(), !=, new_shape.items(),
                    "New shape cannot have a different number of terms");
        CT_ERROR(mIsSlice, "Cannot reshape slice, a new array must be made. (Try reshaped instead)");
        mShape = new_shape;
    }
				
			||||
 | 
				
			||||
    /**
 | 
				
			||||
     * Gets a view that is has at least two dimensions. Useful for promoting | 
				
			||||
     * single vectors to their 2D counterparts. | 
				
			||||
     */ | 
				
			||||
    HD Array atLeast2D() const { | 
				
			||||
        return (shape().axes() == 1) ? Array(*this, {shape().length(), 1}) : view(); | 
				
			||||
    }; | 
				
			||||
 | 
				
			||||
    /**
 | 
				
			||||
     * Flattens the Array into one dimension. | 
				
			||||
     */ | 
				
			||||
    HD Array flatten() const { return reshape({mShape.mItems}); }; | 
				
			||||
 | 
				
			||||
    /**
 | 
				
			||||
     * Returns the Eigen::Map of this Array. | 
				
			||||
     */ | 
				
			||||
    typename EigenAdaptConst<T>::type eigenMap() const { | 
				
			||||
        uint32_t total_dim = mShape.mAxes; | 
				
			||||
        CT_ERROR(mIsSlice, "Mapping to an Eigen array cannot occur on slices") | 
				
			||||
        CT_ERROR_IF(total_dim, !=, 2, | 
				
			||||
                    "Mapping to an Eigen array can only occur on two-dimensional arrays"); | 
				
			||||
        return typename EigenAdaptConst<T>::type(POINTER, mShape.rows(), mShape.cols()); | 
				
			||||
    }; | 
				
			||||
 | 
				
			||||
    /**
 | 
				
			||||
     * Gets the Shape of the Array. | 
				
			||||
     */ | 
				
			||||
    HD Shape shape() const { return mShape; }; | 
				
			||||
 | 
				
			||||
    /**
 | 
				
			||||
     * Gets the pointer to this array, depending on host or device. | 
				
			||||
     */ | 
				
			||||
    HD T* data() const { return POINTER; }; | 
				
			||||
 | 
				
			||||
    /**
 | 
				
			||||
     * Returns the device pointer regardless of host or device. | 
				
			||||
     */ | 
				
			||||
    HD T* dataDevice() const { return pDevice; }; | 
				
			||||
 | 
				
			||||
    HD bool isView() const { return mIsView; };   /**< Gets whether this Array is a view. */ | 
				
			||||
    HD bool isSlice() const { return mIsSlice; }; /**< Gets whether this Array is a slice. */ | 
				
			||||
 | 
				
			||||
    /**
 | 
				
			||||
     * Gets a view of this Array. | 
				
			||||
     */ | 
				
			||||
    HD Array view() const { return Array(*this, mShape); } | 
				
			||||
 | 
				
			||||
    /**
 | 
				
			||||
     * Copies this Array and returns a new Array with the same memory. | 
				
			||||
     */ | 
				
			||||
    HD Array copy() const { | 
				
			||||
        Array<T> arr(mShape, (pDevice == nullptr)); | 
				
			||||
 | 
				
			||||
        auto arr_it = arr.begin(); | 
				
			||||
        for (auto it = begin(); it != end(); ++it) { | 
				
			||||
            *arr_it = *it; | 
				
			||||
            ++arr_it; | 
				
			||||
        } | 
				
			||||
#ifndef DEVICE | 
				
			||||
        if (pDevice != nullptr) { | 
				
			||||
            CudaTools::deviceCopy(pDevice, arr.dataDevice(), mShape.items() * sizeof(T)).wait(); | 
				
			||||
        } | 
				
			||||
#endif | 
				
			||||
        return arr; | 
				
			||||
    }; | 
				
			||||
 | 
				
			||||
    /**
 | 
				
			||||
     * Gets the iterator to the beginning of this Array. | 
				
			||||
     */ | 
				
			||||
    HD ArrayIterator<T> begin() const { return ArrayIterator<T>(POINTER, mShape); }; | 
				
			||||
 | 
				
			||||
    /**
 | 
				
			||||
     * Gets the iterator to the end of this Array. | 
				
			||||
     */ | 
				
			||||
    HD ArrayIterator<T> end() const { return ArrayIterator<T>(POINTER + mEndOffset, mShape); }; | 
				
			||||
 | 
				
			||||
    /**
 | 
				
			||||
     * Sets the values of the entire Array to a constant. This is restricted to numerical types. | 
				
			||||
     */ | 
				
			||||
    HD void setConstant(const T value) const { | 
				
			||||
        static_assert(IS_NUM(T), "Function only available on numeric types."); | 
				
			||||
        for (auto it = begin(); it != end(); ++it) { | 
				
			||||
            *it = value; | 
				
			||||
        } | 
				
			||||
    }; | 
				
			||||
 | 
				
			||||
    /**
 | 
				
			||||
     * Sets the Array values with uniform random values in a specified range. This is restricted to | 
				
			||||
     * numerical types. | 
				
			||||
     * \brief Host only | 
				
			||||
     */ | 
				
			||||
    void setRandom(const T min, const T max) const { | 
				
			||||
        static_assert(IS_NUM(T), "Function only available on numeric types."); | 
				
			||||
        CT_ERROR_IF(max, <, min, "Upper bound of range cannot be larger than lower bound"); | 
				
			||||
        std::random_device rd; | 
				
			||||
        std::mt19937 mt(rd()); | 
				
			||||
        if constexpr (IS_INT(T)) { | 
				
			||||
            std::uniform_int_distribution<T> dist(min, max); | 
				
			||||
            for (auto it = begin(); it != end(); ++it) { | 
				
			||||
                *it = dist(mt); | 
				
			||||
            } | 
				
			||||
        } else if constexpr (IS_FLOAT(T)) { | 
				
			||||
            std::uniform_real_distribution<T> dist(min, max); | 
				
			||||
            for (auto it = begin(); it != end(); ++it) { | 
				
			||||
                *it = dist(mt); | 
				
			||||
            } | 
				
			||||
        } | 
				
			||||
    }; | 
				
			||||
 | 
				
			||||
    /**
 | 
				
			||||
     * Sets the Array values to start from a value and increment by a specified step. This is | 
				
			||||
     * restricted to numerical types. | 
				
			||||
     */ | 
				
			||||
    HD void setRange(T min, const T step = 1) const { | 
				
			||||
        static_assert(IS_NUM(T), "Function only available on numeric types."); | 
				
			||||
        for (auto it = begin(); it != end(); ++it) { | 
				
			||||
            *it = min; | 
				
			||||
            min += step; | 
				
			||||
        } | 
				
			||||
    } | 
				
			||||
    /**
 | 
				
			||||
     * Sets the Array values to be evenly spaced numbers over a given interval. This is restricted | 
				
			||||
     * to floating point types. | 
				
			||||
     */ | 
				
			||||
    HD void setLinspace(const T min, const T max) const { | 
				
			||||
        static_assert(IS_FLOAT(T), "Function only available on numeric floating types."); | 
				
			||||
        CT_ERROR_IF(max, <, min, "Upper bound of range cannot be larger than lower bound"); | 
				
			||||
        T i = 0; | 
				
			||||
        T d = max - min; | 
				
			||||
        T items = (T)(shape().items() - 1); | 
				
			||||
        for (auto it = begin(); it != end(); ++it) { | 
				
			||||
            *it = min + d * (i / items); | 
				
			||||
            i += 1; | 
				
			||||
        } | 
				
			||||
    }; | 
				
			||||
 | 
				
			||||
    /**
 | 
				
			||||
     * Returns array of given shape with constant values. This is restricted to numerical types. | 
				
			||||
     * \brief Host only | 
				
			||||
     */ | 
				
			||||
    static Array constant(const Shape& shape, const T value) { | 
				
			||||
        static_assert(IS_NUM(T), "Function only available on numeric types."); | 
				
			||||
        Array<T> arr(shape); | 
				
			||||
        arr.setConstant(value); | 
				
			||||
        return arr; | 
				
			||||
    }; | 
				
			||||
 | 
				
			||||
    /**
 | 
				
			||||
     * Returns array of given shape with random values in given interval. This is restricted to | 
				
			||||
     * numerical types. | 
				
			||||
     * \brief Host only | 
				
			||||
     */ | 
				
			||||
    static Array random(const Shape& shape, const T min, const T max) { | 
				
			||||
        static_assert(IS_NUM(T), "Function only available on numeric types."); | 
				
			||||
        Array<T> arr(shape); | 
				
			||||
        arr.setRandom(min, max); | 
				
			||||
        return arr; | 
				
			||||
    }; | 
				
			||||
 | 
				
			||||
    /**
 | 
				
			||||
     * Returns evenly spaced values within a given interval. This is restricted to numerical types. | 
				
			||||
     * \brief Host only | 
				
			||||
     */ | 
				
			||||
    static Array range(const T min, const T max, const T step = 1) { | 
				
			||||
        static_assert(IS_NUM(T), "Function only available on numeric types."); | 
				
			||||
        CT_ERROR_IF(max, <, min, "Upper bound of range cannot be larger than lower bound"); | 
				
			||||
        Array<T> arr({(uint32_t)((max - min) / step)}); | 
				
			||||
        arr.setRange(min, step); | 
				
			||||
        return arr; | 
				
			||||
    } | 
				
			||||
 | 
				
			||||
    /**
 | 
				
			||||
     * Returns evenly spaced values within a given interval. This is restricted to floating point | 
				
			||||
     * types. | 
				
			||||
     * \brief Host only | 
				
			||||
     */ | 
				
			||||
    static Array linspace(const T min, const T max, const uint32_t size) { | 
				
			||||
        static_assert(IS_FLOAT(T), "Function only available on numeric floating types."); | 
				
			||||
        Array<T> arr({size}); | 
				
			||||
        arr.setLinspace(min, max); | 
				
			||||
        return arr; | 
				
			||||
    } | 
				
			||||
 | 
				
			||||
    /**
 | 
				
			||||
     * Transposes the internal data and returns the corresponding new Array. | 
				
			||||
     * Its self assigning version is transpose. This is restricted to numerical types. | 
				
			||||
     * \brief Host only | 
				
			||||
     */ | 
				
			||||
    Array transposed() const { | 
				
			||||
        static_assert(IS_NUM(T), "Function only available on numeric types."); | 
				
			||||
        CT_ERROR_IF(shape().axes(), !=, 2, "Tranpose can only occur on two-dimensional arrays"); | 
				
			||||
        Array<T> new_arr({mShape.rows(), mShape.cols()}); | 
				
			||||
        new_arr.eigenMap() = this->eigenMap().transpose().eval(); | 
				
			||||
        return new_arr; | 
				
			||||
    }; | 
				
			||||
 | 
				
			||||
    /**
 | 
				
			||||
     * Transposes the intenal data. Its self assigning version is transpose. | 
				
			||||
     * This is restricted to numerical types. | 
				
			||||
     * \brief Host only | 
				
			||||
     */ | 
				
			||||
    void transpose() { | 
				
			||||
        static_assert(IS_NUM(T), "Function only available on numeric types."); | 
				
			||||
        CT_ERROR_IF(shape().axes(), !=, 2, "Tranpose can only occur on two-dimensional arrays"); | 
				
			||||
        Array<T> new_arr(*this, {mShape.cols(), mShape.rows()}); | 
				
			||||
        new_arr.eigenMap() = this->eigenMap().transpose().eval(); | 
				
			||||
        mShape = Shape({mShape.cols(), mShape.rows()}); | 
				
			||||
    }; | 
				
			||||
 | 
				
			||||
    void inverse() const { | 
				
			||||
        static_assert(IS_FLOAT(T), "Function only available on floating numeric types."); | 
				
			||||
        CT_ERROR_IF(shape().axes(), !=, 2, "Inverse can only occur on two-dimensional arrays"); | 
				
			||||
        CT_ERROR_IF(shape().rows(), !=, shape().cols(), | 
				
			||||
                    "Inverse can only occur on square matrices"); | 
				
			||||
        Array<T> inv(shape()); | 
				
			||||
        inv.eigenMap() = this->eigenMap().inverse(); | 
				
			||||
    }; | 
				
			||||
 | 
				
			||||
    /**
 | 
				
			||||
     * Pins the memory (page locks) for faster memory transfer in concurrent | 
				
			||||
     * transfers. | 
				
			||||
     * \brief Host only | 
				
			||||
     */ | 
				
			||||
    void pinMemory() const { CudaTools::pin(pHost, mShape.items() * sizeof(T)); }; | 
				
			||||
 | 
				
			||||
    /**
 | 
				
			||||
     * Updates the host copy by copying the device data back to the host. | 
				
			||||
     * \brief Host only | 
				
			||||
     */ | 
				
			||||
    StreamID updateHost(const StreamID& stream = DEF_MEM_STREAM) const { | 
				
			||||
        CT_ERROR(mIsView, "Cannot update host on a view"); | 
				
			||||
        CudaTools::pull(pHost, pDevice, mShape.items() * sizeof(T), stream); | 
				
			||||
        return stream; | 
				
			||||
    }; | 
				
			||||
 | 
				
			||||
    /**
 | 
				
			||||
     * Updates the device copy by copying the host data to the device. | 
				
			||||
     * \brief Host only | 
				
			||||
     */ | 
				
			||||
    StreamID updateDevice(const StreamID& stream = DEF_MEM_STREAM) const { | 
				
			||||
        CT_ERROR(mIsView, "Cannot update device on a view"); | 
				
			||||
        CudaTools::push(pHost, pDevice, mShape.items() * sizeof(T), stream); | 
				
			||||
        return stream; | 
				
			||||
    }; | 
				
			||||
}; | 
				
			||||
 | 
				
			||||
template <typename T> | 
				
			||||
void printAxis(std::ostream& out, const Array<T>& arr, const uint32_t axis, size_t width) { | 
				
			||||
    std::string space = std::string(2 * axis, ' '); | 
				
			||||
    if (arr.shape().axes() == 1) { | 
				
			||||
        out << "["; | 
				
			||||
        for (uint32_t i = 0; i < arr.shape().items(); ++i) { | 
				
			||||
            if constexpr (std::is_floating_point<T>::value) { | 
				
			||||
                out << std::scientific << std::setprecision(6); | 
				
			||||
            } | 
				
			||||
            if (width == 0) { | 
				
			||||
                out << ((i == 0) ? "" : " "); | 
				
			||||
            } else { | 
				
			||||
                out << std::setw((i == 0) ? width - 1 : width); | 
				
			||||
            } | 
				
			||||
            out << (T)arr[i] << ((i == arr.shape().items() - 1) ? "]" : ","); | 
				
			||||
        } | 
				
			||||
    } else if (arr.shape().axes() == 2) { | 
				
			||||
        for (uint32_t i = 0; i < arr.shape().dim(0); ++i) { | 
				
			||||
            out << space << ((i == 0) ? "[" : " "); | 
				
			||||
            printAxis(out, arr[i], axis + 1, width); | 
				
			||||
            out << ((i == arr.shape().dim(0) - 1) ? "]" : ",\n"); | 
				
			||||
        } | 
				
			||||
    } else { | 
				
			||||
        out << space << "[\n"; | 
				
			||||
        for (uint32_t i = 0; i < arr.shape().dim(0); ++i) { | 
				
			||||
            printAxis(out, arr[i], axis + 1, width); | 
				
			||||
            out << ((i == arr.shape().dim(0) - 1) ? "\n" : ",\n\n"); | 
				
			||||
        } | 
				
			||||
        out << space << "]"; | 
				
			||||
    } | 
				
			||||
} | 
				
			||||
 | 
				
			||||
template <typename T> std::ostream& operator<<(std::ostream& out, const Array<T>& arr) { | 
				
			||||
    size_t width = 0; | 
				
			||||
    if constexpr (IS_NUM(T)) { | 
				
			||||
        T max_val = 0; | 
				
			||||
        bool negative = false; | 
				
			||||
        for (auto it = arr.begin(); it != arr.end(); ++it) { | 
				
			||||
            if (*it < 0) negative = true; | 
				
			||||
            max_val = (abs(*it) > max_val) ? abs(*it) : max_val; | 
				
			||||
        } | 
				
			||||
        width = std::to_string(max_val).size() + 1; | 
				
			||||
        width += (negative) ? 1 : 0; | 
				
			||||
    } else if constexpr (IS_FLOAT(T)) { | 
				
			||||
        T max_val = 0; | 
				
			||||
        bool negative = false; | 
				
			||||
        for (auto it = arr.begin(); it != arr.end(); ++it) { | 
				
			||||
            if (*it < 0) negative = true; | 
				
			||||
            int exp = 0; | 
				
			||||
            frexp(*it, &exp); | 
				
			||||
            max_val = (exp > max_val) ? exp : max_val; | 
				
			||||
        } | 
				
			||||
        width = std::to_string(max_val).size() + 5; | 
				
			||||
        width += (negative) ? 1 : 0; | 
				
			||||
    } | 
				
			||||
 | 
				
			||||
    printAxis<T>(out, arr, 0, (arr.shape().axes() == 1) ? 0 : width); | 
				
			||||
    return out; | 
				
			||||
} | 
				
			||||
 | 
				
			||||
}; // namespace CudaTools
 | 
				
			||||
 | 
				
			||||
#endif // ARRAY_H
 | 
				
			||||
@ -0,0 +1,600 @@ | 
				
			||||
#ifndef BLAS_H | 
				
			||||
#define BLAS_H | 
				
			||||
 | 
				
			||||
#include "Array.h" | 
				
			||||
#include "Core.h" | 
				
			||||
#include "Macros.h" | 
				
			||||
 | 
				
			||||
namespace CudaTools { | 
				
			||||
 | 
				
			||||
namespace BLAS { | 
				
			||||
 | 
				
			||||
/**
 * Describes a strided batch: the element strides used for the A, B and C operands
 * and the number of entries in the batch.
 */
struct BatchInfo {
    uint32_t strideA; /**< Stride used for operand A. */
    uint32_t strideB; /**< Stride used for operand B. */
    uint32_t strideC; /**< Stride used for operand C. */
    uint32_t size;    /**< Number of entries in the batch. */
};
				
			||||
 | 
				
			||||
template <typename T> struct Check { | 
				
			||||
    static void isAtLeast2D(const Array<T>& arr, const std::string& name = "Array") { | 
				
			||||
        CT_ERROR_IF(arr.shape().axes(), <, 2, (name + " needs to be at least 2D").c_str()); | 
				
			||||
    }; | 
				
			||||
 | 
				
			||||
    static void isSquare(const Array<T>& arr, const std::string& name = "Array") { | 
				
			||||
        isAtLeast2D(arr, name); | 
				
			||||
        CT_ERROR_IF(arr.shape().rows(), !=, arr.shape().cols(), (name + " is not square").c_str()) | 
				
			||||
    }; | 
				
			||||
 | 
				
			||||
    static void isValidMatmul(const Array<T>& A, const Array<T>& B, const Array<T>& C, | 
				
			||||
                              const std::string& nameA = "A", const std::string& nameB = "B", | 
				
			||||
                              const std::string nameC = "C") { | 
				
			||||
        isAtLeast2D(A, nameA); | 
				
			||||
        isAtLeast2D(B, nameB); | 
				
			||||
        isAtLeast2D(C, nameB); | 
				
			||||
        CT_ERROR_IF(A.shape().cols(), !=, B.shape().rows(), | 
				
			||||
                    (nameA + nameB + " is not a valid matrix multiplication").c_str()); | 
				
			||||
 | 
				
			||||
        Shape ABshape({A.shape().rows(), B.shape().cols()}); | 
				
			||||
        Shape Cshape({C.shape().rows(), C.shape().cols()}); | 
				
			||||
 | 
				
			||||
        CT_ERROR_IF( | 
				
			||||
            ABshape, !=, Cshape, | 
				
			||||
            ("The shape of " + nameA + nameB + " does not match the shape of " + nameC).c_str()); | 
				
			||||
    }; | 
				
			||||
 | 
				
			||||
    static uint32_t getUpperItems(const Array<T>& arr) { | 
				
			||||
        uint32_t upperItems = 1; | 
				
			||||
        for (uint32_t iAxis = 0; iAxis < arr.shape().axes() - 2; ++iAxis) { | 
				
			||||
            upperItems *= arr.shape().dim(iAxis); | 
				
			||||
        } | 
				
			||||
        return upperItems; | 
				
			||||
    }; | 
				
			||||
 | 
				
			||||
    static void matchUpperShape(const Array<T>& A, const Array<T>& B, | 
				
			||||
                                const std::string& nameA = "A", const std::string& nameB = "B") { | 
				
			||||
        CT_ERROR_IF(A.shape().axes(), !=, B.shape().axes(), | 
				
			||||
                    (nameA + " and " + nameB + " shapes do not match for broadcasting").c_str()); | 
				
			||||
        for (uint32_t iAxis = 0; iAxis < A.shape().axes() - 2; ++iAxis) { | 
				
			||||
            uint32_t Adim = A.shape().dim(iAxis); | 
				
			||||
            uint32_t Bdim = B.shape().dim(iAxis); | 
				
			||||
            CT_ERROR_IF( | 
				
			||||
                Adim, !=, Bdim, | 
				
			||||
                (nameA + " and " + nameB + " shapes do not match for broadcasting").c_str()); | 
				
			||||
        } | 
				
			||||
    }; | 
				
			||||
 | 
				
			||||
    static BatchInfo isBroadcastable(const Array<T>& A, const Array<T>& B, const Array<T>& C, | 
				
			||||
                                     const std::string& nameA = "A", const std::string& nameB = "B", | 
				
			||||
                                     const std::string nameC = "C") { | 
				
			||||
        isValidMatmul(A, B, C, nameA, nameB, nameC); | 
				
			||||
        uint32_t itemsA = getUpperItems(A); | 
				
			||||
        uint32_t itemsB = getUpperItems(B); | 
				
			||||
        uint32_t itemsC = getUpperItems(C); | 
				
			||||
 | 
				
			||||
        uint32_t Asize = A.shape().rows() * A.shape().cols(); | 
				
			||||
        uint32_t Bsize = B.shape().rows() * B.shape().cols(); | 
				
			||||
        uint32_t Csize = C.shape().rows() * C.shape().cols(); | 
				
			||||
 | 
				
			||||
        if (itemsA == itemsB) { | 
				
			||||
            CT_ERROR_IF(itemsA, !=, itemsC, | 
				
			||||
                        ("Incorrect dimensions to broadcast to output " + nameC).c_str()); | 
				
			||||
            matchUpperShape(A, B, nameA, nameB); | 
				
			||||
            matchUpperShape(A, C, nameA, nameC); | 
				
			||||
            return BatchInfo{Asize, Bsize, Csize, itemsC}; | 
				
			||||
        } else if (itemsA > itemsB) { | 
				
			||||
            CT_ERROR_IF( | 
				
			||||
                itemsB, !=, 1, | 
				
			||||
                ("Cannot broadcast operation to " + nameB + " with non-matching " + nameA).c_str()); | 
				
			||||
            CT_ERROR_IF(itemsA, !=, itemsC, | 
				
			||||
                        ("Incorrect dimensions to broadcast to output " + nameC).c_str()); | 
				
			||||
            matchUpperShape(A, C, nameA, nameC); | 
				
			||||
            return BatchInfo{Asize, 0, Csize, itemsC}; | 
				
			||||
        } else { | 
				
			||||
            CT_ERROR_IF( | 
				
			||||
                itemsA, !=, 1, | 
				
			||||
                ("Cannot broadcast operation to " + nameA + " with non-matching " + nameB).c_str()); | 
				
			||||
            CT_ERROR_IF(itemsA, !=, itemsC, | 
				
			||||
                        ("Incorrect dimensions to broadcast to output " + nameC).c_str()); | 
				
			||||
            matchUpperShape(B, C, nameB, nameC); | 
				
			||||
            return BatchInfo{0, Bsize, Csize, itemsC}; | 
				
			||||
        } | 
				
			||||
    }; | 
				
			||||
}; | 
				
			||||
 | 
				
			||||
/**
 | 
				
			||||
 * Represents a Batch of Arrays with the same shape. Mainly used for cuBLAS functions. | 
				
			||||
 */ | 
				
			||||
template <typename T> class Batch { | 
				
			||||
  protected: | 
				
			||||
    Array<T*> mBatch; | 
				
			||||
    Shape mShape; | 
				
			||||
 | 
				
			||||
    uint32_t mCount = 0; | 
				
			||||
    uint32_t mBatchSize; | 
				
			||||
 | 
				
			||||
  public: | 
				
			||||
    Batch() = delete; | 
				
			||||
 | 
				
			||||
    /**
 | 
				
			||||
     * Constructs a batch from a given size. | 
				
			||||
     */ | 
				
			||||
    Batch(const uint32_t size) : mBatchSize(size){}; | 
				
			||||
 | 
				
			||||
    /**
 | 
				
			||||
     * Constructs a batch from a non-view Array. | 
				
			||||
     */ | 
				
			||||
    Batch(const Array<T>& arr) { | 
				
			||||
        CT_ERROR(arr.isView(), "Array cannot be a view"); | 
				
			||||
        mShape = Shape({arr.shape().rows(), arr.shape().cols()}); | 
				
			||||
        mBatchSize = mCount = Check<T>::getUpperItems(arr); | 
				
			||||
 | 
				
			||||
        mBatch = Array<T*>({mBatchSize}); | 
				
			||||
 | 
				
			||||
        Array<T> batch = arr.reshaped({mBatchSize, mShape.rows(), mShape.cols()}); | 
				
			||||
        for (uint32_t i = 0; i < mBatchSize; ++i) { | 
				
			||||
#ifdef CUDA | 
				
			||||
            mBatch[i] = batch[i].dataDevice(); | 
				
			||||
#else | 
				
			||||
            mBatch[i] = batch[i].data(); | 
				
			||||
#endif | 
				
			||||
        } | 
				
			||||
 | 
				
			||||
        mBatch.updateDevice().wait(); | 
				
			||||
    }; | 
				
			||||
 | 
				
			||||
    /**
 | 
				
			||||
     * Adds a matrix to the batch. Array must be a view. | 
				
			||||
     */ | 
				
			||||
    void add(const Array<T>& arr) { | 
				
			||||
        CT_ERROR(not arr.isView(), "Cannot add non-view Arrays"); | 
				
			||||
        CT_ERROR_IF(mCount, ==, mBatchSize, "Batch is full, cannot add more arrays"); | 
				
			||||
#ifdef CUDA | 
				
			||||
        mBatch[mCount] = arr.dataDevice(); | 
				
			||||
#else | 
				
			||||
        mBatch[mCount] = arr.data(); | 
				
			||||
#endif | 
				
			||||
        if (mCount == 0) { | 
				
			||||
            mShape = arr.shape(); | 
				
			||||
            mBatchSize = mCount = Check<T>::getUpperItems(arr); | 
				
			||||
        } else { | 
				
			||||
            CT_ERROR_IF(arr.shape(), !=, mShape, "Cannot add matrix of different shape to batch"); | 
				
			||||
        } | 
				
			||||
        ++mCount; | 
				
			||||
 | 
				
			||||
        if (mCount == mBatchSize) { | 
				
			||||
            mBatch.updateDevice().wait(); | 
				
			||||
        } | 
				
			||||
    }; | 
				
			||||
 | 
				
			||||
    /**
 | 
				
			||||
     * Indexing operator which returns a view of the Array in the Batch at the given index. | 
				
			||||
     */ | 
				
			||||
    Array<T> operator[](const uint32_t index) const { | 
				
			||||
        CT_ERROR_IF(index, >=, mBatchSize, "Index exceeds batch size"); | 
				
			||||
        return Array<T>(mBatch[index], {mShape.rows(), mShape.cols()}); | 
				
			||||
    }; | 
				
			||||
 | 
				
			||||
    /**
 | 
				
			||||
     * Returns the batch Array of pointers. | 
				
			||||
     */ | 
				
			||||
    Array<T*> batch() const { return mBatch.view(); }; | 
				
			||||
    Shape shape() const { return mShape; } /**< Gets the shape of the matrices in the batch. */ | 
				
			||||
    uint32_t size() const { return mBatchSize; }        /**< Gets the batch size.*/ | 
				
			||||
    bool full() const { return mBatchSize == mCount; }; /**< Gets if the batch is full. */ | 
				
			||||
}; | 
				
			||||
 | 
				
			||||
////////////////
 | 
				
			||||
// cuBLAS API //
 | 
				
			||||
////////////////
 | 
				
			||||
 | 
				
			||||
template <typename T, typename F1, typename F2, typename... Args> | 
				
			||||
constexpr void invoke(F1 f1, F2 f2, Args&&... args) { | 
				
			||||
    if constexpr (std::is_same<T, float>::value) { | 
				
			||||
        CUBLAS_CHECK(f1(args...)); | 
				
			||||
    } else if constexpr (std::is_same<T, double>::value) { | 
				
			||||
        CUBLAS_CHECK(f2(args...)); | 
				
			||||
    } else { | 
				
			||||
        CT_ERROR(true, "BLAS functions are not callable with that type"); | 
				
			||||
    } | 
				
			||||
} | 
				
			||||
 | 
				
			||||
/**
 | 
				
			||||
 * Computes the matrix-vector product: \f$ y = \alpha Ax + \beta y \f$. It will automatically | 
				
			||||
 * broadcast the operation if applicable. | 
				
			||||
 */ | 
				
			||||
template <typename T> | 
				
			||||
StreamID GEMV(const T alpha, const Array<T>& A, const Array<T>& x, const T beta, const Array<T>& y, | 
				
			||||
              const StreamID& stream = DEF_CUBLAS_STREAM) { | 
				
			||||
 | 
				
			||||
    BatchInfo bi = Check<T>::isBroadcastable(A, x, y, "A", "x", "y"); | 
				
			||||
    CT_ERROR_IF(x.shape().cols(), !=, 1, "x must be a column vector"); | 
				
			||||
    CT_ERROR_IF(y.shape().cols(), !=, 1, "x must be a column vector"); | 
				
			||||
 | 
				
			||||
    uint32_t rows = A.shape().rows(); | 
				
			||||
    uint32_t cols = A.shape().cols(); | 
				
			||||
    T a = alpha, b = beta; | 
				
			||||
#ifdef CUDA | 
				
			||||
    CUBLAS_CHECK( | 
				
			||||
        cublasSetStream(Manager::get()->cublasHandle(), Manager::get()->stream(stream.id))); | 
				
			||||
    if (bi.size == 1) { | 
				
			||||
        invoke<T>(cublasSgemv, cublasDgemv, Manager::get()->cublasHandle(), CUBLAS_OP_N, rows, cols, | 
				
			||||
                  &a, A.dataDevice(), rows, x.dataDevice(), 1, &b, y.dataDevice(), 1); | 
				
			||||
 | 
				
			||||
    } else { // Greater than 2, so broadcast.
 | 
				
			||||
        invoke<T>(cublasSgemvStridedBatched, cublasDgemvStridedBatched, | 
				
			||||
                  Manager::get()->cublasHandle(), CUBLAS_OP_N, rows, cols, &a, A.dataDevice(), rows, | 
				
			||||
                  bi.strideA, x.dataDevice(), 1, bi.strideB, &b, y.dataDevice(), 1, bi.strideC, | 
				
			||||
                  bi.size); | 
				
			||||
    } | 
				
			||||
 | 
				
			||||
#else | 
				
			||||
    if (bi.size == 1) { | 
				
			||||
        y.eigenMap() = a * (A.eigenMap() * x.eigenMap()) + b * y.eigenMap(); | 
				
			||||
    } else { // Greater than 2, so broadcast.
 | 
				
			||||
#pragma omp parallel for | 
				
			||||
        for (uint32_t i = 0; i < bi.size; ++i) { | 
				
			||||
            auto Ai = Array<T>(A, {rows, cols}, i * bi.strideA).eigenMap(); | 
				
			||||
            auto xi = Array<T>(x, {cols, 1}, i * bi.strideB).eigenMap(); | 
				
			||||
            auto yi = Array<T>(y, {rows, 1}, i * bi.strideC).eigenMap(); | 
				
			||||
            yi = a * (Ai * xi) + b * yi; | 
				
			||||
        } | 
				
			||||
    } | 
				
			||||
#endif | 
				
			||||
    return StreamID{stream}; | 
				
			||||
} | 
				
			||||
 | 
				
			||||
/**
 | 
				
			||||
 * Computes the matrix-matrix product: \f$ C = \alpha AB + \beta C \f$. It will automatically | 
				
			||||
 * broadcast the operation if applicable. | 
				
			||||
 */ | 
				
			||||
template <typename T> | 
				
			||||
StreamID GEMM(const T alpha, const Array<T>& A, const Array<T>& B, const T beta, const Array<T>& C, | 
				
			||||
              const StreamID& stream = DEF_CUBLAS_STREAM) { | 
				
			||||
 | 
				
			||||
    BatchInfo bi = Check<T>::isBroadcastable(A, B, C, "A", "B", "C"); | 
				
			||||
    // A is m x k, B is k x n.
 | 
				
			||||
    uint32_t m = A.shape().rows(); | 
				
			||||
    uint32_t k = A.shape().cols(); | 
				
			||||
    uint32_t n = B.shape().cols(); | 
				
			||||
 | 
				
			||||
    T a = alpha, b = beta; | 
				
			||||
#ifdef CUDA | 
				
			||||
    CUBLAS_CHECK( | 
				
			||||
        cublasSetStream(Manager::get()->cublasHandle(), Manager::get()->stream(stream.id))); | 
				
			||||
    if (bi.size == 1) { | 
				
			||||
        invoke<T>(cublasSgemm, cublasDgemm, Manager::get()->cublasHandle(), CUBLAS_OP_N, | 
				
			||||
                  CUBLAS_OP_N, m, n, k, &a, A.dataDevice(), m, B.dataDevice(), k, &b, | 
				
			||||
                  C.dataDevice(), m); | 
				
			||||
 | 
				
			||||
    } else { // Greater than 2, so broadcast.
 | 
				
			||||
        invoke<T>(cublasSgemmStridedBatched, cublasDgemmStridedBatched, | 
				
			||||
                  Manager::get()->cublasHandle(), CUBLAS_OP_N, CUBLAS_OP_N, m, n, k, &a, | 
				
			||||
                  A.dataDevice(), m, bi.strideA, B.dataDevice(), k, bi.strideB, &b, C.dataDevice(), | 
				
			||||
                  m, bi.strideC, bi.size); | 
				
			||||
    } | 
				
			||||
 | 
				
			||||
#else | 
				
			||||
    if (bi.size == 1) { | 
				
			||||
        C.eigenMap() = a * (A.eigenMap() * B.eigenMap()) + b * C.eigenMap(); | 
				
			||||
    } else { // Greater than 2, so broadcast.
 | 
				
			||||
#pragma omp parallel for | 
				
			||||
        for (uint32_t i = 0; i < bi.size; ++i) { | 
				
			||||
            auto Ai = Array<T>(A, {m, k}, i * bi.strideA).eigenMap(); | 
				
			||||
            auto Bi = Array<T>(B, {k, n}, i * bi.strideB).eigenMap(); | 
				
			||||
            auto Ci = Array<T>(C, {m, n}, i * bi.strideC).eigenMap(); | 
				
			||||
            Ci = a * (Ai * Bi) + b * Ci; | 
				
			||||
        } | 
				
			||||
    } | 
				
			||||
#endif | 
				
			||||
    return StreamID{stream}; | 
				
			||||
} | 
				
			||||
 | 
				
			||||
/**
 | 
				
			||||
 * Computes the diagonal matrix multiplication: \f$ C = A\mathrm{diag}(X) \f$, or \f$ C = | 
				
			||||
 * \mathrm{diag}(X)A \f$ if left = true. | 
				
			||||
 */ | 
				
			||||
template <typename T> | 
				
			||||
StreamID DGMM(const Array<T>& A, const Array<T>& X, const Array<T>& C, const bool left = false, | 
				
			||||
              const StreamID& stream = DEF_CUBLAS_STREAM) { | 
				
			||||
    CT_ERROR_IF(X.shape().cols(), !=, 1, "'x' must be a column vector."); | 
				
			||||
    if (left) { | 
				
			||||
        CT_ERROR_IF(A.shape().rows(), !=, X.shape().rows(), | 
				
			||||
                    "Rows of 'A' and length of 'x' need to match."); | 
				
			||||
    } else { | 
				
			||||
        CT_ERROR_IF(A.shape().cols(), !=, X.shape().rows(), | 
				
			||||
                    "Columns of 'A' and length of 'x' need to match."); | 
				
			||||
    } | 
				
			||||
    CT_ERROR_IF(A.shape().rows(), !=, C.shape().rows(), | 
				
			||||
                "Rows of 'A' and rows() of 'C' need to  match."); | 
				
			||||
    CT_ERROR_IF(A.shape().cols(), !=, C.shape().cols(), | 
				
			||||
                "Rows of 'A' and columns of 'C' need to match."); | 
				
			||||
 | 
				
			||||
#ifdef CUDA | 
				
			||||
    uint32_t m = C.shape().rows(); | 
				
			||||
    uint32_t n = C.shape().cols(); | 
				
			||||
    auto mode = (left) ? CUBLAS_SIDE_LEFT : CUBLAS_SIDE_RIGHT; | 
				
			||||
    CUBLAS_CHECK( | 
				
			||||
        cublasSetStream(Manager::get()->cublasHandle(), Manager::get()->stream(stream.id))); | 
				
			||||
    invoke<T>(cublasSdgmm, cublasDdgmm, Manager::get()->cublasHandle(), m, n, A.dataDevice(), | 
				
			||||
              A.shape().rows(), X.dataDevice(), 1, C.dataDevice(), m); | 
				
			||||
#else | 
				
			||||
    if (left) { | 
				
			||||
        C.eigenMap() = X.eigenMap().asDiagonal() * A.eigenMap(); | 
				
			||||
    } else { | 
				
			||||
        C.eigenMap() = A.eigenMap() * X.eigenMap().asDiagonal(); | 
				
			||||
    } | 
				
			||||
#endif | 
				
			||||
    return StreamID{stream}; | 
				
			||||
} | 
				
			||||
 | 
				
			||||
//////////////////////////////
 | 
				
			||||
// PLUArray Related Objects //
 | 
				
			||||
//////////////////////////////
 | 
				
			||||
 | 
				
			||||
///////////////////////////
 | 
				
			||||
// PartialPivLU Wrapper  //
 | 
				
			||||
///////////////////////////
 | 
				
			||||
 | 
				
			||||
// This class is just a workaround to use Eigen's internals directly.
 | 
				
			||||
template <typename T> class PartialPivLU; | 
				
			||||
namespace internal { | 
				
			||||
template <typename T> static Array<T> empty({1, 1}); | 
				
			||||
template <typename T> static EigenMapMat<T> empty_map = empty<T>.eigenMap(); | 
				
			||||
}; // namespace internal
 | 
				
			||||
 | 
				
			||||
template <typename T, ENABLE_IF(IS_FLOAT(T)) = true> class PLUArray; | 
				
			||||
// This is a wrapper class for Eigen's class so we have more controlled access to
 | 
				
			||||
// the underlying data.
 | 
				
			||||
template <typename T> class PartialPivLU : public Eigen::PartialPivLU<Eigen::Ref<EigenMat<T>>> { | 
				
			||||
  private: | 
				
			||||
    using Base = Eigen::PartialPivLU<Eigen::Ref<EigenMat<T>>>; | 
				
			||||
    template <typename U, ENABLE_IF(IS_FLOAT(U))> friend class PLUArray; | 
				
			||||
 | 
				
			||||
    EigenMapMat<T> mMapLU; | 
				
			||||
    EigenMapMat<int32_t> mMapPivots; | 
				
			||||
 | 
				
			||||
  public: | 
				
			||||
    PartialPivLU() | 
				
			||||
        : Base(internal::empty_map<T>), mMapLU(internal::empty_map<T>), | 
				
			||||
          mMapPivots(internal::empty_map<int32_t>){}; | 
				
			||||
 | 
				
			||||
    void make(const Array<T>& lu, const Array<int32_t>& pivots) { | 
				
			||||
 | 
				
			||||
        new (&mMapLU) EigenMapMat<T>(lu.eigenMap()); | 
				
			||||
        new (&mMapPivots) EigenMapMat<int32_t>(pivots.atLeast2D().eigenMap()); | 
				
			||||
 | 
				
			||||
        new (&this->m_lu) decltype(Base::m_lu)(mMapLU.derived()); | 
				
			||||
        new (&this->m_p) decltype(Base::m_p)(mMapPivots.derived()); | 
				
			||||
 | 
				
			||||
        // new (&this->m_rowsTranspositions) decltype(Base::m_rowsTranspositions)(
 | 
				
			||||
        //     mMapPivots.derived());
 | 
				
			||||
 | 
				
			||||
        this->m_l1_norm = 0; | 
				
			||||
        this->m_det_p = 0; | 
				
			||||
        this->m_isInitialized = true; | 
				
			||||
    }; | 
				
			||||
}; | 
				
			||||
 | 
				
			||||
namespace internal { | 
				
			||||
// We only create one and copy-construct to avoid the re-initialization.
 | 
				
			||||
template <typename T> static PartialPivLU<T> BlankPPLU = PartialPivLU<T>(); | 
				
			||||
}; // namespace internal
 | 
				
			||||
 | 
				
			||||
/**
 | 
				
			||||
 * Class for storing the PLU decomposition an Array. This is restricted to floating point types. | 
				
			||||
 */ | 
				
			||||
template <typename T, ENABLE_IF(IS_FLOAT(T))> class PLUArray { | 
				
			||||
  private: | 
				
			||||
    Array<T> mLU; | 
				
			||||
    Array<int32_t> mPivots; | 
				
			||||
    PartialPivLU<T> mPPLU = internal::BlankPPLU<T>; | 
				
			||||
 | 
				
			||||
  public: | 
				
			||||
    PLUArray() = delete; | 
				
			||||
 | 
				
			||||
    /**
 | 
				
			||||
     * Constructor for a PLUArray given the matrix dimension. | 
				
			||||
     */ | 
				
			||||
    PLUArray(const uint32_t n) : mLU({n, n}), mPivots({n}) { mPPLU.make(mLU, mPivots); }; | 
				
			||||
 | 
				
			||||
    /**
 | 
				
			||||
     * Constructor for a PLUArray given an existing array. | 
				
			||||
     */ | 
				
			||||
    PLUArray(const Array<T>& arr) | 
				
			||||
        : mLU((arr.isView()) ? arr.view() : arr), mPivots({arr.shape().rows()}) { | 
				
			||||
        CT_ERROR_IF(mLU.shape().axes(), !=, 2, "Array must be a 2D matrix"); | 
				
			||||
        CT_ERROR_IF(mLU.shape().rows(), !=, mLU.shape().cols(), "Matrix must be square"); | 
				
			||||
        mPPLU.make(mLU, mPivots); | 
				
			||||
    }; | 
				
			||||
 | 
				
			||||
    /**
 | 
				
			||||
     * Constructor for a PLUArray given an existing location in memory for both the matrix and | 
				
			||||
     * the pivots. | 
				
			||||
     */ | 
				
			||||
    PLUArray(const Array<T>& arr, const Array<int32_t> pivots) | 
				
			||||
        : mLU(arr.view()), mPivots(pivots.view()) { | 
				
			||||
        CT_ERROR_IF(mLU.shape().axes(), !=, 2, "Array must be a 2D matrix"); | 
				
			||||
        CT_ERROR_IF(mLU.shape().rows(), !=, mLU.shape().cols(), "Matrix must be square"); | 
				
			||||
        mPPLU.make(mLU, mPivots); | 
				
			||||
    }; | 
				
			||||
 | 
				
			||||
    uint32_t rank() { return mLU.shape().rows(); }; /**< Gets the rank of the LU matrix. */ | 
				
			||||
    Array<T> LU() const { return mLU.view(); };     /**< Gets the LU matrix. */ | 
				
			||||
    Array<int32_t> pivots() const { return mPivots.view(); }; /**< Gets the LU matrix. */ | 
				
			||||
 | 
				
			||||
    /**
 | 
				
			||||
     * Comptues the inplace LU factorization for this array on CPU. | 
				
			||||
     */ | 
				
			||||
    void computeLU() { | 
				
			||||
        mPPLU.compute(); | 
				
			||||
        mPPLU.mMapPivots = mPPLU.permutationP().indices(); | 
				
			||||
    }; | 
				
			||||
 | 
				
			||||
    /**
 | 
				
			||||
     * Solves the system \f$ LUx = b \f$ and returns \f$x\f$. | 
				
			||||
     */ | 
				
			||||
    Array<T> solve(const Array<T>& b) { | 
				
			||||
        Array<T> x(b.shape()); | 
				
			||||
        x.eigenMap() = mPPLU.solve(b.eigenMap()); | 
				
			||||
        return x; | 
				
			||||
    }; | 
				
			||||
}; | 
				
			||||
 | 
				
			||||
/**
 | 
				
			||||
 * This is a batch version of PLUArray, to enable usage of the cuBLAS API. This is restricted to | 
				
			||||
 * floating point types. | 
				
			||||
 */ | 
				
			||||
template <typename T, std::enable_if_t<std::is_floating_point<T>::value, bool> = true> | 
				
			||||
class PLUBatch : public Batch<T> { | 
				
			||||
  private: | 
				
			||||
    Array<int32_t> mPivotsBatch; | 
				
			||||
    Array<int32_t> mInfoLU; | 
				
			||||
    int32_t mInfoSolve; | 
				
			||||
 | 
				
			||||
    bool mInitialized = false; | 
				
			||||
 | 
				
			||||
  public: | 
				
			||||
    /**
 | 
				
			||||
     * Constructor of a PLUBatch from a given batch size. | 
				
			||||
     */ | 
				
			||||
    PLUBatch(const uint32_t size) : Batch<T>(size), mInfoLU({size}){}; | 
				
			||||
 | 
				
			||||
    /**
 | 
				
			||||
     * Constructor of a PLUBatch from a multi-dimensional array, batched across upper dimensions. | 
				
			||||
     */ | 
				
			||||
    PLUBatch(const Array<T>& arr) : Batch<T>(arr) { | 
				
			||||
        Check<T>::isSquare(arr, "LU Array"); | 
				
			||||
 | 
				
			||||
        mPivotsBatch = Array<int32_t>({this->mBatchSize * this->mShape.rows()}); | 
				
			||||
        mInfoLU = Array<int32_t>({this->mBatchSize}); | 
				
			||||
    }; | 
				
			||||
 | 
				
			||||
    /**
 | 
				
			||||
     * Indexing operator which returns the PLUArray in the PLUBatch at the given index. | 
				
			||||
     */ | 
				
			||||
    PLUArray<T> operator[](const uint32_t index) const { | 
				
			||||
        CT_ERROR_IF(index, >=, this->mBatchSize, "Index exceeds batch size"); | 
				
			||||
        Array<T> lu(this->mBatch[index], {this->mShape.rows(), this->mShape.cols()}); | 
				
			||||
        Array<int32_t> pivots(mPivotsBatch.data() + index * this->mShape.rows(), | 
				
			||||
                              {this->mShape.rows()}); | 
				
			||||
        return PLUArray<T>(lu, pivots); | 
				
			||||
    }; | 
				
			||||
 | 
				
			||||
    /**
 | 
				
			||||
     * Computes the inplace PLU decomposition of batch of arrays. | 
				
			||||
     */ | 
				
			||||
    StreamID computeLU(const StreamID& stream = DEF_CUBLAS_STREAM) { | 
				
			||||
#ifdef CUDA | 
				
			||||
        uint32_t n = this->mShape.rows(); | 
				
			||||
        CUBLAS_CHECK( | 
				
			||||
            cublasSetStream(Manager::get()->cublasHandle(), Manager::get()->stream(stream.id))); | 
				
			||||
        invoke<T>(cublasSgetrfBatched, cublasDgetrfBatched, Manager::get()->cublasHandle(), n, | 
				
			||||
                  this->mBatch.dataDevice(), n, mPivotsBatch.dataDevice(), mInfoLU.dataDevice(), | 
				
			||||
                  this->mBatchSize); | 
				
			||||
 | 
				
			||||
#else | 
				
			||||
#pragma omp parallel for | 
				
			||||
        for (uint32_t i = 0; i < this->mBatchSize; ++i) { | 
				
			||||
            (*this)[i].computeLU(); | 
				
			||||
        } | 
				
			||||
#endif | 
				
			||||
        mInitialized = true; | 
				
			||||
        return stream; | 
				
			||||
    }; | 
				
			||||
 | 
				
			||||
    /**
 | 
				
			||||
     * Solves the batched system \f$LUx = b\f$ inplace. The solution \f$x\f$ is written back into | 
				
			||||
     * \f$b\f$. | 
				
			||||
     */ | 
				
			||||
    StreamID solve(const Batch<T>& b, const StreamID& stream = DEF_CUBLAS_STREAM) { | 
				
			||||
        CT_ERROR(not mInitialized, | 
				
			||||
                 "Cannot solve system if PLUBatch has not yet computed its LU decomposition"); | 
				
			||||
        CT_ERROR_IF(b.size(), !=, this->mBatchSize, | 
				
			||||
                    "Upper dimensions of b do not match batch size"); | 
				
			||||
        CT_ERROR_IF(b.shape().rows(), !=, this->mShape.rows(), | 
				
			||||
                    "The length of each column of b must match the matrix rank"); | 
				
			||||
 | 
				
			||||
#ifdef CUDA | 
				
			||||
        uint32_t n = b.shape().rows(); | 
				
			||||
        uint32_t nrhs = b.shape().cols(); | 
				
			||||
        CUBLAS_CHECK( | 
				
			||||
            cublasSetStream(Manager::get()->cublasHandle(), Manager::get()->stream(stream.id))); | 
				
			||||
        invoke<T>(cublasSgetrsBatched, cublasDgetrsBatched, Manager::get()->cublasHandle(), | 
				
			||||
                  CUBLAS_OP_N, n, nrhs, this->mBatch.dataDevice(), n, mPivotsBatch.dataDevice(), | 
				
			||||
                  b.batch().dataDevice(), n, &mInfoSolve, this->mBatchSize); | 
				
			||||
 | 
				
			||||
#else | 
				
			||||
#pragma omp parallel for | 
				
			||||
        for (uint32_t i = 0; i < this->mBatchSize; ++i) { | 
				
			||||
            b[i] = (*this)[i].solve(b[i]); | 
				
			||||
        } | 
				
			||||
#endif | 
				
			||||
        return stream; | 
				
			||||
    }; | 
				
			||||
 | 
				
			||||
    /**
 | 
				
			||||
     * Gets the pivots data from the device to the host. Does nothing for CPU. | 
				
			||||
     */ | 
				
			||||
    StreamID getPivots(const StreamID& stream = DEF_MEM_STREAM) const { | 
				
			||||
        mPivotsBatch.updateHost(stream); | 
				
			||||
        return stream; | 
				
			||||
    }; | 
				
			||||
 | 
				
			||||
    /**
 | 
				
			||||
     * Gets the info array for the LU decomposition for the device to the host. Does not | 
				
			||||
     * return useful information for CPU. | 
				
			||||
     */ | 
				
			||||
    Array<int32_t> getLUInfo() const { | 
				
			||||
        mInfoLU.updateHost().wait(); | 
				
			||||
        return mInfoLU; | 
				
			||||
    }; | 
				
			||||
 | 
				
			||||
    /**
 | 
				
			||||
     * Checks validity of the solve operation. Does not return useful information for CPU. | 
				
			||||
     */ | 
				
			||||
    int32_t validSolve() const { return mInfoSolve == 0; } | 
				
			||||
}; | 
				
			||||
 | 
				
			||||
// /**
 | 
				
			||||
//  * Gets the inverse of each A[i], using an already PLU factorized A[i].
 | 
				
			||||
//  * Only available if compiling with CUDA.
 | 
				
			||||
//  */
 | 
				
			||||
// template <typename T>
 | 
				
			||||
// void inverseBatch(const Array<T*>& batchA, const Array<T*>& batchC, const Array<int>&
 | 
				
			||||
// pivots,
 | 
				
			||||
//                   const Array<int>& info, const Shape shapeA, const Shape shapeC,
 | 
				
			||||
//                   const uint stream = 0) {
 | 
				
			||||
// #ifdef CUDA
 | 
				
			||||
//     CT_ERROR_IF(shapeA.rows(), !=, shapeA.cols(),
 | 
				
			||||
//           "'A' needs to be square, rows() and column need to match.");
 | 
				
			||||
//     CT_ERROR_IF(shapeA.rows(), !=, shapeC.cols(), "'A' needs to be the same shape as
 | 
				
			||||
//     'C'."); CT_ERROR_IF(shapeA.rows(), !=, shapeC.rows(), "'A' needs to be the same shape
 | 
				
			||||
//     as 'C'.");
 | 
				
			||||
 | 
				
			||||
//     CT_ERROR_IF(shapeA.rows(), !=, pivots.shape().rows(),
 | 
				
			||||
//           "Rows()/columns of 'A' and rows() of pivots need to match.");
 | 
				
			||||
//     CT_ERROR_IF(batchA.shape().rows(), !=, pivots.shape().cols(),
 | 
				
			||||
//           "Batch size and columns of pivots need to match.");
 | 
				
			||||
//     CT_ERROR_IF(info.shape().cols(), !=, 1, "Info needs to be a column vector.")
 | 
				
			||||
//     CT_ERROR_IF(batchA.shape().rows(), !=, info.shape().rows(),
 | 
				
			||||
//           "Batch size and length of info need to match.");
 | 
				
			||||
//     CT_ERROR_IF(batchA.shape().rows(), !=, batchC.shape().rows(),
 | 
				
			||||
//           "Batches 'A[i]' and 'C[i]' need to match.");
 | 
				
			||||
 | 
				
			||||
//     std::string s = "cublas" + std::to_string(stream);
 | 
				
			||||
//     CUBLAS_CHECK(
 | 
				
			||||
//         cublasSetStream(Manager::get()->cublasHandle(),
 | 
				
			||||
//         Manager::get()->stream(s)));
 | 
				
			||||
//     invoke<T>(cublasSgetriBatched, cublasDgetriBatched,
 | 
				
			||||
//     Manager::get()->cublasHandle(),
 | 
				
			||||
//               shapeA.rows(), batchA.dataDevice(), shapeA.rows(), pivots.dataDevice(),
 | 
				
			||||
//               batchC.dataDevice(), shapeC.rows(), info.dataDevice(),
 | 
				
			||||
//               batchA.shape().rows());
 | 
				
			||||
// #else
 | 
				
			||||
//     CT_ERROR_IF(true, ==, true, "inverseBatch is not callable without CUDA.");
 | 
				
			||||
// #endif
 | 
				
			||||
// }
 | 
				
			||||
 | 
				
			||||
}; // namespace BLAS
 | 
				
			||||
}; // namespace CudaTools
 | 
				
			||||
 | 
				
			||||
#endif | 
				
			||||
@ -0,0 +1,544 @@ | 
				
			||||
#ifndef CUDATOOLS_H | 
				
			||||
#define CUDATOOLS_H | 
				
			||||
 | 
				
			||||
#include "Macros.h" | 
				
			||||
#include <iostream> | 
				
			||||
#include <string> | 
				
			||||
#include <unordered_map> | 
				
			||||
#include <vector> | 
				
			||||
 | 
				
			||||
namespace CudaTools { | 
				
			||||
 | 
				
			||||
/**
 * Simple wrapper for the name of a stream. Its purpose is to allow
 * 'streams' to be passed around in host code, and to provide simple syntax
 * for waiting on them.
 */
struct StreamID {
  public:
    std::string id; /**< The name of the stream. */

    /**
     * Creates a StreamID with an empty name.
     */
    StreamID() : id("") {}

    /**
     * The constructor for a StreamID.
     * \param id_ the name of the stream
     */
    StreamID(const std::string& id_) : id(id_) {}

    /**
     * The constructor for a StreamID from a C string.
     * \param id_ the name of the stream
     */
    StreamID(const char* id_) : id(id_) {}

    /**
     * Waits for the stream with this stream ID.
     */
    void wait() const;
};
				
			||||
 | 
				
			||||
static const StreamID DEF_MEM_STREAM = StreamID{"defaultMemory"}; | 
				
			||||
static const StreamID DEF_CUBLAS_STREAM = StreamID{"defaultCublas"}; | 
				
			||||
static const StreamID DEF_KERNEL_STREAM = StreamID{"defaultKernel"}; | 
				
			||||
 | 
				
			||||
/**
 | 
				
			||||
 * Allocates memory on the device. | 
				
			||||
 */ | 
				
			||||
void* malloc(const size_t size); | 
				
			||||
 | 
				
			||||
/**
 | 
				
			||||
 * Pins memory on the host. | 
				
			||||
 */ | 
				
			||||
void pin(void* const pHost, const size_t size); | 
				
			||||
 | 
				
			||||
/**
 | 
				
			||||
 * Pushes memory from the host to the device. | 
				
			||||
 */ | 
				
			||||
StreamID push(void* const pHost, void* const pDevice, const size_t size, | 
				
			||||
              const StreamID& stream = DEF_MEM_STREAM); | 
				
			||||
/**
 | 
				
			||||
 * Pulls memory from the device back to the host. | 
				
			||||
 */ | 
				
			||||
StreamID pull(void* const pHost, void* const pDevice, const size_t size, | 
				
			||||
              const StreamID& stream = DEF_MEM_STREAM); | 
				
			||||
/**
 | 
				
			||||
 * Copies memory on the device to another location on the device. | 
				
			||||
 */ | 
				
			||||
StreamID deviceCopy(void* const pSrc, void* const pDest, const size_t size, | 
				
			||||
                    const StreamID& stream = DEF_MEM_STREAM); | 
				
			||||
 | 
				
			||||
/**
 | 
				
			||||
 * Frees memory on the device. | 
				
			||||
 */ | 
				
			||||
void free(void* const pDevice); | 
				
			||||
 | 
				
			||||
#ifdef CUDACC | 
				
			||||
cudaDeviceProp getDeviceProp(); | 
				
			||||
static cudaDeviceProp DeviceProperties = getDeviceProp(); | 
				
			||||
const char* cublasGetErrorString(cublasStatus_t status); | 
				
			||||
#endif | 
				
			||||
 | 
				
			||||
/**
 | 
				
			||||
 * A class that manages various CUDA Runtime components, such as | 
				
			||||
 * streams, events, and handles. | 
				
			||||
 */ | 
				
			||||
class Manager { | 
				
			||||
  private: | 
				
			||||
    static Manager mManagerInstance; | 
				
			||||
    Manager(const std::vector<std::string>& names); | 
				
			||||
    ~Manager(); | 
				
			||||
#ifdef CUDACC | 
				
			||||
    std::unordered_map<std::string, cudaStream_t> mStreams; | 
				
			||||
    cublasHandle_t mCublas; | 
				
			||||
#endif | 
				
			||||
  public: | 
				
			||||
    /**
 | 
				
			||||
     * Used to get the global CudaTools::Manager instance. | 
				
			||||
     */ | 
				
			||||
    static Manager* get() { return &mManagerInstance; }; | 
				
			||||
 | 
				
			||||
    void waitFor(const StreamID& stream) const; /**< Waits for the stream provided. */ | 
				
			||||
    void sync() const;                          /**< Waits until all device code has finished. */ | 
				
			||||
    void addStream(const std::string& name);    /**< Creates a stream with the given name. */ | 
				
			||||
#ifdef CUDACC | 
				
			||||
    cudaStream_t stream(const StreamID& stream) const; | 
				
			||||
    cublasHandle_t cublasHandle() const; | 
				
			||||
#endif | 
				
			||||
}; | 
				
			||||
 | 
				
			||||
namespace Kernel { | 
				
			||||
 | 
				
			||||
/**
 | 
				
			||||
 * A struct that contains the kernel launch parameters. | 
				
			||||
 */ | 
				
			||||
struct Settings { | 
				
			||||
  public: | 
				
			||||
#ifdef CUDACC | 
				
			||||
    dim3 blockGrid; | 
				
			||||
    dim3 threadBlock; | 
				
			||||
    size_t sharedMemoryBytes = 0; | 
				
			||||
#else | 
				
			||||
    size_t threads; | 
				
			||||
#endif | 
				
			||||
    StreamID stream; | 
				
			||||
 | 
				
			||||
    Settings() = default; | 
				
			||||
 | 
				
			||||
    void setGridDim(const size_t x);                 /**< Sets the Grid dimensions. */ | 
				
			||||
    void setGridDim(const size_t x, const size_t y); /**< Sets the Grid dimensions. */ | 
				
			||||
    void setGridDim(const size_t x, const size_t y, | 
				
			||||
                    const size_t z);                  /**< Sets the Grid dimensions. */ | 
				
			||||
    void setBlockDim(const size_t x);                 /**< Sets the Thread Block dimensions. */ | 
				
			||||
    void setBlockDim(const size_t x, const size_t y); /**< Sets the Thread Block dimensions. */ | 
				
			||||
    void setBlockDim(const size_t x, const size_t y, | 
				
			||||
                     const size_t z); /**< Sets the Thread Block dimensions. */ | 
				
			||||
 | 
				
			||||
    void setSharedMemSize(const size_t bytes); /**< Sets the static shared memory size. */ | 
				
			||||
    void setStream(const StreamID& stream);    /**< Sets the stream. */ | 
				
			||||
}; | 
				
			||||
 | 
				
			||||
/**
 | 
				
			||||
 * Returns a kernel launch parameters based on the number of threads, and optionally | 
				
			||||
 * a stream. Should only be used for 'embarrassingly parallel' situations, or where | 
				
			||||
 * each thread corresponds some sort of index. | 
				
			||||
 */ | 
				
			||||
Settings basic(const size_t threads, const StreamID& stream = DEF_KERNEL_STREAM); | 
				
			||||
 | 
				
			||||
}; // namespace Kernel
 | 
				
			||||
 | 
				
			||||
template <typename T> class Array; | 
				
			||||
 | 
				
			||||
/**
 | 
				
			||||
 * A class that holds information about an Array. | 
				
			||||
 */ | 
				
			||||
class Shape { | 
				
			||||
  private: | 
				
			||||
    template <typename T> friend class Array; | 
				
			||||
    uint32_t mAxes; | 
				
			||||
    uint32_t mItems; | 
				
			||||
    uint32_t mAxisDim[CUDATOOLS_ARRAY_MAX_AXES] = {0}; | 
				
			||||
    uint32_t mStride[CUDATOOLS_ARRAY_MAX_AXES] = {0}; | 
				
			||||
 | 
				
			||||
  public: | 
				
			||||
    HD Shape() : mAxes(0), mItems(1){}; | 
				
			||||
    /**
 | 
				
			||||
     * The constructor for a Shape. | 
				
			||||
     * \param dims an initializer list of the dimensions. | 
				
			||||
     */ | 
				
			||||
    HD Shape(const std::initializer_list<uint32_t> dims); | 
				
			||||
 | 
				
			||||
    HD uint32_t axes() const;  /**< Gets the number of axes. */ | 
				
			||||
    HD uint32_t items() const; /**< Gets the total number of items. */ | 
				
			||||
 | 
				
			||||
    HD uint32_t length() const; /**< For 1D shapes, gets the length. In general, gets the dimension
 | 
				
			||||
                                   of the last axis. */ | 
				
			||||
    HD uint32_t rows() const;   /**< For 2D shapes, gets the number of rows. In general, gets the
 | 
				
			||||
                                   dimension of the second to last axis. */ | 
				
			||||
    HD uint32_t cols() const;   /**< For 2D shapes, gets the number of columns. In general, gets the
 | 
				
			||||
                                   dimension of the second to last axis. */ | 
				
			||||
 | 
				
			||||
    HD uint32_t | 
				
			||||
    dim(const uint32_t axis) const; /**< Gets the dimension size of the specified axis. */ | 
				
			||||
    HD uint32_t stride(const uint32_t axis) const; /**< Gets the stride of the specified axis. */ | 
				
			||||
 | 
				
			||||
    /**
 | 
				
			||||
     * Gets the shape at a specific axis of this shape. | 
				
			||||
     * \param axis the axis of where the new shape starts. | 
				
			||||
     */ | 
				
			||||
    HD Shape subshape(const uint32_t axis) const; | 
				
			||||
 | 
				
			||||
    HD bool operator==(const Shape& s) const; /**< Equals operator. */ | 
				
			||||
    HD bool operator!=(const Shape& s) const; /**< Not equals operator. */ | 
				
			||||
}; | 
				
			||||
 | 
				
			||||
std::ostream& operator<<(std::ostream& out, const Shape& s); | 
				
			||||
 | 
				
			||||
}; // namespace CudaTools
 | 
				
			||||
 | 
				
			||||
#ifdef CUDATOOLS_IMPLEMENTATION | 
				
			||||
 | 
				
			||||
namespace CudaTools { | 
				
			||||
 | 
				
			||||
template <typename T, typename... Args> | 
				
			||||
StreamID runKernel(T func, const Kernel::Settings& sett, Args... args) { | 
				
			||||
#ifdef CUDA | 
				
			||||
    func<<<sett.blockGrid, sett.threadBlock, sett.sharedMemoryBytes, | 
				
			||||
           Manager::get()->stream(sett.stream.id)>>>(args...); | 
				
			||||
#else | 
				
			||||
    func(args...); | 
				
			||||
#endif | 
				
			||||
    return sett.stream; | 
				
			||||
} | 
				
			||||
 | 
				
			||||
////////////////////
 | 
				
			||||
// Memory Methods //
 | 
				
			||||
////////////////////
 | 
				
			||||
 | 
				
			||||
/** Asks the global Manager to wait for the stream named by this ID. */
void StreamID::wait() const {
    Manager::get()->waitFor(id);
}
				
			||||
 | 
				
			||||
/**
 * Allocates memory on the device.
 * \param size the number of bytes to allocate
 * \return the device pointer, or nullptr when not compiled with CUDACC
 */
void* malloc(const size_t size) {
#ifdef CUDACC
    // Start from nullptr so a failed allocation can never hand back an indeterminate pointer.
    void* pDevice = nullptr;
    CUDA_CHECK(cudaMalloc(&pDevice, size));
    return pDevice;
#else
    return nullptr;
#endif
}
				
			||||
 | 
				
			||||
/**
 * Frees memory on the device. Null pointers are ignored, and nothing happens when not
 * compiled with CUDACC.
 * \param pDevice the device pointer to free
 */
void free(void* const pDevice) {
#ifdef CUDACC
    if (pDevice == nullptr) return;
    CUDA_CHECK(cudaFree(pDevice));
#endif
}
				
			||||
 | 
				
			||||
/**
 * Enqueues an asynchronous host-to-device copy on the given stream. Does nothing when not
 * compiled with CUDACC.
 * \param pHost the source pointer on the host
 * \param pDevice the destination pointer on the device
 * \param size the number of bytes to copy
 * \param stream the stream the copy is issued on
 * \return the stream that was passed in
 */
StreamID push(void* const pHost, void* const pDevice, const size_t size, const StreamID& stream) {
#ifdef CUDACC
    cudaStream_t copyStream = Manager::get()->stream(stream.id);
    CUDA_CHECK(cudaMemcpyAsync(pDevice, pHost, size, cudaMemcpyHostToDevice, copyStream));
#endif
    return stream;
}
				
			||||
 | 
				
			||||
/**
 * Enqueues an asynchronous device-to-host copy on the given stream. Does nothing when not
 * compiled with CUDACC.
 * \param pHost the destination pointer on the host
 * \param pDevice the source pointer on the device
 * \param size the number of bytes to copy
 * \param stream the stream the copy is issued on
 * \return the stream that was passed in
 */
StreamID pull(void* const pHost, void* const pDevice, const size_t size, const StreamID& stream) {
#ifdef CUDACC
    cudaStream_t copyStream = Manager::get()->stream(stream.id);
    CUDA_CHECK(cudaMemcpyAsync(pHost, pDevice, size, cudaMemcpyDeviceToHost, copyStream));
#endif
    return stream;
}
				
			||||
 | 
				
			||||
/**
 * Enqueues an asynchronous device-to-device copy on the given stream. Does nothing when not
 * compiled with CUDACC.
 * \param pSrc the source pointer on the device
 * \param pDest the destination pointer on the device
 * \param size the number of bytes to copy
 * \param stream the stream the copy is issued on
 * \return the stream that was passed in
 */
StreamID deviceCopy(void* const pSrc, void* const pDest, const size_t size,
                    const StreamID& stream) {
#ifdef CUDACC
    cudaStream_t copyStream = Manager::get()->stream(stream.id);
    CUDA_CHECK(cudaMemcpyAsync(pDest, pSrc, size, cudaMemcpyDeviceToDevice, copyStream));
#endif
    return stream;
}
				
			||||
 | 
				
			||||
/**
 * Pins memory on the host by registering the range with cudaHostRegister using the default
 * flags. Does nothing when not compiled with CUDACC.
 * \param pHost the start of the host memory range
 * \param size the size of the range in bytes
 */
void pin(void* const pHost, const size_t size) {
#ifdef CUDACC
    CUDA_CHECK(
        cudaHostRegister(pHost, size, cudaHostRegisterDefault));
#endif
}
				
			||||
 | 
				
			||||
#ifdef CUDACC | 
				
			||||
/**
 * Selects device 0 and queries its properties.
 * \return the properties of device 0; all fields are zero if the query did not fill them in
 */
cudaDeviceProp getDeviceProp() {
    cudaSetDevice(0);
    // Value-initialize so a failed query returns zeros instead of indeterminate values.
    // NOTE(review): the return codes of both calls are still ignored, as before.
    cudaDeviceProp deviceProp{};
    cudaGetDeviceProperties(&deviceProp, 0);
    return deviceProp;
}
				
			||||
#endif | 
				
			||||
 | 
				
			||||
/////////////////////
 | 
				
			||||
// Manager Methods //
 | 
				
			||||
/////////////////////
 | 
				
			||||
 | 
				
			||||
/**
 * Creates one stream per given name and the cuBLAS handle. Does nothing when not compiled
 * with CUDACC.
 * \param names the names of the streams to create
 */
Manager::Manager(const std::vector<std::string>& names) {
#ifdef CUDACC
    // Iterate by const reference: the names are only read, so no per-element copy is needed.
    for (const auto& name : names) {
        addStream(name);
    }
    CUBLAS_CHECK(cublasCreate(&mCublas));
#endif
}
				
			||||
 | 
				
			||||
/**
 * Destroys every stored stream and then the cuBLAS handle. Does nothing when not compiled
 * with CUDACC.
 */
Manager::~Manager() {
#ifdef CUDACC
    for (auto entry = mStreams.begin(); entry != mStreams.end(); ++entry) {
        CUDA_CHECK(cudaStreamDestroy(entry->second));
    }
    CUBLAS_CHECK(cublasDestroy(mCublas));
#endif
}
				
			||||
 | 
				
			||||
/**
 * Waits for the stream provided. An error is raised if no stream with that name exists.
 * Does nothing when not compiled with CUDACC.
 * \param stream the ID of the stream to synchronize
 */
void Manager::waitFor(const StreamID& stream) const {
#ifdef CUDACC
    const auto found = mStreams.find(stream.id);
    if (found == mStreams.end()) {
        CT_ERROR(true, ("Invalid stream " + stream.id).c_str());
        return;
    }
    CUDA_CHECK(cudaStreamSynchronize(found->second));
#endif
}
				
			||||
 | 
				
			||||
/**
 * Waits until all device code has finished by synchronizing the device. Does nothing when
 * not compiled with CUDACC.
 */
void Manager::sync() const {
#ifdef CUDACC
    CUDA_CHECK(
        cudaDeviceSynchronize());
#endif
}
				
			||||
 | 
				
			||||
/**
 * Creates a stream with the given name. If a stream with that name already exists it is
 * kept and no new stream is created. Does nothing when not compiled with CUDACC.
 * \param name the name of the stream
 */
void Manager::addStream(const std::string& name) {
#ifdef CUDACC
    // Overwriting an existing entry would drop the only handle to the old stream, which
    // could then never be destroyed.
    if (mStreams.find(name) != mStreams.end()) return;

    cudaStream_t s;
    CUDA_CHECK(cudaStreamCreate(&s));
    mStreams[name] = s;
#endif
}
				
			||||
 | 
				
			||||
#ifdef CUDACC | 
				
			||||
/**
 * Gets the CUDA stream registered under the given ID. An error is raised if no stream with
 * that name exists.
 * \param stream the ID of the stream to look up
 * \return the stream handle stored under that name
 */
cudaStream_t Manager::stream(const StreamID& stream) const {
    const auto it = mStreams.find(stream.id);
    if (it != mStreams.end()) {
        return it->second;
    }
    CT_ERROR(true, ("Invalid stream " + stream.id).c_str());
    // Only reached if CT_ERROR does not leave the function; guarantees that every path
    // returns a value instead of falling off the end.
    return cudaStream_t{};
}
				
			||||
 | 
				
			||||
/** Gets the cuBLAS handle owned by this manager. */
cublasHandle_t Manager::cublasHandle() const {
    return mCublas;
}
				
			||||
 | 
				
			||||
Manager Manager::mManagerInstance = Manager({"defaultMemory", "defaultCublas", "defaultKernel"}); | 
				
			||||
#else | 
				
			||||
Manager Manager::mManagerInstance = Manager({""}); | 
				
			||||
#endif | 
				
			||||
 | 
				
			||||
////////////////////
 | 
				
			||||
// Kernel Methods //
 | 
				
			||||
////////////////////
 | 
				
			||||
 | 
				
			||||
namespace Kernel { | 
				
			||||
 | 
				
			||||
/**
 * Sets a one-dimensional grid; the y and z dimensions are set to 1. An error is raised if
 * x exceeds the device limit. Does nothing when not compiled with CUDACC.
 * \param x the grid dimension along x
 */
void Settings::setGridDim(const size_t x) {
#ifdef CUDACC
    // Terminated with ';' like every other CT_ERROR_IF use in this file.
    CT_ERROR_IF(x, >, DeviceProperties.maxGridSize[0], "Total grid size too large");
    blockGrid.x = x;
    blockGrid.y = 1;
    blockGrid.z = 1;
#endif
}
				
			||||
 | 
				
			||||
/**
 * Sets a two-dimensional grid; the z dimension is set to 1. An error is raised if the
 * product or either dimension exceeds the limits checked below. Does nothing when not
 * compiled with CUDACC.
 * \param x the grid dimension along x
 * \param y the grid dimension along y
 */
void Settings::setGridDim(const size_t x, const size_t y) {
#ifdef CUDACC
    CT_ERROR_IF(x * y, >, DeviceProperties.maxGridSize[0], "Total grid size too large.");
    CT_ERROR_IF(x, >, DeviceProperties.maxGridSize[0], "Grid dimension 'x' too large.");
    CT_ERROR_IF(y, >, DeviceProperties.maxGridSize[1], "Grid dimension 'y' too large.");

    blockGrid = dim3(static_cast<unsigned int>(x), static_cast<unsigned int>(y), 1);
#endif
}
				
			||||
 | 
				
			||||
/**
 * Sets a three-dimensional grid. An error is raised if the product or any dimension
 * exceeds the limits checked below. Does nothing when not compiled with CUDACC.
 * \param x the grid dimension along x
 * \param y the grid dimension along y
 * \param z the grid dimension along z
 */
void Settings::setGridDim(const size_t x, const size_t y, const size_t z) {
#ifdef CUDACC
    CT_ERROR_IF(x * y * z, >, DeviceProperties.maxGridSize[0], "Total grid size too large.");
    CT_ERROR_IF(x, >, DeviceProperties.maxGridSize[0], "Grid dimension 'x' too large.");
    CT_ERROR_IF(y, >, DeviceProperties.maxGridSize[1], "Grid dimension 'y' too large.");
    CT_ERROR_IF(z, >, DeviceProperties.maxGridSize[2], "Grid dimension 'z' too large.");

    blockGrid = dim3(static_cast<unsigned int>(x), static_cast<unsigned int>(y),
                     static_cast<unsigned int>(z));
#endif
}
				
			||||
 | 
				
			||||
/**
 * Sets a one-dimensional thread block; the y and z dimensions are set to 1. An error is
 * raised if x exceeds the per-block thread limit or the x-dimension limit of the device.
 * Does nothing when not compiled with CUDACC.
 * \param x the block dimension along x
 */
void Settings::setBlockDim(const size_t x) {
#ifdef CUDACC
    // The total thread count is bounded by maxThreadsPerBlock; maxThreadsDim[] only bounds
    // the individual dimensions.
    CT_ERROR_IF(x, >, DeviceProperties.maxThreadsPerBlock, "Total block size too large.");
    CT_ERROR_IF(x, >, DeviceProperties.maxThreadsDim[0], "Block dimension 'x' too large.");
    threadBlock.x = x;
    threadBlock.y = 1;
    threadBlock.z = 1;
#endif
}
				
			||||
 | 
				
			||||
/**
 * Sets a two-dimensional thread block; the z dimension is set to 1. An error is raised if
 * the total thread count exceeds the per-block limit or a dimension exceeds its own limit.
 * Does nothing when not compiled with CUDACC.
 * \param x the block dimension along x
 * \param y the block dimension along y
 */
void Settings::setBlockDim(const size_t x, const size_t y) {
#ifdef CUDACC
    // The total thread count is bounded by maxThreadsPerBlock; maxThreadsDim[] only bounds
    // the individual dimensions.
    CT_ERROR_IF(x * y, >, DeviceProperties.maxThreadsPerBlock, "Total block size too large.");
    CT_ERROR_IF(x, >, DeviceProperties.maxThreadsDim[0], "Block dimension 'x' too large.");
    CT_ERROR_IF(y, >, DeviceProperties.maxThreadsDim[1], "Block dimension 'y' too large.");
    threadBlock.x = x;
    threadBlock.y = y;
    threadBlock.z = 1;
#endif
}
				
			||||
 | 
				
			||||
/**
 * Sets a three-dimensional thread block. An error is raised if the total thread count
 * exceeds the per-block limit or a dimension exceeds its own limit. Does nothing when not
 * compiled with CUDACC.
 * \param x the block dimension along x
 * \param y the block dimension along y
 * \param z the block dimension along z
 */
void Settings::setBlockDim(const size_t x, const size_t y, const size_t z) {
#ifdef CUDACC
    // The total thread count is bounded by maxThreadsPerBlock; maxThreadsDim[] only bounds
    // the individual dimensions.
    CT_ERROR_IF(x * y * z, >, DeviceProperties.maxThreadsPerBlock, "Total block size too large.");
    CT_ERROR_IF(x, >, DeviceProperties.maxThreadsDim[0], "Block dimension 'x' too large.");
    CT_ERROR_IF(y, >, DeviceProperties.maxThreadsDim[1], "Block dimension 'y' too large.");
    CT_ERROR_IF(z, >, DeviceProperties.maxThreadsDim[2], "Block dimension 'z' too large.");
    threadBlock.x = x;
    threadBlock.y = y;
    threadBlock.z = z;
#endif
}
				
			||||
 | 
				
			||||
/**
 * Stores the shared memory size to use for the launch. Does nothing when not compiled with
 * CUDACC.
 * \param bytes the shared memory size in bytes
 */
void Settings::setSharedMemSize(const size_t bytes) {
#ifdef CUDACC
    this->sharedMemoryBytes = bytes;
#endif
}
				
			||||
 | 
				
			||||
/**
 * Stores the name of the stream to launch on. Does nothing when not compiled with CUDACC.
 * \param stream_ the ID of the stream
 */
void Settings::setStream(const StreamID& stream_) {
#ifdef CUDACC
    this->stream.id = stream_.id;
#endif
}
				
			||||
 | 
				
			||||
/**
 * Returns kernel launch parameters based on the number of threads, and optionally a stream.
 * With CUDACC the threads are spread over as few blocks as the per-block thread limit allows
 * and divided evenly between them; otherwise only the thread count is recorded.
 * \param threads the number of threads requested
 * \param stream the stream to launch on (only stored when compiled with CUDACC)
 * \return the launch settings
 */
Settings basic(const size_t threads, const StreamID& stream) {
    Settings sett;
#ifdef CUDACC
    const size_t max_threads = DeviceProperties.maxThreadsPerBlock;
    // A request for 0 threads would give 0 blocks and then divide by zero below, so at least
    // one thread is launched. The rounding below can already launch more threads than
    // requested, so kernels are presumably guarding their index (TODO confirm for BASIC_LOOP).
    const size_t work = (threads == 0) ? 1 : threads;
    const size_t grid_blocks = (work + max_threads - 1) / max_threads;   // ceil(work / max_threads)
    const size_t block_threads = (work + grid_blocks - 1) / grid_blocks; // ceil(work / grid_blocks)
    sett.setGridDim(grid_blocks);
    sett.setBlockDim(block_threads);
    sett.setStream(stream);
#else
    sett.threads = threads;
#endif
    return sett;
}
				
			||||
} // namespace Kernel
 | 
				
			||||
 | 
				
			||||
/////////////////////
 | 
				
			||||
// Shape Functions //
 | 
				
			||||
/////////////////////
 | 
				
			||||
 | 
				
			||||
/**
 * Builds a Shape from a list of axis dimensions. The axes are walked from last to first,
 * accumulating the item count and giving each axis the number of items that follow it as
 * its stride. For shapes with two or more axes the strides of the last two axes are then
 * swapped so that the trailing 2D block is stored column major.
 * \param dims an initializer list of the dimensions; no dimension may be 0
 */
HD Shape::Shape(const std::initializer_list<uint32_t> dims) : mAxes(dims.size()), mItems(1) {
    CT_ERROR_IF(dims.size(), >, CUDATOOLS_ARRAY_MAX_AXES, "Number of axes exceeds max axes");
    if (mAxes == 0) return;

    const uint32_t* const extents = dims.begin();
    for (uint32_t remaining = mAxes; remaining > 0; --remaining) {
        const uint32_t iAxis = remaining - 1;
        uint32_t dim = extents[iAxis];
        CT_ERROR_IF(dim, ==, 0, "Axis dimension cannot be 0");

        mAxisDim[iAxis] = dim;
        mStride[iAxis] = mItems;
        mItems *= dim;
    }

    if (mAxes > 1) {
        // Swap last two, for column major storage.
        mStride[mAxes - 2] = 1;
        mStride[mAxes - 1] = mAxisDim[mAxes - 2];
    }
}
				
			||||
 | 
				
			||||
/** Gets the number of axes. */
HD uint32_t Shape::axes() const {
    return mAxes;
}

/** Gets the total number of items. */
HD uint32_t Shape::items() const {
    return mItems;
}

/** Gets the dimension of the last axis. */
HD uint32_t Shape::length() const {
    return mAxisDim[mAxes - 1];
}

/** Gets the dimension of the second to last axis. */
HD uint32_t Shape::rows() const {
    return mAxisDim[mAxes - 2];
}

/** Gets the dimension of the last axis. */
HD uint32_t Shape::cols() const {
    return mAxisDim[mAxes - 1];
}

/** Gets the dimension size of the specified axis; the axis index is not range checked. */
HD uint32_t Shape::dim(const uint32_t axis) const {
    return mAxisDim[axis];
}

/** Gets the stride of the specified axis; the axis index is not range checked. */
HD uint32_t Shape::stride(const uint32_t axis) const {
    return mStride[axis];
}
				
			||||
 | 
				
			||||
/**
 * Equals operator. Two shapes are equal when they have the same number of axes and the same
 * dimension on every axis.
 */
HD bool Shape::operator==(const Shape& s) const {
    bool same = (mAxes == s.mAxes);
    for (uint32_t iAxis = 0; same and iAxis < mAxes; ++iAxis) {
        same = (mAxisDim[iAxis] == s.mAxisDim[iAxis]);
    }
    return same;
}
				
			||||
 | 
				
			||||
/** Not equals operator; the negation of operator==. */
HD bool Shape::operator!=(const Shape& s) const {
    const bool equal = (*this == s);
    return not equal;
}
				
			||||
 | 
				
			||||
/**
 * Gets the shape made of the axes from the given axis onwards. The dimensions and strides of
 * those axes are copied, and the item count is the original count divided by the dimensions
 * of the dropped leading axes. Passing the number of axes yields the shape {1}.
 * \param axis the axis of where the new shape starts; must not exceed the number of axes
 */
HD Shape Shape::subshape(const uint32_t axis) const {
    CT_ERROR_IF(axis, >, mAxes, "Axis number exceeds number of axes.");
    if (axis == mAxes) return Shape({1});

    // Item count left after removing the leading axes.
    uint32_t remainingItems = mItems;
    for (uint32_t iAxis = 0; iAxis < axis; ++iAxis) {
        remainingItems /= mAxisDim[iAxis];
    }

    Shape tail({});
    tail.mAxes = mAxes - axis;
    tail.mItems = remainingItems;
    for (uint32_t src = axis, dst = 0; src < mAxes; ++src, ++dst) {
        tail.mAxisDim[dst] = mAxisDim[src];
        tail.mStride[dst] = mStride[src];
    }
    return tail;
}
				
			||||
 | 
				
			||||
std::ostream& operator<<(std::ostream& out, const Shape& s) { | 
				
			||||
    out << "("; | 
				
			||||
    if (s.axes() == 0) return out << ")"; | 
				
			||||
    for (uint32_t iAxis = 0; iAxis < s.axes() - 1; ++iAxis) { | 
				
			||||
        out << s.dim(iAxis) << ", "; | 
				
			||||
    } | 
				
			||||
    return out << s.dim(s.axes() - 1) << ")"; | 
				
			||||
} | 
				
			||||
 | 
				
			||||
#ifdef CUDACC | 
				
			||||
const char* cublasGetErrorString(cublasStatus_t error) { | 
				
			||||
    switch (error) { | 
				
			||||
    case CUBLAS_STATUS_SUCCESS: | 
				
			||||
        return "CUBLAS_STATUS_SUCCESS"; | 
				
			||||
 | 
				
			||||
    case CUBLAS_STATUS_NOT_INITIALIZED: | 
				
			||||
        return "CUBLAS_STATUS_NOT_INITIALIZED"; | 
				
			||||
 | 
				
			||||
    case CUBLAS_STATUS_ALLOC_FAILED: | 
				
			||||
        return "CUBLAS_STATUS_ALLOC_FAILED"; | 
				
			||||
 | 
				
			||||
    case CUBLAS_STATUS_INVALID_VALUE: | 
				
			||||
        return "CUBLAS_STATUS_INVALID_VALUE"; | 
				
			||||
 | 
				
			||||
    case CUBLAS_STATUS_ARCH_MISMATCH: | 
				
			||||
        return "CUBLAS_STATUS_ARCH_MISMATCH"; | 
				
			||||
 | 
				
			||||
    case CUBLAS_STATUS_MAPPING_ERROR: | 
				
			||||
        return "CUBLAS_STATUS_MAPPING_ERROR"; | 
				
			||||
 | 
				
			||||
    case CUBLAS_STATUS_EXECUTION_FAILED: | 
				
			||||
        return "CUBLAS_STATUS_EXECUTION_FAILED"; | 
				
			||||
 | 
				
			||||
    case CUBLAS_STATUS_INTERNAL_ERROR: | 
				
			||||
        return "CUBLAS_STATUS_INTERNAL_ERROR"; | 
				
			||||
    } | 
				
			||||
 | 
				
			||||
    return "<unknown>"; | 
				
			||||
} | 
				
			||||
#endif | 
				
			||||
 | 
				
			||||
};     // namespace CudaTools
 | 
				
			||||
#endif // CUDATOOLS_IMPLEMENTATION
 | 
				
			||||
 | 
				
			||||
#endif // CUDATOOLS_H
 | 
				
			||||
@ -0,0 +1,297 @@ | 
				
			||||
#ifndef MACROS_H | 
				
			||||
#define MACROS_H | 
				
			||||
 | 
				
			||||
#include <exception> | 
				
			||||
#include <sstream> | 
				
			||||
#include <stdarg.h> | 
				
			||||
 | 
				
			||||
#if defined(CUDA) && defined(__CUDACC__) | 
				
			||||
#define CUDACC | 
				
			||||
#endif | 
				
			||||
 | 
				
			||||
#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ > 0) | 
				
			||||
#define DEVICE | 
				
			||||
#endif | 
				
			||||
 | 
				
			||||
#ifdef CUDATOOLS_DOXYGEN | 
				
			||||
/**
 | 
				
			||||
 * \def CUDACC | 
				
			||||
 * This macro is defined when this code is being compiled by nvcc and the CUDA compilation | 
				
			||||
 * flag is set. This should be used to enclose code where CUDA specific libraries and syntax are | 
				
			||||
 * being used. | 
				
			||||
 */ | 
				
			||||
#define CUDACC | 
				
			||||
 | 
				
			||||
/**
 | 
				
			||||
 * \def DEVICE | 
				
			||||
 * This macro is defined when this code is being compiled for the device. The difference between | 
				
			||||
 * this and CUDACC is that this should exclusively be used to decide if code is being compiled | 
				
			||||
 * to execute on the device. CUDACC only determines which compiler is being used. | 
				
			||||
 */ | 
				
			||||
#define DEVICE | 
				
			||||
 | 
				
			||||
/**
 | 
				
			||||
 * \def HD | 
				
			||||
 * Mark a function in front with this if it needs to be callable on both the | 
				
			||||
 * CPU and CUDA device. | 
				
			||||
 */ | 
				
			||||
#define HD | 
				
			||||
 | 
				
			||||
/**
 | 
				
			||||
 * \def SHARED | 
				
			||||
 * Mark a variable as static shared memory. | 
				
			||||
 */ | 
				
			||||
#define SHARED | 
				
			||||
 | 
				
			||||
/**
 | 
				
			||||
 * \def DECLARE_KERNEL(call, ...) | 
				
			||||
 * Used to declare (in header) a CUDA kernel. | 
				
			||||
 * \param call the name of the kernel | 
				
			||||
 * \param ... the arguments of the kernel | 
				
			||||
 */ | 
				
			||||
#define DECLARE_KERNEL(call, ...) | 
				
			||||
 | 
				
			||||
/**
 | 
				
			||||
 * \def DEFINE_KERNEL(call, ...) | 
				
			||||
 * Used to define (in implementation) a CUDA kernel. | 
				
			||||
 * \param call the name of the kernel | 
				
			||||
 * \param ... the arguments of the kernel | 
				
			||||
 */ | 
				
			||||
#define DEFINE_KERNEL(call, ...) | 
				
			||||
 | 
				
			||||
/**
 | 
				
			||||
 * \def KERNEL(call, settings, ...) | 
				
			||||
 * Used to call a CUDA kernel. | 
				
			||||
 * \param call the name of the kernel | 
				
			||||
 * \param settings the associated CudaTools::Kernel::Settings to initialize the kernel with | 
				
			||||
 * \param ... the arguments of the kernel | 
				
			||||
 */ | 
				
			||||
#define KERNEL(call, settings, ...) | 
				
			||||
 | 
				
			||||
/**
 | 
				
			||||
 * \def BASIC_LOOP(N) | 
				
			||||
 * Can be used in conjunction with CudaTools::Kernel::basic, which is mainly used for embarrassingly | 
				
			||||
 * parallel situations. Exposes the loop/thread number as iThread. | 
				
			||||
 * \param N number of iterations | 
				
			||||
 */ | 
				
			||||
#define BASIC_LOOP(N) | 
				
			||||
 | 
				
			||||
/**
 | 
				
			||||
 * \def DEVICE_CLASS(name) | 
				
			||||
 * Can be used inside a class declaration (header) which generates boilerplate code to allow this | 
				
			||||
 * class to be used on the device. | 
				
			||||
 * | 
				
			||||
 * This macro creates a few functions:\n | 
				
			||||
 * name* that(): returns the pointer to this instance on the device. | 
				
			||||
 * | 
				
			||||
 * void allocateDevice(): allocates the memory on the device for this class instance. | 
				
			||||
 * | 
				
			||||
 * CudaTools::StreamID updateHost(const CudaTools::StreamID& stream): updates the host instance | 
				
			||||
 * of the class. | 
				
			||||
 * | 
				
			||||
 * CudaTools::StreamID updateDevice(const CudaTools::StreamID& stream): updates | 
				
			||||
 * the device instance of the class. | 
				
			||||
 * \param name the name of the class | 
				
			||||
 */ | 
				
			||||
#define DEVICE_CLASS(name) | 
				
			||||
 | 
				
			||||
/**
 | 
				
			||||
 * \def CT_ERROR_IF(a, op, b, msg) | 
				
			||||
 * Used for throwing runtime errors given a condition with an operator. | 
				
			||||
 */ | 
				
			||||
#define CT_ERROR_IF(a, op, b, msg) | 
				
			||||
 | 
				
			||||
/**
 | 
				
			||||
 * \def CT_ERROR(a, msg) | 
				
			||||
 * Used for throwing runtime errors given a bool. | 
				
			||||
 */ | 
				
			||||
#define CT_ERROR(a, msg) | 
				
			||||
 | 
				
			||||
/**
 | 
				
			||||
 * \def CUDA_CHECK(call) | 
				
			||||
 * Gets the error generated by a CUDA function call if there is one. | 
				
			||||
 * \param call CUDA function to check if there are errors when running. | 
				
			||||
 */ | 
				
			||||
#define CUDA_CHECK(call) | 
				
			||||
 | 
				
			||||
/**
 | 
				
			||||
 * \def CUBLAS_CHECK(call) | 
				
			||||
 * Gets the error generated by a cuBLAS function call if there is one. | 
				
			||||
 * \param call cuBLAS function to check if there are errors when running. | 
				
			||||
 */ | 
				
			||||
#define CUBLAS_CHECK(call) | 
				
			||||
 | 
				
			||||
/**
 | 
				
			||||
 * \def CUDA_MEM(call, name)
				
			||||
 * Gets the GPU memory used from function call if there is one. | 
				
			||||
 * \param call function to measure memory usage. | 
				
			||||
 * \param name an identifier to use as a variable and when printing. Must satisfy variable naming. | 
				
			||||
 */ | 
				
			||||
#define CUDA_MEM(call, name) | 
				
			||||
#endif | 
				
			||||
 | 
				
			||||
///////////////////
 | 
				
			||||
// KERNEL MACROS //
 | 
				
			||||
///////////////////
 | 
				
			||||
 | 
				
			||||
#ifdef CUDACC | 
				
			||||
 | 
				
			||||
#include <cublas_v2.h> | 
				
			||||
#include <cuda_runtime.h> | 
				
			||||
 | 
				
			||||
#define HD __host__ __device__ | 
				
			||||
#define SHARED __shared__ | 
				
			||||
 | 
				
			||||
#define DECLARE_KERNEL(call, ...) __global__ void call(__VA_ARGS__) | 
				
			||||
 | 
				
			||||
#define DEFINE_KERNEL(call, ...)                                                                   \ | 
				
			||||
    template CudaTools::StreamID CudaTools::runKernel(                                             \
 | 
				
			||||
        void (*)(__VA_ARGS__), const CudaTools::Kernel::Settings&, __VA_ARGS__);                   \
 | 
				
			||||
    __global__ void call(__VA_ARGS__) | 
				
			||||
 | 
				
			||||
#else | 
				
			||||
#define HD | 
				
			||||
#define SHARED | 
				
			||||
 | 
				
			||||
#define DECLARE_KERNEL(call, ...) void call(__VA_ARGS__) | 
				
			||||
 | 
				
			||||
#define DEFINE_KERNEL(call, ...)                                                                   \ | 
				
			||||
    template CudaTools::StreamID CudaTools::runKernel(                                             \
 | 
				
			||||
        void (*)(__VA_ARGS__), const CudaTools::Kernel::Settings&, __VA_ARGS__);                   \
 | 
				
			||||
    void call(__VA_ARGS__) | 
				
			||||
 | 
				
			||||
#endif // CUDACC
 | 
				
			||||
 | 
				
			||||
#define KERNEL(call, settings, ...) CudaTools::runKernel(call, settings, __VA_ARGS__) | 
				
			||||
 | 
				
			||||
///////////////////
 | 
				
			||||
// DEVICE MACROS //
 | 
				
			||||
///////////////////
 | 
				
			||||
 | 
				
			||||
#ifdef DEVICE

/*
 * Device variant: computes this thread's global index as iThread and guards the
 * statement that follows so only indices below N execute it.
 * N is parenthesized so that expression arguments (e.g. `count & mask` or
 * `flag ? a : b`) are evaluated as a whole before the comparison.
 */
#define BASIC_LOOP(N)                                                                              \
    uint32_t iThread = blockIdx.x * blockDim.x + threadIdx.x;                                      \
    if (iThread < (N))
#else
/*
 * Host variant: a plain for-loop over iThread in [0, N), annotated with an OpenMP
 * "parallel for" pragma. N is parenthesized for the same reason as above.
 */
#define BASIC_LOOP(N)                                                                              \
    _Pragma("omp parallel for") for (uint32_t iThread = 0; iThread < (N); ++iThread)

#endif
				
			||||
 | 
				
			||||
//////////////////
 | 
				
			||||
// CLASS MACROS //
 | 
				
			||||
//////////////////
 | 
				
			||||
 | 
				
			||||
/*
 * Generates the updateHost()/updateDevice() members used by DEVICE_CLASS.
 * Both transfer sizeof(name) bytes between this host instance and the instance
 * returned by that(), and return the CudaTools::StreamID produced by the copy.
 *
 * The caller-supplied stream is now forwarded to the copy instead of being
 * silently ignored.
 * NOTE(review): assumes CudaTools::pull/push accept the stream as a trailing
 * argument -- confirm against their declarations in Core.h.
 */
#define UPDATE_FUNC(name)                                                                          \
    inline CudaTools::StreamID updateHost(const CudaTools::StreamID& stream =                      \
                                              CudaTools::DEF_MEM_STREAM) {                         \
        return CudaTools::pull(this, that(), sizeof(name), stream);                                \
    }                                                                                              \
    inline CudaTools::StreamID updateDevice(const CudaTools::StreamID& stream =                    \
                                                CudaTools::DEF_MEM_STREAM) {                       \
        return CudaTools::push(this, that(), sizeof(name), stream);                                \
    }
				
			||||
 | 
				
			||||
#ifdef CUDA | 
				
			||||
 | 
				
			||||
#define DEVICE_CLASS(name)                                                                         \ | 
				
			||||
  private:                                                                                         \
 | 
				
			||||
    name* __deviceInstance__ = nullptr;                                                            \
 | 
				
			||||
                                                                                                   \
 | 
				
			||||
  public:                                                                                          \
 | 
				
			||||
    inline name* that() { return __deviceInstance__; }                                             \
 | 
				
			||||
    inline void allocateDevice() { __deviceInstance__ = (name*)CudaTools::malloc(sizeof(name)); }; \
 | 
				
			||||
    UPDATE_FUNC(name) | 
				
			||||
 | 
				
			||||
#else | 
				
			||||
#define DEVICE_CLASS(name)                                                                         \ | 
				
			||||
  public:                                                                                          \
 | 
				
			||||
    inline name* that() { return this; };                                                          \
 | 
				
			||||
    inline void allocateDevice(){};                                                                \
 | 
				
			||||
    UPDATE_FUNC(name) | 
				
			||||
 | 
				
			||||
#endif | 
				
			||||
 | 
				
			||||
#ifndef CUDATOOLS_ARRAY_MAX_AXES | 
				
			||||
/**
 | 
				
			||||
 * \def CUDATOOLS_ARRAY_MAX_AXES | 
				
			||||
 * The maximum number of axes/dimensions a CudaTools::Array can have. The default is
 * set to 4, but can be manually set to fit the program's needs.
				
			||||
 */ | 
				
			||||
#define CUDATOOLS_ARRAY_MAX_AXES 4 | 
				
			||||
#endif | 
				
			||||
 | 
				
			||||
////////////////////
 | 
				
			||||
// Error Checking //
 | 
				
			||||
////////////////////
 | 
				
			||||
 | 
				
			||||
/*
 * CT_ERROR_IF(a, op, b, msg): reports an error when `(a) op (b)` holds.
 * CT_ERROR(a, msg):           reports an error when `a` is true.
 *
 * - a, b and op are passed to printf as "%s" arguments rather than pasted into the
 *   format string, so an expression containing '%' cannot corrupt the format.
 * - a and b are parenthesized so operator precedence inside the arguments cannot
 *   change the meaning of the comparison or of the stream insertion.
 * - The DEVICE variants only print; the host variants print and then throw
 *   std::exception. The host CT_ERROR_IF also prints the evaluated values of a and b
 *   (both operands are therefore evaluated twice when the check fails).
 * - When NO_DIMENSION_CHECK is defined, both macros expand to nothing so that call
 *   sites still compile.
 */
#ifndef NO_DIMENSION_CHECK
#ifdef DEVICE
#define CT_ERROR_IF(a, op, b, msg)                                                                 \
    if ((a) op (b)) {                                                                              \
        printf("[ERROR] %s:%d\n | %s: (%s) %s (%s).\n", __FILE__, __LINE__, msg, #a, #op, #b);     \
    }

#define CT_ERROR(a, msg)                                                                           \
    if (a) {                                                                                       \
        printf("[ERROR] %s:%d\n | %s: %s.\n", __FILE__, __LINE__, msg, #a);                        \
    }
#else

#define CT_ERROR_IF(a, op, b, msg)                                                                 \
    if ((a) op (b)) {                                                                              \
        std::ostringstream os_a;                                                                   \
        std::ostringstream os_b;                                                                   \
        os_a << (a);                                                                               \
        os_b << (b);                                                                               \
        printf("[ERROR] %s:%d\n | %s: (%s)%s %s (%s)%s.\n", __FILE__, __LINE__, msg, #a,           \
               os_a.str().c_str(), #op, #b, os_b.str().c_str());                                   \
        throw std::exception();                                                                    \
    }

#define CT_ERROR(a, msg)                                                                           \
    if (a) {                                                                                       \
        printf("[ERROR] %s:%d\n | %s: %s.\n", __FILE__, __LINE__, msg, #a);                        \
        throw std::exception();                                                                    \
    }
#endif

#else // NO_DIMENSION_CHECK

#define CT_ERROR_IF(a, op, b, msg)
#define CT_ERROR(a, msg)

#endif // NO_DIMENSION_CHECK
 | 
				
			||||
 | 
				
			||||
#if defined(CUDACC) && !defined(NO_CUDA_CHECK) | 
				
			||||
 | 
				
			||||
#define CUDA_CHECK(call)                                                                           \ | 
				
			||||
    do {                                                                                           \
 | 
				
			||||
        cudaError_t err = (call);                                                                  \
 | 
				
			||||
        if (err != cudaSuccess) {                                                                  \
 | 
				
			||||
            printf("[CUDA] %s:%d\n | %s\n", __FILE__, __LINE__, cudaGetErrorString(err));          \
 | 
				
			||||
            throw std::exception();                                                                \
 | 
				
			||||
        }                                                                                          \
 | 
				
			||||
    } while (0) | 
				
			||||
 | 
				
			||||
#define CUBLAS_CHECK(call)                                                                         \ | 
				
			||||
    do {                                                                                           \
 | 
				
			||||
        cublasStatus_t err = (call);                                                               \
 | 
				
			||||
        if (err != CUBLAS_STATUS_SUCCESS) {                                                        \
 | 
				
			||||
            printf("[cuBLAS] %s:%d\n | %s\n", __FILE__, __LINE__,                                  \
 | 
				
			||||
                   CudaTools::cublasGetErrorString(err));                                          \
 | 
				
			||||
            throw std::exception();                                                                \
 | 
				
			||||
        }                                                                                          \
 | 
				
			||||
    } while (0) | 
				
			||||
 | 
				
			||||
/*
 * Runs `call`, synchronizes through CudaTools::Manager, and prints the change in
 * free GPU memory (as reported by cudaMemGetInfo) in MiB, labelled with `name`.
 *
 * The difference is computed in signed 64-bit arithmetic and printed with %lld:
 * the previous "%i" did not match the size_t argument, and the unsigned subtraction
 * wrapped to a huge value whenever `call` left more memory free than before.
 * Both counters start at 0 so a failed query does not print indeterminate values.
 */
#define CUDA_MEM(call, name)                                                                       \
    size_t free_bef_##name = 0, free_aft_##name = 0;                                               \
    cudaMemGetInfo(&free_bef_##name, NULL);                                                        \
    call;                                                                                          \
    CudaTools::Manager::get()->sync();                                                             \
    cudaMemGetInfo(&free_aft_##name, NULL);                                                        \
    printf("[%s] GPU Memory Usage: %lldMiB\n", #name,                                              \
           (static_cast<long long>(free_bef_##name) - static_cast<long long>(free_aft_##name)) /   \
               (1024 * 1024));
				
			||||
 | 
				
			||||
#else | 
				
			||||
#define CUDA_CHECK(call) (call) | 
				
			||||
#define CUBLAS_CHECK(call) (call) | 
				
			||||
#define CUDA_MEM(call, name) (call) | 
				
			||||
#endif | 
				
			||||
 | 
				
			||||
#endif // MACROS_H
 | 
				
			||||
@ -0,0 +1,95 @@ | 
				
			||||
CC := g++-10
 | 
				
			||||
NVCC := nvcc
 | 
				
			||||
CFLAGS := -Wall -std=c++17 -fopenmp -MMD
 | 
				
			||||
NVCC_FLAGS := -MMD -w -Xcompiler
 | 
				
			||||
 | 
				
			||||
INCLUDE :=
 | 
				
			||||
LIBS_DIR :=
 | 
				
			||||
LIBS_DIR_GPU := /usr/local/cuda/lib64
 | 
				
			||||
LIBS :=
 | 
				
			||||
LIBS_GPU := cuda cudart cublas
 | 
				
			||||
 | 
				
			||||
TARGET = tests
 | 
				
			||||
SRC_DIR = .
 | 
				
			||||
BUILD_DIR = build
 | 
				
			||||
 | 
				
			||||
# Should not need to modify below.
 | 
				
			||||
 | 
				
			||||
CPU_BUILD_DIR = $(BUILD_DIR)/cpu
 | 
				
			||||
GPU_BUILD_DIR = $(BUILD_DIR)/gpu
 | 
				
			||||
 | 
				
			||||
SRC = $(wildcard $(SRC_DIR)/*/*.cpp) $(wildcard $(SRC_DIR)/*.cpp)
 | 
				
			||||
 | 
				
			||||
# Get source files and object files.
 | 
				
			||||
GCC_SRC = $(filter-out %.cu.cpp ,$(SRC))
 | 
				
			||||
NVCC_SRC = $(filter %.cu.cpp, $(SRC))
 | 
				
			||||
GCC_OBJ = $(GCC_SRC:$(SRC_DIR)/%.cpp=%.o)
 | 
				
			||||
NVCC_OBJ = $(NVCC_SRC:$(SRC_DIR)/%.cpp=%.o)
 | 
				
			||||
 | 
				
			||||
# If compiling for CPU, all go to GCC. Otherwise, they are split.
 | 
				
			||||
CPU_OBJ = $(addprefix $(CPU_BUILD_DIR)/,$(GCC_OBJ)) $(addprefix $(CPU_BUILD_DIR)/,$(NVCC_OBJ))
 | 
				
			||||
GPU_GCC_OBJ = $(addprefix $(GPU_BUILD_DIR)/,$(GCC_OBJ))
 | 
				
			||||
GPU_NVCC_OBJ = $(addprefix $(GPU_BUILD_DIR)/,$(NVCC_OBJ))
 | 
				
			||||
 | 
				
			||||
# $(info $$GCC_SRC is [${GCC_SRC}])
 | 
				
			||||
# $(info $$NVCC_SRC is [${NVCC_SRC}])
 | 
				
			||||
# $(info $$GCC_OBJ is [${GCC_OBJ}])
 | 
				
			||||
# $(info $$NVCC_OBJ is [${NVCC_OBJ}])
 | 
				
			||||
 | 
				
			||||
# $(info $$CPU_OBJ is [${CPU_OBJ}])
 | 
				
			||||
# $(info $$GPU_GCC_OBJ is [${GPU_GCC_OBJ}])
 | 
				
			||||
# $(info $$GPU_NVCC_OBJ is [${GPU_NVCC_OBJ}])
 | 
				
			||||
 | 
				
			||||
HEADER = $(wildcard $(SRC_DIR)/*/*.h) $(wildcard $(SRC_DIR)/*.h)
 | 
				
			||||
CPU_DEPS = $(wildcard $(CPU_BUILD_DIR)/*.d)
 | 
				
			||||
GPU_DEPS = $(wildcard $(GPU_BUILD_DIR)/*.d)
 | 
				
			||||
 | 
				
			||||
INC := $(INCLUDE:%=-I%)
 | 
				
			||||
LIB := $(LIBS_DIR:%=-L%)
 | 
				
			||||
LIB_GPU := $(LIBS_DIR_GPU:%=-L%)
 | 
				
			||||
LD := $(LIBS:%=-l%)
 | 
				
			||||
LD_GPU := $(LIBS_GPU:%=-l%)
 | 
				
			||||
 | 
				
			||||
# Reminder:
 | 
				
			||||
# $< = first prerequisite
 | 
				
			||||
# $@ = the target which matched the rule
 | 
				
			||||
# $^ = all prerequisites
 | 
				
			||||
 | 
				
			||||
# cpu and gpu are alias targets (they only depend on the real executables), so they
# are declared phony alongside all and clean.
.PHONY: all cpu gpu clean
				
			||||
 | 
				
			||||
all : cpu gpu | 
				
			||||
 | 
				
			||||
cpu: $(TARGET)CPU | 
				
			||||
gpu: $(TARGET)GPU | 
				
			||||
 | 
				
			||||
# Link the CPU executable. $(LD) holds the -l flags generated from LIBS (the GPU link
# rule already uses it); $(LDFLAGS) is kept so flags supplied by the caller still apply.
$(TARGET)CPU: $(CPU_OBJ)
	$(CC) $(CFLAGS) $^ -o $@ $(INC) $(LIB) $(LD) $(LDFLAGS)
 | 
				
			||||
 | 
				
			||||
$(CPU_BUILD_DIR)/%.o $(CPU_BUILD_DIR)/%.cu.o: $(SRC_DIR)/%.cpp | $(CPU_BUILD_DIR) | 
				
			||||
	$(CC) $(CFLAGS) -c -o $@ $< $(INC)
 | 
				
			||||
 | 
				
			||||
# For GPU, we need to build the NVCC objects, the NVCC linked object, and the
 | 
				
			||||
# regular ones. Then, we link them all together.
 | 
				
			||||
$(TARGET)GPU: $(GPU_BUILD_DIR)/link.o $(GPU_GCC_OBJ) | $(GPU_BUILD_DIR) | 
				
			||||
	$(CC) -g -DCUDA $(CFLAGS) $(GPU_NVCC_OBJ) $^ -o $@ $(INC) $(LIB) $(LIB_GPU) $(LD) $(LD_GPU)
 | 
				
			||||
 | 
				
			||||
$(GPU_BUILD_DIR)/link.o: $(GPU_NVCC_OBJ) | $(GPU_BUILD_DIR) | 
				
			||||
	$(NVCC) --device-link $^ -o $@
 | 
				
			||||
 | 
				
			||||
$(GPU_BUILD_DIR)/%.cu.o: $(SRC_DIR)/%.cu.cpp | $(GPU_BUILD_DIR) | 
				
			||||
	$(NVCC) $(NVCC_FLAGS) -DCUDA -x cu --device-c -o $@ $< $(INC)
 | 
				
			||||
 | 
				
			||||
$(GPU_BUILD_DIR)/%.o: $(SRC_DIR)/%.cpp | $(GPU_BUILD_DIR) | 
				
			||||
	$(CC) $(CFLAGS) -g -DCUDA -c -o $@ $< $(INC)
 | 
				
			||||
 | 
				
			||||
-include $(CPU_DEPS) | 
				
			||||
-include $(GPU_DEPS) | 
				
			||||
 | 
				
			||||
$(CPU_BUILD_DIR): | 
				
			||||
	mkdir -p $@
 | 
				
			||||
 | 
				
			||||
$(GPU_BUILD_DIR): | 
				
			||||
	mkdir -p $@
 | 
				
			||||
 | 
				
			||||
clean: | 
				
			||||
	rm -Rf $(BUILD_DIR) $(TARGET)CPU $(TARGET)GPU
 | 
				
			||||
@ -0,0 +1,40 @@ | 
				
			||||
========= | 
				
			||||
CudaTools | 
				
			||||
========= | 
				
			||||
This is the documentation for CudaTools, a header-only library and framework | 
				
			||||
for the development of CPU-CUDA compatible applications. Using CudaTools enables | 
				
			||||
the creation of a single unified code that has both CPU and CUDA compilation targets with minimal need to | 
				
			||||
introduce ``#ifdef`` statements when code is essentially identical between the targets. | 
				
			||||
 | 
				
			||||
For information on the library itself and its usage, view `documentation <https://acem.ece.illinois.edu/code/CudaTools>`__. The small code snippets and samples | 
				
			||||
seen in the documentation are in the folder ``samples``. | 
				
			||||
 | 
				
			||||
Dependencies | 
				
			||||
============ | 
				
			||||
- Eigen | 
				
			||||
 | 
				
			||||
In the future, we will make this dependency optional, but still provide support | 
				
			||||
for it. As of now, it is necessary. | 
				
			||||
 | 
				
			||||
Building the Documentation | 
				
			||||
========================== | 
				
			||||
The documentation is built with `Doxygen <https://doxygen.nl/>`__ and `Sphinx <https://www.sphinx-doc.org/en>`__. | 
				
			||||
So, first make sure you have Doxygen installed on your system, and make sure it is added | 
				
			||||
to your system path. Then, you will have to create a Python virtual environment | 
				
			||||
in the repository folder | 
				
			||||
 | 
				
			||||
.. code-block:: bash | 
				
			||||
 | 
				
			||||
    $ python3 -m venv .venv | 
				
			||||
 | 
				
			||||
After installing the required Python packages | 
				
			||||
 | 
				
			||||
.. code-block:: bash | 
				
			||||
 | 
				
			||||
    $ pip install -r requirements | 
				
			||||
 | 
				
			||||
you can now run the script | 
				
			||||
 | 
				
			||||
.. code-block:: bash | 
				
			||||
 | 
				
			||||
    $ ./build_docs | 
				
			||||
@ -0,0 +1,2 @@ | 
				
			||||
doxygen docs/Doxyfile | 
				
			||||
sphinx-build -b html docs/source docs/build/html | 
				
			||||
									
										
											File diff suppressed because it is too large
											Load Diff
										
									
								
							
						@ -0,0 +1,20 @@ | 
				
			||||
# Minimal makefile for Sphinx documentation
 | 
				
			||||
#
 | 
				
			||||
 | 
				
			||||
# You can set these variables from the command line, and also
 | 
				
			||||
# from the environment for the first two.
 | 
				
			||||
SPHINXOPTS    ?=
 | 
				
			||||
SPHINXBUILD   ?= sphinx-build
 | 
				
			||||
SOURCEDIR     = source
 | 
				
			||||
BUILDDIR      = build
 | 
				
			||||
 | 
				
			||||
# Put it first so that "make" without argument is like "make help".
 | 
				
			||||
help: | 
				
			||||
	@$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
 | 
				
			||||
 | 
				
			||||
.PHONY: help Makefile | 
				
			||||
 | 
				
			||||
# Catch-all target: route all unknown targets to Sphinx using the new
 | 
				
			||||
# "make mode" option.  $(O) is meant as a shortcut for $(SPHINXOPTS).
 | 
				
			||||
%: Makefile | 
				
			||||
	@$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
 | 
				
			||||
@ -0,0 +1,35 @@ | 
				
			||||
@ECHO OFF | 
				
			||||
 | 
				
			||||
pushd %~dp0 | 
				
			||||
 | 
				
			||||
REM Command file for Sphinx documentation | 
				
			||||
 | 
				
			||||
if "%SPHINXBUILD%" == "" ( | 
				
			||||
	set SPHINXBUILD=sphinx-build | 
				
			||||
) | 
				
			||||
set SOURCEDIR=source | 
				
			||||
set BUILDDIR=build | 
				
			||||
 | 
				
			||||
if "%1" == "" goto help | 
				
			||||
 | 
				
			||||
%SPHINXBUILD% >NUL 2>NUL | 
				
			||||
if errorlevel 9009 ( | 
				
			||||
	echo. | 
				
			||||
	echo.The 'sphinx-build' command was not found. Make sure you have Sphinx | 
				
			||||
	echo.installed, then set the SPHINXBUILD environment variable to point | 
				
			||||
	echo.to the full path of the 'sphinx-build' executable. Alternatively you | 
				
			||||
	echo.may add the Sphinx directory to PATH. | 
				
			||||
	echo. | 
				
			||||
	echo.If you don't have Sphinx installed, grab it from | 
				
			||||
	echo.http://sphinx-doc.org/ | 
				
			||||
	exit /b 1 | 
				
			||||
) | 
				
			||||
 | 
				
			||||
%SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% | 
				
			||||
goto end | 
				
			||||
 | 
				
			||||
:help | 
				
			||||
%SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% | 
				
			||||
 | 
				
			||||
:end | 
				
			||||
popd | 
				
			||||
@ -0,0 +1,39 @@ | 
				
			||||
sections = { | 
				
			||||
  "mesh_prep": 1, | 
				
			||||
  "matrix_assembly": 2, | 
				
			||||
  "bc_calc": 3, | 
				
			||||
  "timestep": 4, | 
				
			||||
} | 
				
			||||
 | 
				
			||||
 | 
				
			||||
window.MathJax = { | 
				
			||||
  loader: {load: ['[tex]/tagformat', '[tex]/ams']}, | 
				
			||||
  tex: { | 
				
			||||
    packages: {'[+]': ['tagformat', 'ams']}, | 
				
			||||
    macros: { | 
				
			||||
      dd: "{\\, \\mathrm{d}}", | 
				
			||||
      E: "{\\mathbf{E}}", | 
				
			||||
      H: "{\\mathbf{H}}", | 
				
			||||
      J: "{\\mathbf{J}}", | 
				
			||||
      D: "{\\mathbf{D}}", | 
				
			||||
      B: "{\\mathbf{B}}", | 
				
			||||
      M: "{\\mathbf{M}}", | 
				
			||||
      tbE: "{\\tilde{\\E}}", | 
				
			||||
      tbH: "{\\tilde{\\H}}", | 
				
			||||
      tE: "{\\tilde{E}}", | 
				
			||||
      tH: "{\\tilde{H}}", | 
				
			||||
      tphi: "{\\tilde{\\phi}}", | 
				
			||||
      curl: ["{\\nabla \\times {#1}}", 1], | 
				
			||||
      div: ["{\\nabla \\cdot {#1}}", 1], | 
				
			||||
      tens: ["{\\bar{\\bar{{#1}}}}", 1], | 
				
			||||
    }, | 
				
			||||
    tags: 'ams', | 
				
			||||
    tagformat: { | 
				
			||||
      number: (n) => sections[window.location.pathname.split("/").pop().split(".")[0]] + '.' + n, | 
				
			||||
    }, | 
				
			||||
    ams: { | 
				
			||||
      multilineWidth: '100%', | 
				
			||||
      multilineIndent: '50em' | 
				
			||||
    } | 
				
			||||
  }, | 
				
			||||
} | 
				
			||||
@ -0,0 +1,26 @@ | 
				
			||||
======= | 
				
			||||
Array.h | 
				
			||||
======= | 
				
			||||
 | 
				
			||||
The ``Array.h`` header file contains the Array class, and its related classes. For this | 
				
			||||
file only, assume that every function is callable on both host and device unless
				
			||||
explicitly mentioned otherwise. | 
				
			||||
 | 
				
			||||
CudaTools::Shape | 
				
			||||
---------------- | 
				
			||||
.. doxygenclass:: CudaTools::Shape | 
				
			||||
    :members: | 
				
			||||
    :allow-dot-graphs: | 
				
			||||
 | 
				
			||||
CudaTools::ArrayIterator<T> | 
				
			||||
--------------------------- | 
				
			||||
.. doxygenclass:: CudaTools::ArrayIterator | 
				
			||||
    :members: | 
				
			||||
    :allow-dot-graphs: | 
				
			||||
 | 
				
			||||
CudaTools::Array<T> | 
				
			||||
------------------- | 
				
			||||
.. doxygenclass:: CudaTools::Array | 
				
			||||
    :members: | 
				
			||||
    :private-members: | 
				
			||||
    :allow-dot-graphs: | 
				
			||||
@ -0,0 +1,45 @@ | 
				
			||||
====== | 
				
			||||
BLAS.h | 
				
			||||
====== | 
				
			||||
 | 
				
			||||
The ``BLAS.h`` header file contains some BLAS functions, and some related | 
				
			||||
classes for those functions. | 
				
			||||
 | 
				
			||||
BLAS Functions | 
				
			||||
============== | 
				
			||||
Currently, these are the supported BLAS functions. They are inherited mainly | 
				
			||||
from the cuBLAS API, and condensed into unified functions. The plan is to
				
			||||
add them as necessary. | 
				
			||||
 | 
				
			||||
CudaTools::BLAS::GEMV<T> | 
				
			||||
------------------------ | 
				
			||||
.. doxygenfunction:: CudaTools::BLAS::GEMV | 
				
			||||
 | 
				
			||||
CudaTools::BLAS::GEMM<T> | 
				
			||||
------------------------ | 
				
			||||
.. doxygenfunction:: CudaTools::BLAS::GEMM | 
				
			||||
 | 
				
			||||
CudaTools::BLAS::DGMM<T> | 
				
			||||
------------------------ | 
				
			||||
.. doxygenfunction:: CudaTools::BLAS::DGMM | 
				
			||||
 | 
				
			||||
BLAS Classes | 
				
			||||
============ | 
				
			||||
 | 
				
			||||
These classes also inherit functions from the cuBLAS API, but are packaged | 
				
			||||
into classes that are more intuitive and hide external details. | 
				
			||||
 | 
				
			||||
CudaTools::BLAS::Batch<T> | 
				
			||||
------------------------- | 
				
			||||
.. doxygenclass:: CudaTools::BLAS::Batch | 
				
			||||
    :members: | 
				
			||||
 | 
				
			||||
CudaTools::BLAS::PLUArray<T> | 
				
			||||
---------------------------- | 
				
			||||
.. doxygenclass:: CudaTools::BLAS::PLUArray | 
				
			||||
    :members: | 
				
			||||
 | 
				
			||||
CudaTools::BLAS::PLUBatch<T> | 
				
			||||
---------------------------- | 
				
			||||
.. doxygenclass:: CudaTools::BLAS::PLUBatch | 
				
			||||
    :members: | 
				
			||||
@ -0,0 +1,53 @@ | 
				
			||||
# Configuration file for the Sphinx documentation builder. | 
				
			||||
 | 
				
			||||
# -- Project information | 
				
			||||
 | 
				
			||||
project = 'DGEMS' | 
				
			||||
copyright = '2022' | 
				
			||||
author = 'Kenneth Jao, Qi Jian Lim' | 
				
			||||
 | 
				
			||||
release = '0.1' | 
				
			||||
version = '0.1.0' | 
				
			||||
 | 
				
			||||
# -- General configuration | 
				
			||||
 | 
				
			||||
html_static_path = ["_static"] | 
				
			||||
html_js_files = ["js/mathjax-config.js"] | 
				
			||||
 | 
				
			||||
extensions = [ | 
				
			||||
    'sphinx.ext.duration', | 
				
			||||
    'sphinx.ext.doctest', | 
				
			||||
    'sphinx.ext.autodoc', | 
				
			||||
    'sphinx.ext.autosummary', | 
				
			||||
    'sphinx.ext.autosectionlabel', | 
				
			||||
    'sphinx.ext.intersphinx', | 
				
			||||
    'sphinx.ext.mathjax', | 
				
			||||
    'sphinx.ext.graphviz', | 
				
			||||
    'sphinxcontrib.bibtex', | 
				
			||||
    'breathe', | 
				
			||||
] | 
				
			||||
 | 
				
			||||
breathe_projects = {"DGEMS": "../build/xml"} | 
				
			||||
breathe_default_project = "DGEMS" | 
				
			||||
 | 
				
			||||
bibtex_bibfiles = ['refs.bib'] | 
				
			||||
 | 
				
			||||
mathjax_path = "https://cdn.jsdelivr.net/npm/mathjax@3/es5/tex-mml-chtml.js" | 
				
			||||
 | 
				
			||||
intersphinx_mapping = { | 
				
			||||
    'python': ('https://docs.python.org/3/', None), | 
				
			||||
    'sphinx': ('https://www.sphinx-doc.org/en/master/', None), | 
				
			||||
} | 
				
			||||
intersphinx_disabled_domains = ['std'] | 
				
			||||
 | 
				
			||||
templates_path = ['_templates'] | 
				
			||||
 | 
				
			||||
# -- Options for HTML output | 
				
			||||
 | 
				
			||||
html_theme = 'sphinx_rtd_theme' | 
				
			||||
html_theme_options = { | 
				
			||||
    'collapse_navigation': False, | 
				
			||||
} | 
				
			||||
 | 
				
			||||
# -- Options for EPUB output | 
				
			||||
epub_show_urls = 'footnote' | 
				
			||||
@ -0,0 +1,67 @@ | 
				
			||||
====== | 
				
			||||
Core.h | 
				
			||||
====== | 
				
			||||
 | 
				
			||||
The ``Core.h`` header file defines several compiler flags and macros along with | 
				
			||||
a few core classes. | 
				
			||||
 | 
				
			||||
Flags | 
				
			||||
===== | 
				
			||||
 | 
				
			||||
Device Indicators | 
				
			||||
----------------- | 
				
			||||
.. doxygendefine:: CUDACC | 
				
			||||
.. doxygendefine:: DEVICE | 
				
			||||
 | 
				
			||||
Host-Device Automation | 
				
			||||
---------------------- | 
				
			||||
.. doxygendefine:: HD | 
				
			||||
.. doxygendefine:: SHARED | 
				
			||||
 | 
				
			||||
Compilation Options | 
				
			||||
------------------- | 
				
			||||
.. doxygendefine:: CUDATOOLS_ARRAY_MAX_AXES | 
				
			||||
 | 
				
			||||
Macros | 
				
			||||
====== | 
				
			||||
 | 
				
			||||
Kernel | 
				
			||||
------ | 
				
			||||
.. doxygendefine:: DECLARE_KERNEL | 
				
			||||
.. doxygendefine:: DEFINE_KERNEL | 
				
			||||
.. doxygendefine:: KERNEL | 
				
			||||
 | 
				
			||||
Device Helpers | 
				
			||||
-------------- | 
				
			||||
 | 
				
			||||
.. doxygendefine:: BASIC_LOOP | 
				
			||||
 | 
				
			||||
Device Class | 
				
			||||
------------ | 
				
			||||
 | 
				
			||||
.. doxygendefine:: DEVICE_CLASS | 
				
			||||
 | 
				
			||||
 | 
				
			||||
Classes and Structs | 
				
			||||
=================== | 
				
			||||
 | 
				
			||||
CudaTools::StreamID | 
				
			||||
------------------- | 
				
			||||
 | 
				
			||||
.. doxygenstruct:: CudaTools::StreamID | 
				
			||||
 | 
				
			||||
CudaTools::Manager | 
				
			||||
------------------ | 
				
			||||
 | 
				
			||||
.. doxygenclass:: CudaTools::Manager | 
				
			||||
    :members: | 
				
			||||
 | 
				
			||||
CudaTools::Kernel::Settings | 
				
			||||
--------------------------- | 
				
			||||
 | 
				
			||||
.. doxygenstruct:: CudaTools::Kernel::Settings | 
				
			||||
    :members: | 
				
			||||
 | 
				
			||||
CudaTools::Kernel::Basic | 
				
			||||
------------------------ | 
				
			||||
.. doxygenfunction:: CudaTools::Kernel::basic | 
				
			||||
@ -0,0 +1,25 @@ | 
				
			||||
========= | 
				
			||||
CudaTools | 
				
			||||
========= | 
				
			||||
This is the documentation for CudaTools, a header-only library and framework | 
				
			||||
for the development of CPU-CUDA compatible applications. Using CudaTools enables | 
				
			||||
the creation of a single unified code that has both CPU and CUDA compilation targets with minimal need to | 
				
			||||
introduce ``#ifdef`` statements when code is essentially identical between the targets. | 
				
			||||
 | 
				
			||||
To get started, please head over to the :doc:`usage` section. For more detail on the | 
				
			||||
machinery underneath, please refer to the other sections. | 
				
			||||
 | 
				
			||||
.. note:: | 
				
			||||
 | 
				
			||||
   If you would like to contribute, please visit the `git page <https://git.acem.ece.illinois.edu/kjao/CudaTools>`__. | 
				
			||||
 | 
				
			||||
Contents | 
				
			||||
======== | 
				
			||||
 | 
				
			||||
.. toctree:: | 
				
			||||
   :maxdepth: 2 | 
				
			||||
 | 
				
			||||
   usage | 
				
			||||
   core | 
				
			||||
   array | 
				
			||||
   blas | 
				
			||||
@ -0,0 +1,128 @@ | 
				
			||||
================== | 
				
			||||
Usage and Examples | 
				
			||||
================== | 
				
			||||
 | 
				
			||||
 | 
				
			||||
This library is broken up into three main parts, as well as a certain | 
				
			||||
compilation and linking framework: | 
				
			||||
 | 
				
			||||
#. :ref:`Core Examples` | 
				
			||||
#. :ref:`Array Examples` | 
				
			||||
#. :ref:`BLAS Examples` | 
				
			||||
#. :ref:`Compilation and Linking` | 
				
			||||
 | 
				
			||||
The ``Core.h`` header contains the necessary macros, flags and objects for interfacing with | 
				
			||||
basic kernel launching and the CUDA Runtime API. The ``Array.h`` header contains the ``CudaTools::Array`` | 
				
			||||
class which provides a device compatible Array-like class with easy memory management. Lastly, | 
				
			||||
the ``BLAS.h`` header provides BLAS functions through the cuBLAS library for the GPU, | 
				
			||||
and Eigen for the CPU. In addition, a templated Makefile is provided which can be used | 
				
			||||
for your own project, after following a few rules. | 
				
			||||
 | 
				
			||||
The usage of this library will be illustrated through examples, and further details | 
				
			||||
can be found in the other sections. The examples are given in the `samples <https://git.acem.ece.illinois.edu/kjao/CudaTools/src/branch/main/samples>`__ folder. | 
				
			||||
Throughout this documentation, there are a few common terms that may appear. First, we refer to the CPU as the host, and the GPU as the device. So, a host function refers | 
				
			||||
to a function runnable on the CPU, and a device function refers to a function that is runnable | 
				
			||||
on a device. A kernel is a specific function that the host can call to be run on the device. | 
				
			||||
 | 
				
			||||
Core Examples | 
				
			||||
============= | 
				
			||||
This file mainly introduces compiler macros and a few classes that are used to improve the | 
				
			||||
syntax between host and device code. To define and call a kernel, there are a few | 
				
			||||
macros provided. For example, | 
				
			||||
 | 
				
			||||
.. code-block:: cpp | 
				
			||||
 | 
				
			||||
    DEFINE_KERNEL(add, int x, int y) { | 
				
			||||
        printf("Kernel: %i\n", x + y); | 
				
			||||
    } | 
				
			||||
 | 
				
			||||
    int main() { | 
				
			||||
        KERNEL(add, CudaTools::Kernel::basic(1), 1, 1); // Prints 2. | 
				
			||||
        return 0; | 
				
			||||
    } | 
				
			||||
 | 
				
			||||
The ``DEFINE_KERNEL(name, ...)`` macro takes in the function name and its arguments. | 
				
			||||
The second argument in the ``KERNEL()`` macro specifies the launch parameters for the | 
				
			||||
kernel. The launch parameters have several items, but for 'embarrassingly parallel' | 
				
			||||
cases, we can simply generate the settings with the number of threads. More detail with | 
				
			||||
creating launch parameters can be found :ref:`here <CudaTools::Kernel::Settings>`. In the above example, | 
				
			||||
there is only one thread. The rest of the arguments are just the kernel arguments. For more detail, | 
				
			||||
see :ref:`here <Macros>`. | 
				
			||||
 | 
				
			||||
.. warning:: | 
				
			||||
   These kernel definitions must be in a file that will be compiled by ``nvcc``. Also, | 
				
			||||
   for header files, there is an additional macro ``DECLARE_KERNEL(name, ...)`` to declare it | 
				
			||||
   and make it available to other files. | 
				
			||||
 | 
				
			||||
Since many applications use classes, a macro is provided to 'convert' a class into | 
				
			||||
being device-compatible. Following the previous example similarly, | 
				
			||||
 | 
				
			||||
.. code-block:: cpp | 
				
			||||
 | 
				
			||||
    class intPair { | 
				
			||||
        DEVICE_CLASS(intPair) | 
				
			||||
        public: | 
				
			||||
            int x, y; | 
				
			||||
 | 
				
			||||
            intPair(const int x_, const int y_) : x(x_), y(y_) { | 
				
			||||
                allocateDevice(); // Allocates memory for this intPair on the device. | 
				
			||||
                updateDevice().wait(); // Copies the memory on the host to the device and waits until finished. | 
				
			||||
            }; | 
				
			||||
 | 
				
			||||
            HD void swap() { | 
				
			||||
                int swap = x; | 
				
			||||
                x = y; | 
				
			||||
                y = swap; | 
				
			||||
            }; | 
				
			||||
    }; | 
				
			||||
 | 
				
			||||
    DEFINE_KERNEL(swap, intPair* const pair) { pair->swap(); } | 
				
			||||
 | 
				
			||||
    int main() { | 
				
			||||
        intPair pair(1, 2); | 
				
			||||
        printf("Before: %u, %u\n", pair.x, pair.y); // Prints 1, 2. | 
				
			||||
 | 
				
			||||
        KERNEL(swap, CudaTools::Kernel::basic(1), pair.that()).wait(); | 
				
			||||
        pair.updateHost().wait(); // Copies the memory from the device back to the host and waits until finished. | 
				
			||||
 | 
				
			||||
        printf("After: %u, %u\n", pair.x, pair.y); // Prints 2, 1. | 
				
			||||
        return 0; | 
				
			||||
    } | 
				
			||||
 | 
				
			||||
In this example, we create a class called ``intPair``, which is then made available on the device through | 
				
			||||
the ``DEVICE_CLASS(name)`` macro. Specifically, that macro introduces a few functions, like | 
				
			||||
``allocateDevice()``, ``updateDevice()``, ``updateHost()``, and ``that()``. That last function | 
				
			||||
returns a pointer to the copy on the device. For more details, see :ref:`here <Device Class>`. If we were to pass in the host pointer of the ``intPair`` to the kernel, there would be an illegal memory access. | 
				
			||||
 | 
				
			||||
The kernel argument list **must** consist of pointers to objects, or non-reference objects. | 
				
			||||
Otherwise, compilation will fail. In general this is safer, as it forces the programmer to | 
				
			||||
acknowledge that the device copy is being passed. For the latter case of a non-reference object, | 
				
			||||
you should only do this if there is no issue in creating a copy of the original object. In the above | 
				
			||||
example, we could have done this, but for more complicated classes it may result in unwanted behavior. | 
				
			||||
 | 
				
			||||
Lastly, since the point of classes is usually to have some member functions, to have them | 
				
			||||
available on the device, you must mark them with the compiler macro ``HD`` in front. | 
				
			||||
 | 
				
			||||
We also introduce the ``wait()`` function, which waits for the command to complete before | 
				
			||||
continuing. Most calls that involve the device are asynchronous, so without proper blocking, | 
				
			||||
operations dependent on a previous command are not guaranteed to run correctly. If the code is | 
				
			||||
compiled for CPU, then everything will run synchronously, as per usual. | 
				
			||||
 | 
				
			||||
.. note:: | 
				
			||||
   Almost all functions that are asynchronous provide an optional 'stream' argument, | 
				
			||||
   where you can give the name of the stream you wish to use. Different streams run | 
				
			||||
   asynchronously, but operations on the same stream are FIFO. To define a stream to use | 
				
			||||
   later, you must call ``CudaTools::Manager::get()->addStream("myStream")`` at some point | 
				
			||||
   before you use it. For more details, see :ref:`here <CudaTools::Manager>`. | 
				
			||||
 | 
				
			||||
 | 
				
			||||
Array Examples | 
				
			||||
============== | 
				
			||||
 | 
				
			||||
 | 
				
			||||
BLAS Examples | 
				
			||||
============= | 
				
			||||
 | 
				
			||||
 | 
				
			||||
Compilation and Linking | 
				
			||||
======================= | 
				
			||||
@ -0,0 +1,4 @@ | 
				
			||||
Sphinx>=5.1.1 | 
				
			||||
sphinx-rtd-theme>=1.0.0 | 
				
			||||
sphinxcontrib-bibtex>=2.5.0 | 
				
			||||
breathe>=4.34.0 | 
				
			||||
@ -0,0 +1,95 @@ | 
				
			||||
CC := g++-10
 | 
				
			||||
NVCC := nvcc
 | 
				
			||||
CFLAGS := -Wall -std=c++17 -fopenmp -MMD
 | 
				
			||||
NVCC_FLAGS := -MMD -w -Xcompiler
 | 
				
			||||
 | 
				
			||||
INCLUDE := ../../
 | 
				
			||||
LIBS_DIR :=
 | 
				
			||||
LIBS_DIR_GPU := /usr/local/cuda/lib64
 | 
				
			||||
LIBS :=
 | 
				
			||||
LIBS_GPU := cuda cudart cublas
 | 
				
			||||
 | 
				
			||||
TARGET = coreKernel
 | 
				
			||||
SRC_DIR = .
 | 
				
			||||
BUILD_DIR = build
 | 
				
			||||
 | 
				
			||||
# Should not need to modify below.
 | 
				
			||||
 | 
				
			||||
CPU_BUILD_DIR = $(BUILD_DIR)/cpu
 | 
				
			||||
GPU_BUILD_DIR = $(BUILD_DIR)/gpu
 | 
				
			||||
 | 
				
			||||
SRC = $(wildcard $(SRC_DIR)/*/*.cpp) $(wildcard $(SRC_DIR)/*.cpp)
 | 
				
			||||
 | 
				
			||||
# Get source files and object files.
 | 
				
			||||
GCC_SRC = $(filter-out %.cu.cpp ,$(SRC))
 | 
				
			||||
NVCC_SRC = $(filter %.cu.cpp, $(SRC))
 | 
				
			||||
GCC_OBJ = $(GCC_SRC:$(SRC_DIR)/%.cpp=%.o)
 | 
				
			||||
NVCC_OBJ = $(NVCC_SRC:$(SRC_DIR)/%.cpp=%.o)
 | 
				
			||||
 | 
				
			||||
# If compiling for CPU, all go to GCC. Otherwise, they are split.
 | 
				
			||||
CPU_OBJ = $(addprefix $(CPU_BUILD_DIR)/,$(GCC_OBJ)) $(addprefix $(CPU_BUILD_DIR)/,$(NVCC_OBJ))
 | 
				
			||||
GPU_GCC_OBJ = $(addprefix $(GPU_BUILD_DIR)/,$(GCC_OBJ))
 | 
				
			||||
GPU_NVCC_OBJ = $(addprefix $(GPU_BUILD_DIR)/,$(NVCC_OBJ))
 | 
				
			||||
 | 
				
			||||
# $(info $$GCC_SRC is [${GCC_SRC}])
 | 
				
			||||
# $(info $$NVCC_SRC is [${NVCC_SRC}])
 | 
				
			||||
# $(info $$GCC_OBJ is [${GCC_OBJ}])
 | 
				
			||||
# $(info $$NVCC_OBJ is [${NVCC_OBJ}])
 | 
				
			||||
 | 
				
			||||
# $(info $$CPU_OBJ is [${CPU_OBJ}])
 | 
				
			||||
# $(info $$GPU_GCC_OBJ is [${GPU_GCC_OBJ}])
 | 
				
			||||
# $(info $$GPU_NVCC_OBJ is [${GPU_NVCC_OBJ}])
 | 
				
			||||
 | 
				
			||||
HEADER = $(wildcard $(SRC_DIR)/*/*.h) $(wildcard $(SRC_DIR)/*.h)
 | 
				
			||||
CPU_DEPS = $(wildcard $(CPU_BUILD_DIR)/*.d)
 | 
				
			||||
GPU_DEPS = $(wildcard $(GPU_BUILD_DIR)/*.d)
 | 
				
			||||
 | 
				
			||||
INC := $(INCLUDE:%=-I%)
 | 
				
			||||
LIB := $(LIBS_DIR:%=-L%)
 | 
				
			||||
LIB_GPU := $(LIBS_DIR_GPU:%=-L%)
 | 
				
			||||
LD := $(LIBS:%=-l%)
 | 
				
			||||
LD_GPU := $(LIBS_GPU:%=-l%)
 | 
				
			||||
 | 
				
			||||
# Reminder:
 | 
				
			||||
# $< = first prerequisite
 | 
				
			||||
# $@ = the target which matched the rule
 | 
				
			||||
# $^ = all prerequisites
 | 
				
			||||
 | 
				
			||||
# cpu and gpu are alias targets that never produce a file of that name, so they
# are declared phony alongside all and clean.
.PHONY: all cpu gpu clean
				
			||||
 | 
				
			||||
all : cpu gpu | 
				
			||||
 | 
				
			||||
cpu: $(TARGET)CPU | 
				
			||||
gpu: $(TARGET)GPU | 
				
			||||
 | 
				
			||||
# Link the CPU executable. $(LD) holds the -l flags generated from LIBS, so
# libraries listed there are linked for the CPU target as they are for the GPU
# target; $(LDFLAGS) is still passed through for externally supplied flags.
$(TARGET)CPU: $(CPU_OBJ)
	$(CC) $(CFLAGS) $^ -o $@ $(INC) $(LIB) $(LD) $(LDFLAGS)
 | 
				
			||||
 | 
				
			||||
$(CPU_BUILD_DIR)/%.o $(CPU_BUILD_DIR)/%.cu.o: $(SRC_DIR)/%.cpp | $(CPU_BUILD_DIR) | 
				
			||||
	$(CC) $(CFLAGS) -c -o $@ $< $(INC)
 | 
				
			||||
 | 
				
			||||
# For GPU, we need to build the NVCC objects, the NVCC linked object, and the
 | 
				
			||||
# regular ones. Then, we link them all together.
 | 
				
			||||
$(TARGET)GPU: $(GPU_BUILD_DIR)/link.o $(GPU_GCC_OBJ) | $(GPU_BUILD_DIR) | 
				
			||||
	$(CC) -g -DCUDA $(CFLAGS) $(GPU_NVCC_OBJ) $^ -o $@ $(INC) $(LIB) $(LIB_GPU) $(LD) $(LD_GPU)
 | 
				
			||||
 | 
				
			||||
$(GPU_BUILD_DIR)/link.o: $(GPU_NVCC_OBJ) | $(GPU_BUILD_DIR) | 
				
			||||
	$(NVCC) --device-link $^ -o $@
 | 
				
			||||
 | 
				
			||||
$(GPU_BUILD_DIR)/%.cu.o: $(SRC_DIR)/%.cu.cpp | $(GPU_BUILD_DIR) | 
				
			||||
	$(NVCC) $(NVCC_FLAGS) -DCUDA -x cu --device-c -o $@ $< $(INC)
 | 
				
			||||
 | 
				
			||||
$(GPU_BUILD_DIR)/%.o: $(SRC_DIR)/%.cpp | $(GPU_BUILD_DIR) | 
				
			||||
	$(CC) $(CFLAGS) -g -DCUDA -c -o $@ $< $(INC)
 | 
				
			||||
 | 
				
			||||
-include $(CPU_DEPS) | 
				
			||||
-include $(GPU_DEPS) | 
				
			||||
 | 
				
			||||
$(CPU_BUILD_DIR): | 
				
			||||
	mkdir -p $@
 | 
				
			||||
 | 
				
			||||
$(GPU_BUILD_DIR): | 
				
			||||
	mkdir -p $@
 | 
				
			||||
 | 
				
			||||
clean: | 
				
			||||
	rm -Rf $(BUILD_DIR) $(TARGET)CPU $(TARGET)GPU
 | 
				
			||||
@ -0,0 +1,12 @@ | 
				
			||||
#define CUDATOOLS_IMPLEMENTATION | 
				
			||||
#include <Core.h> | 
				
			||||
 | 
				
			||||
// Kernel that prints the sum of its two integer arguments.
DEFINE_KERNEL(add, int x, int y) {
    const int sum = x + y;
    printf("Kernel: %i\n", sum);
}
				
			||||
 | 
				
			||||
// Launches the `add` kernel with the settings returned by
// CudaTools::Kernel::basic(1) and blocks until the launch has completed.
int main() {
    // wait() keeps main() from returning while the launch may still be in
    // flight, matching how the other sample synchronises its kernel call.
    KERNEL(add, CudaTools::Kernel::basic(1), 1, 1).wait(); // Prints 2.
    return 0;
}
				
			||||
 | 
				
			||||
@ -0,0 +1,95 @@ | 
				
			||||
CC := g++-10
 | 
				
			||||
NVCC := nvcc
 | 
				
			||||
CFLAGS := -Wall -std=c++17 -fopenmp -MMD
 | 
				
			||||
NVCC_FLAGS := -MMD -w -Xcompiler
 | 
				
			||||
 | 
				
			||||
INCLUDE := ../../
 | 
				
			||||
LIBS_DIR :=
 | 
				
			||||
LIBS_DIR_GPU := /usr/local/cuda/lib64
 | 
				
			||||
LIBS :=
 | 
				
			||||
LIBS_GPU := cuda cudart cublas
 | 
				
			||||
 | 
				
			||||
TARGET = coreClass
 | 
				
			||||
SRC_DIR = .
 | 
				
			||||
BUILD_DIR = build
 | 
				
			||||
 | 
				
			||||
# Should not need to modify below.
 | 
				
			||||
 | 
				
			||||
CPU_BUILD_DIR = $(BUILD_DIR)/cpu
 | 
				
			||||
GPU_BUILD_DIR = $(BUILD_DIR)/gpu
 | 
				
			||||
 | 
				
			||||
SRC = $(wildcard $(SRC_DIR)/*/*.cpp) $(wildcard $(SRC_DIR)/*.cpp)
 | 
				
			||||
 | 
				
			||||
# Get source files and object files.
 | 
				
			||||
GCC_SRC = $(filter-out %.cu.cpp ,$(SRC))
 | 
				
			||||
NVCC_SRC = $(filter %.cu.cpp, $(SRC))
 | 
				
			||||
GCC_OBJ = $(GCC_SRC:$(SRC_DIR)/%.cpp=%.o)
 | 
				
			||||
NVCC_OBJ = $(NVCC_SRC:$(SRC_DIR)/%.cpp=%.o)
 | 
				
			||||
 | 
				
			||||
# If compiling for CPU, all go to GCC. Otherwise, they are split.
 | 
				
			||||
CPU_OBJ = $(addprefix $(CPU_BUILD_DIR)/,$(GCC_OBJ)) $(addprefix $(CPU_BUILD_DIR)/,$(NVCC_OBJ))
 | 
				
			||||
GPU_GCC_OBJ = $(addprefix $(GPU_BUILD_DIR)/,$(GCC_OBJ))
 | 
				
			||||
GPU_NVCC_OBJ = $(addprefix $(GPU_BUILD_DIR)/,$(NVCC_OBJ))
 | 
				
			||||
 | 
				
			||||
# $(info $$GCC_SRC is [${GCC_SRC}])
 | 
				
			||||
# $(info $$NVCC_SRC is [${NVCC_SRC}])
 | 
				
			||||
# $(info $$GCC_OBJ is [${GCC_OBJ}])
 | 
				
			||||
# $(info $$NVCC_OBJ is [${NVCC_OBJ}])
 | 
				
			||||
 | 
				
			||||
# $(info $$CPU_OBJ is [${CPU_OBJ}])
 | 
				
			||||
# $(info $$GPU_GCC_OBJ is [${GPU_GCC_OBJ}])
 | 
				
			||||
# $(info $$GPU_NVCC_OBJ is [${GPU_NVCC_OBJ}])
 | 
				
			||||
 | 
				
			||||
HEADER = $(wildcard $(SRC_DIR)/*/*.h) $(wildcard $(SRC_DIR)/*.h)
 | 
				
			||||
CPU_DEPS = $(wildcard $(CPU_BUILD_DIR)/*.d)
 | 
				
			||||
GPU_DEPS = $(wildcard $(GPU_BUILD_DIR)/*.d)
 | 
				
			||||
 | 
				
			||||
INC := $(INCLUDE:%=-I%)
 | 
				
			||||
LIB := $(LIBS_DIR:%=-L%)
 | 
				
			||||
LIB_GPU := $(LIBS_DIR_GPU:%=-L%)
 | 
				
			||||
LD := $(LIBS:%=-l%)
 | 
				
			||||
LD_GPU := $(LIBS_GPU:%=-l%)
 | 
				
			||||
 | 
				
			||||
# Reminder:
 | 
				
			||||
# $< = first prerequisite
 | 
				
			||||
# $@ = the target which matched the rule
 | 
				
			||||
# $^ = all prerequisites
 | 
				
			||||
 | 
				
			||||
# cpu and gpu are alias targets that never produce a file of that name, so they
# are declared phony alongside all and clean.
.PHONY: all cpu gpu clean
				
			||||
 | 
				
			||||
all : cpu gpu | 
				
			||||
 | 
				
			||||
cpu: $(TARGET)CPU | 
				
			||||
gpu: $(TARGET)GPU | 
				
			||||
 | 
				
			||||
# Link the CPU executable. $(LD) holds the -l flags generated from LIBS, so
# libraries listed there are linked for the CPU target as they are for the GPU
# target; $(LDFLAGS) is still passed through for externally supplied flags.
$(TARGET)CPU: $(CPU_OBJ)
	$(CC) $(CFLAGS) $^ -o $@ $(INC) $(LIB) $(LD) $(LDFLAGS)
 | 
				
			||||
 | 
				
			||||
$(CPU_BUILD_DIR)/%.o $(CPU_BUILD_DIR)/%.cu.o: $(SRC_DIR)/%.cpp | $(CPU_BUILD_DIR) | 
				
			||||
	$(CC) $(CFLAGS) -c -o $@ $< $(INC)
 | 
				
			||||
 | 
				
			||||
# For GPU, we need to build the NVCC objects, the NVCC linked object, and the
 | 
				
			||||
# regular ones. Then, we link them all together.
 | 
				
			||||
$(TARGET)GPU: $(GPU_BUILD_DIR)/link.o $(GPU_GCC_OBJ) | $(GPU_BUILD_DIR) | 
				
			||||
	$(CC) -g -DCUDA $(CFLAGS) $(GPU_NVCC_OBJ) $^ -o $@ $(INC) $(LIB) $(LIB_GPU) $(LD) $(LD_GPU)
 | 
				
			||||
 | 
				
			||||
$(GPU_BUILD_DIR)/link.o: $(GPU_NVCC_OBJ) | $(GPU_BUILD_DIR) | 
				
			||||
	$(NVCC) --device-link $^ -o $@
 | 
				
			||||
 | 
				
			||||
$(GPU_BUILD_DIR)/%.cu.o: $(SRC_DIR)/%.cu.cpp | $(GPU_BUILD_DIR) | 
				
			||||
	$(NVCC) $(NVCC_FLAGS) -DCUDA -x cu --device-c -o $@ $< $(INC)
 | 
				
			||||
 | 
				
			||||
$(GPU_BUILD_DIR)/%.o: $(SRC_DIR)/%.cpp | $(GPU_BUILD_DIR) | 
				
			||||
	$(CC) $(CFLAGS) -g -DCUDA -c -o $@ $< $(INC)
 | 
				
			||||
 | 
				
			||||
-include $(CPU_DEPS) | 
				
			||||
-include $(GPU_DEPS) | 
				
			||||
 | 
				
			||||
$(CPU_BUILD_DIR): | 
				
			||||
	mkdir -p $@
 | 
				
			||||
 | 
				
			||||
$(GPU_BUILD_DIR): | 
				
			||||
	mkdir -p $@
 | 
				
			||||
 | 
				
			||||
clean: | 
				
			||||
	rm -Rf $(BUILD_DIR) $(TARGET)CPU $(TARGET)GPU
 | 
				
			||||
@ -0,0 +1,34 @@ | 
				
			||||
#define CUDATOOLS_IMPLEMENTATION | 
				
			||||
#include <Core.h> | 
				
			||||
 | 
				
			||||
class intPair { | 
				
			||||
    DEVICE_CLASS(intPair) | 
				
			||||
    public: | 
				
			||||
        int x, y; | 
				
			||||
 | 
				
			||||
        intPair(const int x_, const int y_) : x(x_), y(y_) { | 
				
			||||
            allocateDevice(); // Allocates memory for this intPair on the device.
 | 
				
			||||
            updateDevice().wait(); // Copies the memory on the host to the device and waits until finished.
 | 
				
			||||
        }; | 
				
			||||
 | 
				
			||||
        HD void swap() { | 
				
			||||
            int swap = x; | 
				
			||||
            x = y; | 
				
			||||
            y = swap; | 
				
			||||
        }; | 
				
			||||
}; | 
				
			||||
 | 
				
			||||
// Kernel that swaps the members of the intPair that `pair` points to.
DEFINE_KERNEL(swap, intPair* const pair) {
    pair->swap();
}
				
			||||
 | 
				
			||||
// Swaps an intPair through a kernel and prints its members before and after.
int main() {
    intPair pair(1, 2);
    // x and y are ints, so they are printed with %i rather than %u.
    printf("Before: %i, %i\n", pair.x, pair.y); // Prints 1, 2.

    KERNEL(swap, CudaTools::Kernel::basic(1), pair.that()).wait();
    pair.updateHost().wait(); // Copies the memory from the device back to the host and waits until finished.

    printf("After: %i, %i\n", pair.x, pair.y); // Prints 2, 1.
    return 0;
}
				
			||||
 | 
				
			||||
 | 
				
			||||
@ -0,0 +1,494 @@ | 
				
			||||
#define CUDATOOLS_IMPLEMENTATION | 
				
			||||
#define CUDATOOLS_ARRAY_MAX_AXES 8 | 
				
			||||
#include "Array.h" | 
				
			||||
#include "BLAS.h" | 
				
			||||
#include "Core.h" | 
				
			||||
 | 
				
			||||
#include <Eigen/Core> | 
				
			||||
#include <chrono> | 
				
			||||
#include <complex> | 
				
			||||
 | 
				
			||||
namespace CT = CudaTools; | 
				
			||||
 | 
				
			||||
/////////////
 | 
				
			||||
// Helpers //
 | 
				
			||||
/////////////
 | 
				
			||||
 | 
				
			||||
// Records the starting time point of the timer called `name`.
#define TIME_START(name) auto begin_##name = std::chrono::steady_clock::now()

// Stops the timer called `name` and prints the elapsed time: in microseconds
// while less than one whole millisecond has elapsed, in milliseconds otherwise.
// Declares end_##name, time_ms_##name and time_mus_##name in the enclosing
// scope. The counts are cast to long long for printing because the integer
// type returned by count() is not the same on every platform.
#define TIME_END(name)                                                                             \
    auto end_##name = std::chrono::steady_clock::now();                                            \
    auto time_ms_##name =                                                                          \
        std::chrono::duration_cast<std::chrono::milliseconds>(end_##name - begin_##name).count();  \
    auto time_mus_##name =                                                                         \
        std::chrono::duration_cast<std::chrono::microseconds>(end_##name - begin_##name).count();  \
    if (time_ms_##name == 0) {                                                                     \
        printf("[%s] Time Elapsed: %lld[µs]\n", #name, static_cast<long long>(time_mus_##name));   \
    } else {                                                                                       \
        printf("[%s] Time Elapsed: %lld[ms]\n", #name, static_cast<long long>(time_ms_##name));    \
    }

// Executes `call` between TIME_START(name) and TIME_END(name).
#define TIME(call, name)                                                                           \
    TIME_START(name);                                                                              \
    call;                                                                                          \
    TIME_END(name);
				
			||||
 | 
				
			||||
// Records and prints the outcome of one check. `predicate` is evaluated exactly
// once; when it is false the `failed` counter of the enclosing scope is
// incremented. Prints a coloured PASS/FAIL tag followed by `name` and `msg`.
// The expansion is a single braced statement, so its temporary cannot clash
// with names at the call site.
#define TEST(predicate, name, msg)                                                                 \
    {                                                                                              \
        const bool test_passed_ = static_cast<bool>(predicate);                                    \
        failed += test_passed_ ? 0 : 1;                                                            \
        printf("[%s] ", test_passed_ ? "\033[1;32mPASS\033[0m" : "\033[1;31mFAIL\033[0m");         \
        printf("%s | %s.\n", name, msg);                                                           \
    }
				
			||||
 | 
				
			||||
// Maps a type to a printable name. Only the specializations generated by
// REGISTER_PARSE_TYPE are defined.
template <typename T> struct Type;

// Specializes Type<X> so that Type<X>::name holds the spelling of X. The use
// site supplies the terminating semicolon.
#define REGISTER_PARSE_TYPE(X)                                                                     \
    template <> struct Type<X> {                                                                   \
        static const std::string name;                                                             \
    };                                                                                             \
    const std::string Type<X>::name(#X)

REGISTER_PARSE_TYPE(uint8_t);
REGISTER_PARSE_TYPE(int16_t);
REGISTER_PARSE_TYPE(int32_t);
REGISTER_PARSE_TYPE(float);
REGISTER_PARSE_TYPE(double);
				
			||||
 | 
				
			||||
// Frames `str` as "## str ##" between two rows of '#' that are str.size() + 6
// characters wide.
std::string box(std::string str) {
    const std::string border(str.size() + 6, '#');
    std::string framed = border;
    framed += "\n## ";
    framed += str;
    framed += " ##\n";
    framed += border;
    return framed;
}
				
			||||
 | 
				
			||||
// Counts the characters of `str` outside ANSI escape sequences of the form
// ESC '[' ... <final byte in '@'..'~'>, i.e. the ones that take up a column.
static std::size_t box2VisibleWidth(const std::string& str) {
    std::size_t width = 0;
    for (std::size_t i = 0; i < str.size(); ++i) {
        if (str[i] == '\033' && i + 1 < str.size() && str[i + 1] == '[') {
            i += 2; // Skip ESC and '['.
            while (i < str.size() && (str[i] < '@' || str[i] > '~')) ++i;
            continue; // The loop increment steps over the final byte.
        }
        ++width;
    }
    return width;
}

// Frames `str` as "|| str ||" between two rows of '-'.
//
// The rows are as wide as the visible part of the middle line. The previous
// width, str.size() - 5, equals that only for a string carrying 11 bytes of
// escape codes (presumably one produced by type<T>() - confirm against the
// callers) and wrapped around for strings shorter than five characters.
std::string box2(std::string str) {
    const std::string tops(box2VisibleWidth(str) + 6, '-');
    return tops + "\n|| " + str + " ||\n" + tops;
}
				
			||||
 | 
				
			||||
// Wraps `str` as "[ str ]" with six '-' characters on each side.
std::string boxSmall(std::string str) {
    const std::string dashes(6, '-');
    std::string label = dashes;
    label += "[ ";
    label += str;
    label += " ]";
    label += dashes;
    return label;
}
				
			||||
 | 
				
			||||
// Returns a row of forty '=' characters with a newline before and after it.
std::string separator() {
    std::string rule = "\n";
    rule.append(40, '=');
    rule += "\n";
    return rule;
}
				
			||||
 | 
				
			||||
// Returns Type<T>::name placed between the escape sequences "\033[1;96m" and
// "\033[0m".
template <typename T> std::string type() {
    std::string coloured = "\033[1;96m";
    coloured += Type<T>::name;
    coloured += "\033[0m";
    return coloured;
}
				
			||||
 | 
				
			||||
// Builds a CT::Shape from two values, each drawn uniformly from [1, 15] with a
// generator seeded from std::random_device.
CT::Shape makeRandom2DShape() {
    std::random_device seedSource;
    std::mt19937 engine(seedSource());
    std::uniform_int_distribution<uint32_t> extent(1, 15);
    const uint32_t first = extent(engine);
    const uint32_t second = extent(engine);
    return CT::Shape({first, second});
}
				
			||||
 | 
				
			||||
///////////
 | 
				
			||||
// Tests //
 | 
				
			||||
///////////
 | 
				
			||||
 | 
				
			||||
class TestClass { | 
				
			||||
    DEVICE_CLASS(TestClass); | 
				
			||||
 | 
				
			||||
  public: | 
				
			||||
    int x; | 
				
			||||
    TestClass(const int x) : x(x) { | 
				
			||||
        allocateDevice(); | 
				
			||||
        updateDevice().wait(); | 
				
			||||
    }; | 
				
			||||
}; | 
				
			||||
 | 
				
			||||
// Kernel that doubles arr[iThread] for each iThread that BASIC_LOOP yields for
// the bound arr.shape().length().
DEFINE_KERNEL(times, const CT::Array<int> arr) {
    BASIC_LOOP(arr.shape().length()) {
        arr[iThread] *= 2;
    }
}
				
			||||
 | 
				
			||||
// Kernel that overwrites the x member of the object `test` points to with 100.
DEFINE_KERNEL(classTest, TestClass* const test) {
    test->x = 100;
}
				
			||||
 | 
				
			||||
struct MacroTests { | 
				
			||||
    static uint32_t Kernel() { | 
				
			||||
        uint32_t failed = 0; | 
				
			||||
        CT::Array<int> A = CT::Array<int>::constant({10}, 1); | 
				
			||||
        A.updateDevice().wait(); | 
				
			||||
        KERNEL(times, CT::Kernel::basic(A.shape().items()), A.view()).wait(); | 
				
			||||
        A.updateHost().wait(); | 
				
			||||
 | 
				
			||||
        uint32_t errors = 0; | 
				
			||||
        for (auto it = A.begin(); it != A.end(); ++it) { | 
				
			||||
            if (*it != 2) ++errors; | 
				
			||||
        } | 
				
			||||
 | 
				
			||||
        std::ostringstream msg; | 
				
			||||
        msg << "Errors: " << errors; | 
				
			||||
        TEST(errors == 0, "Kernel", msg.str().c_str()); | 
				
			||||
        return failed; | 
				
			||||
    }; | 
				
			||||
 | 
				
			||||
    static uint32_t Class() { | 
				
			||||
        uint32_t failed = 0; | 
				
			||||
        TestClass test(1); | 
				
			||||
        KERNEL(classTest, CT::Kernel::basic(1), test.that()).wait(); | 
				
			||||
        test.updateHost().wait(); | 
				
			||||
 | 
				
			||||
        TEST(test.x == 100, "Class", "Errors: 0"); | 
				
			||||
        return failed; | 
				
			||||
    } | 
				
			||||
}; | 
				
			||||
 | 
				
			||||
template <typename T> struct ArrayTests { | 
				
			||||
    static uint32_t Indexing() { | 
				
			||||
        uint32_t failed = 0; | 
				
			||||
        CT::Array<T> A = CT::Array<T>::range(0, 240); | 
				
			||||
        A.reshape({5, 3, 1, 4, 2, 1, 1, 2}); | 
				
			||||
 | 
				
			||||
        uint32_t errors = 0; | 
				
			||||
        for (uint32_t i = 0; i < 5; ++i) { | 
				
			||||
            for (uint32_t j = 0; j < 3; ++j) { | 
				
			||||
                for (uint32_t k = 0; k < 4; ++k) { | 
				
			||||
                    for (uint32_t l = 0; l < 2; ++l) { | 
				
			||||
                        for (uint32_t m = 0; m < 2; ++m) { | 
				
			||||
                            if ((T)A[i][j][0][k][l][0][0][m] != (T)A[{i, j, 0, k, l, 0, 0, m}]) { | 
				
			||||
                                ++errors; | 
				
			||||
                            } | 
				
			||||
                        } | 
				
			||||
                    } | 
				
			||||
                } | 
				
			||||
            } | 
				
			||||
        } | 
				
			||||
 | 
				
			||||
        std::ostringstream msg; | 
				
			||||
        msg << "Errors: " << errors; | 
				
			||||
        TEST(errors == 0, "Element", msg.str().c_str()); | 
				
			||||
 | 
				
			||||
        errors = 0; | 
				
			||||
        CT::Array<T> ApartGroup_1 = A[{2, 2}]; | 
				
			||||
        CT::Array<T> ApartIndiv_1 = A[2][2]; | 
				
			||||
        for (uint32_t k = 0; k < 4; ++k) { | 
				
			||||
            for (uint32_t l = 0; l < 2; ++l) { | 
				
			||||
                for (uint32_t m = 0; m < 2; ++m) { | 
				
			||||
                    if ((T)ApartIndiv_1[0][k][l][0][0][m] != (T)ApartGroup_1[{0, k, l, 0, 0, m}]) { | 
				
			||||
                        ++errors; | 
				
			||||
                    } | 
				
			||||
                } | 
				
			||||
            } | 
				
			||||
        } | 
				
			||||
 | 
				
			||||
        msg.str(""); | 
				
			||||
        msg << "Errors: " << errors; | 
				
			||||
        TEST(errors == 0, "Axis (1/2)", msg.str().c_str()); | 
				
			||||
 | 
				
			||||
        errors = 0; | 
				
			||||
        CT::Array<T> ApartGroup_2 = A[{3, 2, 0, 3}]; | 
				
			||||
        CT::Array<T> ApartIndiv_2 = A[3][2][0][3]; | 
				
			||||
 | 
				
			||||
        for (uint32_t l = 0; l < 2; ++l) { | 
				
			||||
            for (uint32_t m = 0; m < 2; ++m) { | 
				
			||||
                if ((T)ApartIndiv_2[l][0][0][m] != (T)ApartGroup_2[{l, 0, 0, m}]) { | 
				
			||||
                    ++errors; | 
				
			||||
                } | 
				
			||||
            } | 
				
			||||
        } | 
				
			||||
 | 
				
			||||
        msg.str(""); | 
				
			||||
        msg << "Errors: " << errors; | 
				
			||||
        TEST(errors == 0, "Axis (2/2)", msg.str().c_str()); | 
				
			||||
        return failed; | 
				
			||||
    }; | 
				
			||||
 | 
				
			||||
    static uint32_t Slicing() { | 
				
			||||
        uint32_t failed = 0; | 
				
			||||
        CT::Array<T> A = CT::Array<T>::constant({4, 5, 5}, 0); | 
				
			||||
 | 
				
			||||
        CT::Array<T> Aslice = A.slice({{0, 3}, {1, 4}, {1, 4}}); | 
				
			||||
        T num = (T)1; | 
				
			||||
        for (auto it = Aslice.begin(); it != Aslice.end(); ++it) { | 
				
			||||
            *it = num; | 
				
			||||
            ++num; | 
				
			||||
        } | 
				
			||||
 | 
				
			||||
        CT::Array<T> Aslice2 = A[3].slice({{0, 5}, {0, 1}}); | 
				
			||||
        num = (T)-1; | 
				
			||||
        for (auto it = Aslice2.begin(); it != Aslice2.end(); ++it) { | 
				
			||||
            *it = num; | 
				
			||||
            --num; | 
				
			||||
        } | 
				
			||||
 | 
				
			||||
        uint32_t errors = 0; | 
				
			||||
        for (int i = 0; i < 3; ++i) { | 
				
			||||
            for (int j = 0; j < 3; ++j) { | 
				
			||||
                for (int k = 0; k < 3; ++k) { | 
				
			||||
                    if ((T)A[i][1 + j][1 + k] != (T)(9 * i + 3 * j + k + 1)) { | 
				
			||||
                        ++errors; | 
				
			||||
                    } | 
				
			||||
                } | 
				
			||||
            } | 
				
			||||
        } | 
				
			||||
        std::ostringstream msg; | 
				
			||||
        msg << "Errors: " << errors; | 
				
			||||
        TEST(errors == 0, "Block", msg.str().c_str()); | 
				
			||||
 | 
				
			||||
        errors = 0; | 
				
			||||
        for (int i = 0; i < 5; ++i) { | 
				
			||||
            if ((T)A[3][i][0] != (T)(-(i + 1))) { | 
				
			||||
                ++errors; | 
				
			||||
            } | 
				
			||||
        } | 
				
			||||
 | 
				
			||||
        msg.str(""); | 
				
			||||
        msg << "Errors: " << errors; | 
				
			||||
        TEST(errors == 0, "Column", msg.str().c_str()); | 
				
			||||
        return failed; | 
				
			||||
    } | 
				
			||||
}; | 
				
			||||
 | 
				
			||||
template <typename T> struct BLASTests { | 
				
			||||
    static double thres; | 
				
			||||
    static uint32_t GEMV(int attempts) { | 
				
			||||
        uint32_t failed = 0; | 
				
			||||
        for (int i = 0; i < attempts; i++) { | 
				
			||||
            CT::Shape Ashape = makeRandom2DShape(); | 
				
			||||
            CT::Shape xshape = CT::Shape({Ashape.cols(), 1}); | 
				
			||||
            CT::Shape yshape = CT::Shape({Ashape.rows(), 1}); | 
				
			||||
 | 
				
			||||
            CT::Array<T> A(Ashape); | 
				
			||||
            CT::Array<T> x(xshape); | 
				
			||||
            CT::Array<T> y(yshape); | 
				
			||||
 | 
				
			||||
            A.setRandom(-100, 100); | 
				
			||||
            x.setRandom(-100, 100); | 
				
			||||
 | 
				
			||||
            A.updateDevice(); | 
				
			||||
            x.updateDevice().wait(); | 
				
			||||
 | 
				
			||||
            CT::BLAS::GEMV<T>(1.0, A, x, 0.0, y).wait(); | 
				
			||||
            y.updateHost().wait(); | 
				
			||||
 | 
				
			||||
            CT::Array<T> yTest(yshape, true); | 
				
			||||
            yTest.eigenMap() = A.eigenMap() * x.eigenMap(); | 
				
			||||
 | 
				
			||||
            double norm = (y.eigenMap() - y.eigenMap()).norm(); | 
				
			||||
 | 
				
			||||
            std::ostringstream name; | 
				
			||||
            name << "GEMV (" << i + 1 << "/" << attempts << ")"; | 
				
			||||
            std::ostringstream msg; | 
				
			||||
            msg << "Matrix Shape: " << Ashape << ", " | 
				
			||||
                << "Residual: " << norm; | 
				
			||||
            TEST(norm < thres, name.str().c_str(), msg.str().c_str()); | 
				
			||||
        } | 
				
			||||
        return failed; | 
				
			||||
    }; | 
				
			||||
 | 
				
			||||
    static uint32_t GEMVBroadcast() { | 
				
			||||
        uint32_t failed = 0; | 
				
			||||
        CT::Shape Ashape = makeRandom2DShape(); | 
				
			||||
        CT::Shape xshape = CT::Shape({Ashape.cols(), 1}); | 
				
			||||
        CT::Shape yshape = CT::Shape({Ashape.rows(), 1}); | 
				
			||||
 | 
				
			||||
        CT::Array<T> A({2, 3, Ashape.rows(), Ashape.cols()}); | 
				
			||||
        CT::Array<T> x({2, 3, xshape.rows(), xshape.cols()}); | 
				
			||||
        CT::Array<T> y({2, 3, yshape.rows(), yshape.cols()}); | 
				
			||||
 | 
				
			||||
        A.setRandom(-100, 100); | 
				
			||||
        x.setRandom(-100, 100); | 
				
			||||
 | 
				
			||||
        A.updateDevice(); | 
				
			||||
        x.updateDevice().wait(); | 
				
			||||
 | 
				
			||||
        CT::BLAS::GEMV<T>(1.0, A, x, 0.0, y).wait(); | 
				
			||||
        y.updateHost().wait(); | 
				
			||||
 | 
				
			||||
        double norm = 0; | 
				
			||||
        CT::Array<T> yTest(yshape, true); | 
				
			||||
        for (int i = 0; i < 2; ++i) { | 
				
			||||
            for (int j = 0; j < 3; ++j) { | 
				
			||||
                yTest.eigenMap() = A[i][j].eigenMap() * x[i][j].eigenMap(); | 
				
			||||
                norm += (yTest.eigenMap() - y[i][j].eigenMap()).norm(); | 
				
			||||
            } | 
				
			||||
        } | 
				
			||||
 | 
				
			||||
        std::ostringstream msg; | 
				
			||||
        msg << "Matrix Shape: " << Ashape << ", " | 
				
			||||
            << "Residual: " << norm; | 
				
			||||
        TEST(norm < thres, "GEMV Broadcast", msg.str().c_str()); | 
				
			||||
        return failed; | 
				
			||||
    }; | 
				
			||||
 | 
				
			||||
    static uint32_t GEMM(int attempts) { | 
				
			||||
        uint32_t failed = 0; | 
				
			||||
        for (int i = 0; i < attempts; i++) { | 
				
			||||
            CT::Shape Ashape = makeRandom2DShape(); | 
				
			||||
            CT::Shape Bshape = makeRandom2DShape(); | 
				
			||||
            Bshape = CT::Shape({Ashape.cols(), Bshape.cols()}); | 
				
			||||
 | 
				
			||||
            CT::Shape Cshape = CT::Shape({Ashape.rows(), Bshape.cols()}); | 
				
			||||
 | 
				
			||||
            CT::Array<T> A(Ashape); | 
				
			||||
            CT::Array<T> B(Bshape); | 
				
			||||
            CT::Array<T> C(Cshape); | 
				
			||||
 | 
				
			||||
            A.setRandom(-100, 100); | 
				
			||||
            B.setRandom(-100, 100); | 
				
			||||
            C.setRandom(-100, 100); | 
				
			||||
 | 
				
			||||
            A.updateDevice(); | 
				
			||||
            B.updateDevice(); | 
				
			||||
            C.updateDevice().wait(); | 
				
			||||
 | 
				
			||||
            CT::BLAS::GEMM<T>(1.0, A, B, 0.0, C).wait(); | 
				
			||||
            C.updateHost().wait(); | 
				
			||||
 | 
				
			||||
            CT::Array<T> CTest(Cshape, true); | 
				
			||||
            CTest.eigenMap() = A.eigenMap() * B.eigenMap(); | 
				
			||||
 | 
				
			||||
            double norm = (CTest.eigenMap() - C.eigenMap()).norm(); | 
				
			||||
 | 
				
			||||
            std::ostringstream name; | 
				
			||||
            name << "GEMM (" << i + 1 << "/" << attempts << ")"; | 
				
			||||
            std::ostringstream msg; | 
				
			||||
            msg << "Matrix Shapes: " << Ashape << Bshape << ", " | 
				
			||||
                << "Residual: " << norm; | 
				
			||||
            TEST(norm < thres, name.str().c_str(), msg.str().c_str()); | 
				
			||||
        } | 
				
			||||
        return failed; | 
				
			||||
    }; | 
				
			||||
 | 
				
			||||
    static uint32_t GEMMBroadcast() { | 
				
			||||
        uint32_t failed = 0; | 
				
			||||
        CT::Shape Ashape = makeRandom2DShape(); | 
				
			||||
        CT::Shape Bshape = makeRandom2DShape(); | 
				
			||||
        Bshape = CT::Shape({Ashape.cols(), Bshape.cols()}); | 
				
			||||
 | 
				
			||||
        CT::Shape Cshape = CT::Shape({Ashape.rows(), Bshape.cols()}); | 
				
			||||
 | 
				
			||||
        CT::Array<T> A({2, 3, Ashape.rows(), Ashape.cols()}); | 
				
			||||
        CT::Array<T> B({2, 3, Bshape.rows(), Bshape.cols()}); | 
				
			||||
        CT::Array<T> C({2, 3, Cshape.rows(), Cshape.cols()}); | 
				
			||||
 | 
				
			||||
        A.setRandom(-100, 100); | 
				
			||||
        B.setRandom(-100, 100); | 
				
			||||
 | 
				
			||||
        A.updateDevice(); | 
				
			||||
        B.updateDevice(); | 
				
			||||
        C.updateDevice().wait(); | 
				
			||||
 | 
				
			||||
        CT::BLAS::GEMM<T>(1.0, A, B, 0.0, C).wait(); | 
				
			||||
        C.updateHost().wait(); | 
				
			||||
 | 
				
			||||
        double norm = 0; | 
				
			||||
        CT::Array<T> CTest(Cshape, true); | 
				
			||||
        for (int i = 0; i < 2; ++i) { | 
				
			||||
            for (int j = 0; j < 3; ++j) { | 
				
			||||
                CTest.eigenMap() = A[i][j].eigenMap() * B[i][j].eigenMap(); | 
				
			||||
                norm += (CTest.eigenMap() - C[i][j].eigenMap()).norm(); | 
				
			||||
            } | 
				
			||||
        } | 
				
			||||
 | 
				
			||||
        std::ostringstream msg; | 
				
			||||
        msg << "Matrix Shapes: " << Ashape << Bshape << ", " | 
				
			||||
            << "Residual: " << norm; | 
				
			||||
        TEST(norm < thres, "GEMM Broadcast", msg.str().c_str()); | 
				
			||||
        return failed; | 
				
			||||
    }; | 
				
			||||
 | 
				
			||||
    static uint32_t PLU() { | 
				
			||||
        uint32_t failed = 0; | 
				
			||||
        CT::Shape Ashape = makeRandom2DShape(); | 
				
			||||
        CT::Shape xshape = makeRandom2DShape(); | 
				
			||||
        Ashape = CT::Shape({Ashape.rows(), Ashape.rows()}); | 
				
			||||
        xshape = CT::Shape({Ashape.rows(), xshape.cols()}); | 
				
			||||
 | 
				
			||||
        CT::Array<T> A({2, 3, Ashape.rows(), Ashape.rows()}); | 
				
			||||
        CT::Array<T> x({2, 3, xshape.rows(), xshape.cols()}); | 
				
			||||
        CT::Array<T> b({2, 3, xshape.rows(), xshape.cols()}); | 
				
			||||
        CT::Array<T> Ax({2, 3, xshape.rows(), xshape.cols()}); | 
				
			||||
 | 
				
			||||
        A.setRandom(-100, 100); | 
				
			||||
        b.setRandom(-100, 100); | 
				
			||||
 | 
				
			||||
        CT::Array<T> LU(A.copy()); | 
				
			||||
        x = b; | 
				
			||||
 | 
				
			||||
        A.updateDevice(); | 
				
			||||
        LU.updateDevice(); | 
				
			||||
        x.updateDevice().wait(); | 
				
			||||
 | 
				
			||||
        CT::BLAS::PLUBatch<T> luBatch(LU); | 
				
			||||
        CT::BLAS::Batch<T> xBatch(x); | 
				
			||||
        luBatch.computeLU().wait(); | 
				
			||||
        luBatch.solve(xBatch).wait(); | 
				
			||||
 | 
				
			||||
        // Compute Ax and compare difference.
 | 
				
			||||
        CT::BLAS::GEMM<T>(1.0, A, x, 0.0, Ax).wait(); | 
				
			||||
        Ax.updateHost(); | 
				
			||||
 | 
				
			||||
        double norm = 0; | 
				
			||||
        for (int i = 0; i < 2; ++i) { | 
				
			||||
            for (int j = 0; j < 3; ++j) { | 
				
			||||
                norm += (Ax[i][j].eigenMap() - b[i][j].eigenMap()).norm(); | 
				
			||||
            } | 
				
			||||
        } | 
				
			||||
 | 
				
			||||
        std::ostringstream msg; | 
				
			||||
        msg << "Matrix Shape: " << Ashape << xshape << ", " | 
				
			||||
            << "Residual: " << norm; | 
				
			||||
        TEST(norm < thres, "PLU/Solve", msg.str().c_str()); | 
				
			||||
        return failed; | 
				
			||||
    } | 
				
			||||
}; | 
				
			||||
 | 
				
			||||
// Residual thresholds used by the BLAS tests, one per tested element type.
// As written, 10e-1 evaluates to 1.0 and 10e-8 evaluates to 1e-7.
// NOTE(review): confirm these were not meant to be 1e-1 and 1e-8.
template <>
double BLASTests<float>::thres = 10e-1;

template <>
double BLASTests<double>::thres = 10e-8;
				
			||||
 | 
				
			||||
/**
 * Runs the macro tests (MacroTests::Kernel, then MacroTests::Class), prints a blank
 * line, and returns the sum of the counts the two tests reported.
 */
uint32_t doMacroTests() {
    const uint32_t kernelFailures = MacroTests::Kernel();
    const uint32_t classFailures = MacroTests::Class();
    std::cout << "\n";
    return kernelFailures + classFailures;
}
				
			||||
 | 
				
			||||
/**
 * Runs the array tests for element type T: the indexing tests followed by the slicing
 * tests, each preceded by a small boxed header naming the type.
 *
 * @return The sum of the counts returned by ArrayTests<T>::Indexing and ::Slicing.
 */
template <typename T> uint32_t doArrayTests() {
    std::cout << boxSmall("Index Tests : " + type<T>()) << "\n";
    const uint32_t indexingFailures = ArrayTests<T>::Indexing();

    std::cout << "\n" << boxSmall("Slice Tests : " + type<T>()) << "\n";
    const uint32_t slicingFailures = ArrayTests<T>::Slicing();

    std::cout << "\n";
    return indexingFailures + slicingFailures;
}
				
			||||
 | 
				
			||||
template <typename T> uint32_t doBLASTests() { | 
				
			||||
    uint32_t failed = 0; | 
				
			||||
    std::cout << boxSmall("GEMV Tests : " + type<T>()) << "\n"; | 
				
			||||
    failed += BLASTests<T>::GEMV(5); | 
				
			||||
    failed += BLASTests<T>::GEMVBroadcast(); | 
				
			||||
 | 
				
			||||
    std::cout << "\n" << boxSmall("GEMM Tests : " + type<T>()) << "\n"; | 
				
			||||
    failed += BLASTests<T>::GEMM(5); | 
				
			||||
    failed += BLASTests<T>::GEMMBroadcast(); | 
				
			||||
 | 
				
			||||
    std::cout << "\n" << boxSmall("PLU Tests : " + type<T>()) << "\n"; | 
				
			||||
    failed += BLASTests<T>::PLU(); | 
				
			||||
    std::cout << "\n"; | 
				
			||||
    return failed; | 
				
			||||
} | 
				
			||||
 | 
				
			||||
int main() { | 
				
			||||
    uint32_t failed = 0; | 
				
			||||
    std::cout << box("Macro Tests") << "\n"; | 
				
			||||
    failed += doMacroTests(); | 
				
			||||
 | 
				
			||||
    std::cout << box("Array Tests") << "\n"; | 
				
			||||
    // Test different sizes.
 | 
				
			||||
    failed += doArrayTests<uint8_t>(); | 
				
			||||
    failed += doArrayTests<int16_t>(); | 
				
			||||
    failed += doArrayTests<int32_t>(); | 
				
			||||
    failed += doArrayTests<double>(); | 
				
			||||
 | 
				
			||||
    std::cout << box("BLAS Tests") << "\n"; | 
				
			||||
    failed += doBLASTests<float>(); | 
				
			||||
    failed += doBLASTests<double>(); | 
				
			||||
 | 
				
			||||
    constexpr uint32_t tests = 2 + 4 * 5 + 13 * 2; | 
				
			||||
    std::ostringstream msg; | 
				
			||||
    msg << ((failed == 0) ? "\033[1;32mPASS \033[0m(" : "\033[1;31mFAIL \033[0m(") | 
				
			||||
        << (tests - failed) << "/" << tests << ")"; | 
				
			||||
    std::cout << box2(msg.str()) << "\n"; | 
				
			||||
 | 
				
			||||
    return 0; | 
				
			||||
} | 
				
			||||
					Loading…
					
					
				
		Reference in new issue