commit b4e4a49d44
26 changed files with 6120 additions and 0 deletions

@@ -0,0 +1,10 @@
---
BasedOnStyle: LLVM
IndentWidth: 4
ColumnLimit: 100
AllowShortIfStatementsOnASingleLine: true
---
Language: Cpp
DerivePointerAlignment: false
PointerAlignment: Left
---

@@ -0,0 +1,4 @@
build
*CPU
*GPU
.venv

@@ -0,0 +1,777 @@
#ifndef ARRAY_H |
||||||
|
#define ARRAY_H |
||||||
|
|
||||||
|
#include "Core.h" |
||||||
|
#include "Macros.h" |
||||||
|
#include <Eigen/Dense> |
||||||
|
#include <iomanip> |
||||||
|
#include <math.h> |
||||||
|
#include <random> |
||||||
|
#include <type_traits> |
||||||
|
|
||||||
|
#ifdef DEVICE |
||||||
|
#define POINTER pDevice |
||||||
|
#else |
||||||
|
#define POINTER pHost |
||||||
|
#endif |
||||||
|
|
||||||
|
namespace CudaTools { |
||||||
|
|
||||||
|
template <typename T> |
||||||
|
using EigenMat = Eigen::Matrix<T, Eigen::Dynamic, Eigen::Dynamic, Eigen::ColMajor>; |
||||||
|
template <typename T> using EigenMapMat = Eigen::Map<EigenMat<T>>; |
||||||
|
template <typename T> using ConstEigenMapMat = Eigen::Map<const EigenMat<T>>; |
||||||
|
|
||||||
|
template <typename T> struct EigenAdaptConst { typedef EigenMapMat<T> type; }; |
||||||
|
template <typename T> struct EigenAdaptConst<const T> { typedef ConstEigenMapMat<T> type; }; |
||||||
|
|
||||||
|
#define ENABLE_IF(X) std::enable_if_t<X, bool> |
||||||
|
#define IS_INT(T) std::is_integral<T>::value |
||||||
|
#define IS_FLOAT(T) std::is_floating_point<T>::value |
||||||
|
#define IS_NUM(T) IS_INT(T) or IS_FLOAT(T) |
||||||
|
|
||||||
|
template <typename T> class Array; |
||||||
|
using Slice = std::pair<uint32_t, uint32_t>; |
||||||
|
|
||||||
|
template <typename T> class ArrayIterator { |
||||||
|
private: |
||||||
|
template <typename U> |
||||||
|
friend std::ostream& operator<<(std::ostream& out, const ArrayIterator<U>& it); |
||||||
|
T* pData; |
||||||
|
Shape mShape; |
||||||
|
uint32_t mIndices[CUDATOOLS_ARRAY_MAX_AXES] = {0}; |
||||||
|
|
||||||
|
public: |
||||||
|
HD ArrayIterator(T* p, const Shape& shape) : pData(p), mShape(shape){}; |
||||||
|
|
||||||
|
/**
|
||||||
|
* Moves the iterator to the next value. |
||||||
|
*/ |
||||||
|
HD void next() { |
||||||
|
bool carry = false; |
||||||
|
uint32_t offset = 0; |
||||||
|
for (uint32_t iAxis = mShape.axes() - 1; iAxis < mShape.axes(); --iAxis) { |
||||||
|
if (mIndices[iAxis] == mShape.dim(iAxis) - 1) { |
||||||
|
mIndices[iAxis] = 0; |
||||||
|
offset += mShape.stride(iAxis) * (mShape.dim(iAxis) - 1); |
||||||
|
carry = true; |
||||||
|
} else { |
||||||
|
pData += mShape.stride(iAxis); |
||||||
|
mIndices[iAxis] += 1; |
||||||
|
carry = false; |
||||||
|
} |
||||||
|
|
||||||
|
if (not carry) { |
||||||
|
pData -= offset; |
||||||
|
return; |
||||||
|
} |
||||||
|
} |
||||||
|
pData += 1; // "Overflow" occurred, so we have reached the end of the array.
|
||||||
|
} |
||||||
|
|
||||||
|
/**
|
||||||
|
* Moves the iterator to the previous value. |
||||||
|
*/ |
||||||
|
HD void prev() { |
||||||
|
bool carry = false; |
||||||
|
uint32_t offset = 0; |
||||||
|
for (uint32_t iAxis = mShape.axes() - 1; iAxis < mShape.axes(); --iAxis) { |
||||||
|
if (mIndices[iAxis] == 0) { |
||||||
|
mIndices[iAxis] = mShape.dim(iAxis) - 1; |
||||||
|
offset += mShape.stride(iAxis) * (mShape.dim(iAxis) - 1); |
||||||
|
carry = true; |
||||||
|
} else { |
||||||
|
pData -= mShape.stride(iAxis); |
||||||
|
mIndices[iAxis] -= 1;
||||||
|
carry = false; |
||||||
|
} |
||||||
|
if (not carry) { |
||||||
|
pData += offset; |
||||||
|
return; |
||||||
|
} |
||||||
|
} |
||||||
|
pData -= 1; |
||||||
|
} |
||||||
|
|
||||||
|
/**
|
||||||
|
* Moves the iterator a specified value away. |
||||||
|
* \param amount the amount to advance by |
||||||
|
*/ |
||||||
|
HD void advance(const int32_t amount) { |
||||||
|
if (amount < 0) { |
||||||
|
for (uint32_t i = 0; i < abs(amount); ++i) { |
||||||
|
prev(); |
||||||
|
} |
||||||
|
} else { |
||||||
|
for (uint32_t i = 0; i < abs(amount); ++i) { |
||||||
|
next(); |
||||||
|
} |
||||||
|
} |
||||||
|
} |
||||||
|
|
||||||
|
HD void operator++() { next(); }; /**< Prefix increment operator. */ |
||||||
|
HD void operator--() { prev(); }; /**< Prefix decrement operator. */ |
||||||
|
|
||||||
|
/**< Addition operator. */ |
||||||
|
HD ArrayIterator<T> operator+(const int32_t v) const { |
||||||
|
ArrayIterator<T> it = *this; |
||||||
|
it.advance(v); |
||||||
|
return it; |
||||||
|
}; |
||||||
|
|
||||||
|
/** Subtraction operator.*/ |
||||||
|
HD ArrayIterator<T> operator-(const int32_t v) const { |
||||||
|
ArrayIterator<T> it = *this; |
||||||
|
it.advance(-v); |
||||||
|
return it; |
||||||
|
}; |
||||||
|
HD void operator+=(const int32_t v) { advance(v); }; |
||||||
|
HD void operator-=(const int32_t v) { advance(-v); }; |
||||||
|
|
||||||
|
HD T& operator*() { return *pData; }; /**< Dereference operator. */ |
||||||
|
HD const T& operator*() const { return *pData; }; /**< Const dereference operator. */ |
||||||
|
|
||||||
|
/**
|
||||||
|
* Equals operator. |
||||||
|
*/ |
||||||
|
HD bool operator==(const ArrayIterator<T>& it) { return pData == it.pData; } |
||||||
|
|
||||||
|
/**
|
||||||
|
* Not equals operator. |
||||||
|
*/ |
||||||
|
HD bool operator!=(const ArrayIterator<T>& it) { return pData != it.pData; } |
||||||
|
}; |
||||||
|
|
||||||
|
template <typename T> std::ostream& operator<<(std::ostream& out, const ArrayIterator<T>& it) { |
||||||
|
return out << it.pData; |
||||||
|
} |
||||||
|
|
||||||
|
template <typename T> class ArrayLoader { |
||||||
|
private: |
||||||
|
ArrayIterator<T> mIterator; |
||||||
|
ArrayIterator<T> mIteratorEnd; |
||||||
|
|
||||||
|
public: |
||||||
|
HD ArrayLoader(const ArrayIterator<T>& it, const ArrayIterator<T>& it_end) |
||||||
|
: mIterator(it), mIteratorEnd(it_end){}; |
||||||
|
HD ArrayLoader &operator,(const T value) { |
||||||
|
CT_ERROR_IF(mIterator, ==, mIteratorEnd, "Cannot assign more values than Array size"); |
||||||
|
*mIterator = value; |
||||||
|
++mIterator; |
||||||
|
return *this; |
||||||
|
} |
||||||
|
}; |
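// Illustrative usage sketch (not part of the original header): the ArrayLoader returned by
// Array::operator<< lets an Array be filled with stream-like syntax, assuming a 6-item Array.
//
//   CudaTools::Array<float> arr({2, 3});
//   arr << 1, 2, 3, 4, 5, 6; // operator<< assigns the first value, operator, the rest.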
||||||
|
|
||||||
|
/**
|
||||||
|
* A container that holds an N-dimensional array, stored column major. To set the
* maximum N, there is a compiler macro CUDATOOLS_ARRAY_MAX_AXES whose default value is 4.
||||||
|
* It adapts to operations between host and device to ease memory management. |
||||||
|
*/ |
||||||
|
template <typename T> class Array { |
||||||
|
private: |
||||||
|
template <typename U> friend std::ostream& operator<<(std::ostream&, const Array<U>&); |
||||||
|
|
||||||
|
Shape mShape; |
||||||
|
T* pHost = nullptr; |
||||||
|
T* pDevice = nullptr; |
||||||
|
|
||||||
|
bool mIsView = false; |
||||||
|
bool mIsSlice = false; |
||||||
|
|
||||||
|
uint32_t mEndOffset = 0; |
||||||
|
|
||||||
|
void freeArrays() { |
||||||
|
#ifndef DEVICE |
||||||
|
if (not mIsView) { |
||||||
|
if (pDevice != nullptr) CudaTools::free(pDevice); |
||||||
|
if (pHost != nullptr) delete[] pHost; |
||||||
|
} |
||||||
|
#endif |
||||||
|
}; |
||||||
|
|
||||||
|
HD void calcEnd() { |
||||||
|
uint32_t offset = 0; |
||||||
|
for (uint32_t i = 0; i < shape().axes(); ++i) { |
||||||
|
offset += (shape().dim(i) - 1) * shape().stride(i); |
||||||
|
} |
||||||
|
mEndOffset = offset + 1; |
||||||
|
}; |
||||||
|
|
||||||
|
public: |
||||||
|
HD Array() = default; |
||||||
|
|
||||||
|
/**
|
||||||
|
* Constructor for an Array that creates and allocates an array with
||||||
|
* the specified Shape. Construction in this format is disabled on the device. |
||||||
|
* \brief Host only |
||||||
|
* \param shape the shape of the array |
||||||
|
* \param noDevice if true, do not allocate the array on the device
||||||
|
*/ |
||||||
|
Array(const Shape& shape, const bool noDevice = false) : mShape(shape), mIsView(false) { |
||||||
|
pHost = new T[shape.items()]; |
||||||
|
calcEnd(); |
||||||
|
if (noDevice) return; |
||||||
|
pDevice = (T*)CudaTools::malloc(shape.items() * sizeof(T)); |
||||||
|
}; |
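// Illustrative usage sketch (not part of the original header): constructing an owning Array
// allocates a host buffer and, unless noDevice is set, a matching device buffer.
//
//   CudaTools::Array<double> A({4, 4});       // host and device allocations
//   CudaTools::Array<double> B({4, 4}, true); // host-only (noDevice = true)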
||||||
|
|
||||||
|
/**
|
||||||
|
* Constructor for an Array from an existing (preallocated) pointer. |
||||||
|
* \param pointer the pointer to use |
||||||
|
* \param shape the shape of the array |
||||||
|
* \param noDevice if true, do not allocate the array on the device
||||||
|
*/ |
||||||
|
HD Array(T* const pointer, const Shape& shape, const bool noDevice = false) |
||||||
|
: mShape(shape), mIsView(true), mIsSlice(false) { |
||||||
|
POINTER = pointer; |
||||||
|
calcEnd(); |
||||||
|
#ifndef DEVICE |
||||||
|
if (noDevice) return; |
||||||
|
pDevice = (T*)CudaTools::malloc(shape.items() * sizeof(T)); |
||||||
|
#endif |
||||||
|
}; |
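// Illustrative usage sketch (not part of the original header): wrapping a preallocated buffer
// produces a non-owning view over that memory; pass noDevice = true to skip the device copy.
//
//   std::vector<float> buf(12);
//   CudaTools::Array<float> view(buf.data(), {3, 4}, true); // never frees buf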
||||||
|
|
||||||
|
/**
|
||||||
|
* Constructor for making an Array view from another Array,
||||||
|
* given an offset and shape. |
||||||
|
* \param arr the original Array |
||||||
|
* \param shape the shape of the new array |
||||||
|
* \param offset the index at which to start the view of the array
||||||
|
*/ |
||||||
|
HD Array(const Array& arr, const Shape& shape, const uint32_t offset = 0) |
||||||
|
: mShape(shape), pHost(arr.pHost), pDevice(arr.pDevice), mIsView(true), |
||||||
|
mIsSlice(arr.mIsSlice) { |
||||||
|
calcEnd(); |
||||||
|
if (pHost != nullptr) pHost += offset; |
||||||
|
if (pDevice != nullptr) pDevice += offset; |
||||||
|
}; |
||||||
|
|
||||||
|
/**
|
||||||
|
* The copy-constructor for an Array. If this is not a view, a deep copy
||||||
|
* of the data will be performed on both host and device. On the device, it is always |
||||||
|
* treated like a view. |
||||||
|
*/ |
||||||
|
HD Array(const Array& arr) : mShape(arr.mShape), mIsView(arr.mIsView), mIsSlice(arr.mIsSlice) { |
||||||
|
calcEnd(); |
||||||
|
if (mIsView) { // If the other array was a view (and now this one), just assign.
|
||||||
|
pHost = arr.pHost; |
||||||
|
pDevice = arr.pDevice; |
||||||
|
return; |
||||||
|
} |
||||||
|
|
||||||
|
// Otherwise, we assume this needs to own its data.
|
||||||
|
pHost = new T[mShape.items()]; |
||||||
|
auto arr_it = arr.begin(); |
||||||
|
for (auto it = begin(); it != end(); ++it) { |
||||||
|
*it = *arr_it; |
||||||
|
++arr_it; |
||||||
|
} |
||||||
|
|
||||||
|
#ifndef DEVICE |
||||||
|
if (arr.pDevice != nullptr) { |
||||||
|
pDevice = (T*)CudaTools::malloc(mShape.items() * sizeof(T)); |
||||||
|
} |
||||||
|
#endif |
||||||
|
}; |
||||||
|
|
||||||
|
/**
|
||||||
|
* The move-constructor for an Array.
||||||
|
*/ |
||||||
|
HD Array(Array&& arr) |
||||||
|
: mShape(arr.mShape), pHost(arr.pHost), pDevice(arr.pDevice), mIsView(arr.mIsView), |
||||||
|
mIsSlice(arr.mIsSlice) { |
||||||
|
calcEnd(); |
||||||
|
// Make other object empty.
|
||||||
|
arr.pHost = nullptr; |
||||||
|
arr.pDevice = nullptr; |
||||||
|
arr.mIsView = true; |
||||||
|
}; |
||||||
|
|
||||||
|
HD ~Array() { freeArrays(); }; |
||||||
|
|
||||||
|
/**
|
||||||
|
* The copy-assignment operator for an Array. If this is not a view,
||||||
|
* then the currently owned data will be freed, and a deep copy of the data will |
||||||
|
* be performed on both host and device. On the device, it is always treated like a view. |
||||||
|
*/ |
||||||
|
HD Array& operator=(const Array& arr) { |
||||||
|
if (this == &arr) return *this; |
||||||
|
|
||||||
|
if (mIsView) { // If this array is a view, we assign data from the right-hand side.
|
||||||
|
auto arr_it = arr.begin(); |
||||||
|
for (auto it = begin(); it != end() and arr_it != arr.end(); ++it) { |
||||||
|
*it = *arr_it; |
||||||
|
++arr_it; |
||||||
|
} |
||||||
|
return *this; |
||||||
|
} |
||||||
|
|
||||||
|
// Otherwise, it is implied to be object reassignment.
|
||||||
|
mShape = arr.mShape; |
||||||
|
mIsView = arr.mIsView; |
||||||
|
mIsSlice = arr.mIsSlice; |
||||||
|
calcEnd(); |
||||||
|
|
||||||
|
// Regardless if the right-hand side is a view, we create a new copy.
|
||||||
|
// In case that the right-hand side is a view of this array, we
|
||||||
|
// allocate memory to copy first. Keep in mind that the right-hand side
|
||||||
|
// array will then become undefined.
|
||||||
|
|
||||||
|
// We can only do this on the host.
|
||||||
|
#ifndef DEVICE |
||||||
|
T* new_pDevice = nullptr; |
||||||
|
if (pDevice != nullptr) { |
||||||
|
new_pDevice = (T*)CudaTools::malloc(mShape.items() * sizeof(T)); |
||||||
|
} |
||||||
|
|
||||||
|
T* new_pHost = new T[mShape.items()]; |
||||||
|
memcpy(new_pHost, arr.pHost, mShape.items() * sizeof(T)); |
||||||
|
|
||||||
|
freeArrays(); |
||||||
|
pHost = new_pHost; |
||||||
|
pDevice = new_pDevice; |
||||||
|
#else |
||||||
|
pHost = arr.pHost; |
||||||
|
pDevice = arr.pDevice; |
||||||
|
#endif |
||||||
|
return *this; |
||||||
|
}; |
||||||
|
|
||||||
|
/**
|
||||||
|
* The move-assignment operator for an Array.
||||||
|
*/ |
||||||
|
HD Array& operator=(Array&& arr) { |
||||||
|
if (this == &arr) return *this; |
||||||
|
|
||||||
|
if (mIsView) { // If this array is a view, we assign data from the right-hand side.
|
||||||
|
auto arr_it = arr.begin(); |
||||||
|
for (auto it = begin(); it != end() and arr_it != arr.end(); ++it) { |
||||||
|
*it = *arr_it; |
||||||
|
++arr_it; |
||||||
|
} |
||||||
|
return *this; |
||||||
|
} |
||||||
|
|
||||||
|
CT_ERROR(arr.mIsView, |
||||||
|
"Cannot move-assign view to a non-view (owner). This would lead to undefined " |
||||||
|
"behavior."); |
||||||
|
|
||||||
|
// Otherwise, it is implied to be object reassignment.
|
||||||
|
freeArrays(); |
||||||
|
mShape = arr.mShape; |
||||||
|
pHost = arr.pHost; |
||||||
|
pDevice = arr.pDevice; |
||||||
|
mIsView = arr.mIsView; |
||||||
|
mIsSlice = arr.mIsSlice; |
||||||
|
calcEnd(); |
||||||
|
|
||||||
|
// Make other array empty.
|
||||||
|
arr.pHost = nullptr; |
||||||
|
arr.pDevice = nullptr; |
||||||
|
arr.mIsView = true; |
||||||
|
return *this; |
||||||
|
}; |
||||||
|
|
||||||
|
/**
|
||||||
|
* Used for indexing the Array. |
||||||
|
* \param index index of the first dimension |
||||||
|
*/ |
||||||
|
HD Array operator[](const uint32_t index) const { |
||||||
|
CT_ERROR_IF(index, >=, shape().dim(0), "Index exceeds axis size"); |
||||||
|
return Array(*this, shape().subshape(1), index * shape().stride(0)); |
||||||
|
}; |
||||||
|
|
||||||
|
/**
|
||||||
|
* Used for indexing the Array. |
||||||
|
* \param indices a list of indices to index the Array |
||||||
|
*/ |
||||||
|
HD Array operator[](const std::initializer_list<uint32_t> indices) const { |
||||||
|
CT_ERROR_IF(indices.size(), >, shape().axes(), |
||||||
|
"Number of indices cannot exceed number of axes"); |
||||||
|
auto it = indices.begin(); |
||||||
|
uint offset = 0; |
||||||
|
for (uint32_t i = 0; i < indices.size(); ++i) { |
||||||
|
uint32_t index = *it; |
||||||
|
CT_ERROR_IF(index, >=, shape().dim(i), "Index exceeds axis size"); |
||||||
|
offset += index * shape().stride(i); |
||||||
|
++it; |
||||||
|
} |
||||||
|
return Array(*this, shape().subshape(indices.size()), offset); |
||||||
|
}; |
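// Illustrative usage sketch (not part of the original header): operator[] returns views, so
// indexing never copies data; a full index is assumed here to read a single element through
// the implicit conversion operator.
//
//   CudaTools::Array<int> A({2, 3, 4});
//   auto row = A[1];      // view with shape {3, 4}
//   int v = A[{1, 2, 3}]; // scalar view converted to its value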
||||||
|
|
||||||
|
HD ArrayLoader<T> operator<<(const T value) { |
||||||
|
auto it = begin(); |
||||||
|
*it = value; |
||||||
|
++it; |
||||||
|
return ArrayLoader<T>(it, end()); |
||||||
|
}; |
||||||
|
|
||||||
|
HD T operator=(const T& value) { return POINTER[0] = value; }; |
||||||
|
HD operator T&() { return POINTER[0]; }; |
||||||
|
HD operator const T&() const { return POINTER[0]; }; |
||||||
|
|
||||||
|
/**
|
||||||
|
* Used to create slices of the Array. |
||||||
|
* \param slices a list of slices to slice the Array |
||||||
|
*/ |
||||||
|
HD Array slice(const std::initializer_list<Slice> slices) const { |
||||||
|
CT_ERROR_IF(slices.size(), >, shape().axes(), |
||||||
|
"Number of slices cannot exceed number of axes"); |
||||||
|
|
||||||
|
uint offset = 0; |
||||||
|
Shape new_shape = mShape; |
||||||
|
auto it = slices.begin(); |
||||||
|
for (uint32_t i = 0; i < slices.size(); ++i) { |
||||||
|
uint32_t from_index = it->first; |
||||||
|
uint32_t to_index = it->second; |
||||||
|
CT_ERROR_IF(from_index, >, to_index, |
||||||
|
"Slice start cannot be greater than than slice end"); |
||||||
|
CT_ERROR_IF(from_index, >=, shape().dim(i), "Slice start exceeds axis size"); |
||||||
|
CT_ERROR_IF(to_index - 1, >=, shape().dim(i), "Slice end exceeds axis size"); |
||||||
|
|
||||||
|
offset += from_index * shape().stride(i); |
||||||
|
new_shape.mAxisDim[i] = to_index - from_index; |
||||||
|
++it; |
||||||
|
} |
||||||
|
new_shape.mItems = 1; |
||||||
|
for (uint32_t i = 0; i < shape().axes(); ++i) { |
||||||
|
new_shape.mItems *= new_shape.dim(i); |
||||||
|
} |
||||||
|
|
||||||
|
Array<T> arr(*this, new_shape, offset); |
||||||
|
arr.mIsSlice = true; |
||||||
|
return arr; |
||||||
|
}; |
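// Illustrative usage sketch (not part of the original header): slice() takes one
// (start, end) pair per axis, end exclusive, and returns a strided view into the same data.
//
//   CudaTools::Array<float> A({4, 4});
//   auto block = A.slice({{1, 3}, {0, 2}}); // 2 x 2 view of rows 1-2, columns 0-1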
||||||
|
|
||||||
|
/**
|
||||||
|
* Returns this Array with a different Shape. Its self assigning version is reshape. |
||||||
|
* If this Array is a slice of another, then it will perform a deep copy, and return |
||||||
|
* a new non-view array. |
||||||
|
*/ |
||||||
|
HD Array reshaped(const Shape& new_shape) const { |
||||||
|
CT_ERROR_IF(shape().items(), !=, new_shape.items(), |
||||||
|
"New shape cannot have a different number of terms"); |
||||||
|
if (mIsSlice) { |
||||||
|
Array<T> arr = this->copy(); |
||||||
|
return arr.reshaped(new_shape); |
||||||
|
} |
||||||
|
Array<T> arr = view(); |
||||||
|
arr.mShape = new_shape; |
||||||
|
return arr; |
||||||
|
}; |
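// Illustrative usage sketch (not part of the original header): reshaped() returns a view with
// the new Shape when possible, and falls back to a deep copy when called on a slice.
//
//   CudaTools::Array<float> A({2, 6});
//   auto B = A.reshaped({3, 4}); // same 12 items viewed as 3 x 4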
||||||
|
|
||||||
|
HD void reshape(const Shape& new_shape) { |
||||||
|
CT_ERROR_IF(shape().items(), !=, new_shape.items(), |
||||||
|
"New shape cannot have a different number of terms"); |
||||||
|
CT_ERROR(mIsSlice, "Cannot reshape slice, a new array must be made. (Try reshaped instead)") |
||||||
|
mShape = new_shape; |
||||||
|
}; |
||||||
|
|
||||||
|
/**
|
||||||
|
* Gets a view that has at least two dimensions. Useful for promoting
||||||
|
* single vectors to their 2D counterparts. |
||||||
|
*/ |
||||||
|
HD Array atLeast2D() const { |
||||||
|
return (shape().axes() == 1) ? Array(*this, {shape().length(), 1}) : view(); |
||||||
|
}; |
||||||
|
|
||||||
|
/**
|
||||||
|
* Flattens the Array into one dimension. |
||||||
|
*/ |
||||||
|
HD Array flatten() const { return reshaped({mShape.mItems}); };
||||||
|
|
||||||
|
/**
|
||||||
|
* Returns the Eigen::Map of this Array. |
||||||
|
*/ |
||||||
|
typename EigenAdaptConst<T>::type eigenMap() const { |
||||||
|
uint32_t total_dim = mShape.mAxes; |
||||||
|
CT_ERROR(mIsSlice, "Mapping to an Eigen array cannot occur on slices") |
||||||
|
CT_ERROR_IF(total_dim, !=, 2, |
||||||
|
"Mapping to an Eigen array can only occur on two-dimensional arrays"); |
||||||
|
return typename EigenAdaptConst<T>::type(POINTER, mShape.rows(), mShape.cols()); |
||||||
|
}; |
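// Illustrative usage sketch (not part of the original header): eigenMap() exposes the host
// data as an Eigen::Map, so Eigen expressions read and write through the Array directly.
//
//   CudaTools::Array<double> A({3, 3});
//   A.eigenMap().setIdentity(); // A now holds the 3 x 3 identity on the host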
||||||
|
|
||||||
|
/**
|
||||||
|
* Gets the Shape of the Array. |
||||||
|
*/ |
||||||
|
HD Shape shape() const { return mShape; }; |
||||||
|
|
||||||
|
/**
|
||||||
|
* Gets the pointer to this array, depending on host or device. |
||||||
|
*/ |
||||||
|
HD T* data() const { return POINTER; }; |
||||||
|
|
||||||
|
/**
|
||||||
|
* Returns the device pointer regardless of host or device. |
||||||
|
*/ |
||||||
|
HD T* dataDevice() const { return pDevice; }; |
||||||
|
|
||||||
|
HD bool isView() const { return mIsView; }; /**< Gets whether this Array is a view. */ |
||||||
|
HD bool isSlice() const { return mIsSlice; }; /**< Gets whether this Array is a slice. */ |
||||||
|
|
||||||
|
/**
|
||||||
|
* Gets a view of this Array. |
||||||
|
*/ |
||||||
|
HD Array view() const { return Array(*this, mShape); } |
||||||
|
|
||||||
|
/**
|
||||||
|
* Copies this Array and returns a new Array that owns its own copy of the data.
||||||
|
*/ |
||||||
|
HD Array copy() const { |
||||||
|
Array<T> arr(mShape, (pDevice == nullptr)); |
||||||
|
|
||||||
|
auto arr_it = arr.begin(); |
||||||
|
for (auto it = begin(); it != end(); ++it) { |
||||||
|
*arr_it = *it; |
||||||
|
++arr_it; |
||||||
|
} |
||||||
|
#ifndef DEVICE |
||||||
|
if (pDevice != nullptr) { |
||||||
|
CudaTools::deviceCopy(pDevice, arr.dataDevice(), mShape.items() * sizeof(T)).wait(); |
||||||
|
} |
||||||
|
#endif |
||||||
|
return arr; |
||||||
|
}; |
||||||
|
|
||||||
|
/**
|
||||||
|
* Gets the iterator to the beginning of this Array. |
||||||
|
*/ |
||||||
|
HD ArrayIterator<T> begin() const { return ArrayIterator<T>(POINTER, mShape); }; |
||||||
|
|
||||||
|
/**
|
||||||
|
* Gets the iterator to the end of this Array. |
||||||
|
*/ |
||||||
|
HD ArrayIterator<T> end() const { return ArrayIterator<T>(POINTER + mEndOffset, mShape); }; |
||||||
|
|
||||||
|
/**
|
||||||
|
* Sets the values of the entire Array to a constant. This is restricted to numerical types. |
||||||
|
*/ |
||||||
|
HD void setConstant(const T value) const { |
||||||
|
static_assert(IS_NUM(T), "Function only available on numeric types."); |
||||||
|
for (auto it = begin(); it != end(); ++it) { |
||||||
|
*it = value; |
||||||
|
} |
||||||
|
}; |
||||||
|
|
||||||
|
/**
|
||||||
|
* Sets the Array values with uniform random values in a specified range. This is restricted to |
||||||
|
* numerical types. |
||||||
|
* \brief Host only |
||||||
|
*/ |
||||||
|
void setRandom(const T min, const T max) const { |
||||||
|
static_assert(IS_NUM(T), "Function only available on numeric types."); |
||||||
|
CT_ERROR_IF(max, <, min, "Upper bound of range cannot be smaller than lower bound");
||||||
|
std::random_device rd; |
||||||
|
std::mt19937 mt(rd()); |
||||||
|
if constexpr (IS_INT(T)) { |
||||||
|
std::uniform_int_distribution<T> dist(min, max); |
||||||
|
for (auto it = begin(); it != end(); ++it) { |
||||||
|
*it = dist(mt); |
||||||
|
} |
||||||
|
} else if constexpr (IS_FLOAT(T)) { |
||||||
|
std::uniform_real_distribution<T> dist(min, max); |
||||||
|
for (auto it = begin(); it != end(); ++it) { |
||||||
|
*it = dist(mt); |
||||||
|
} |
||||||
|
} |
||||||
|
}; |
||||||
|
|
||||||
|
/**
|
||||||
|
* Sets the Array values to start from a value and increment by a specified step. This is |
||||||
|
* restricted to numerical types. |
||||||
|
*/ |
||||||
|
HD void setRange(T min, const T step = 1) const { |
||||||
|
static_assert(IS_NUM(T), "Function only available on numeric types."); |
||||||
|
for (auto it = begin(); it != end(); ++it) { |
||||||
|
*it = min; |
||||||
|
min += step; |
||||||
|
} |
||||||
|
} |
||||||
|
/**
|
||||||
|
* Sets the Array values to be evenly spaced numbers over a given interval. This is restricted |
||||||
|
* to floating point types. |
||||||
|
*/ |
||||||
|
HD void setLinspace(const T min, const T max) const { |
||||||
|
static_assert(IS_FLOAT(T), "Function only available on numeric floating types."); |
||||||
|
CT_ERROR_IF(max, <, min, "Upper bound of range cannot be smaller than lower bound");
||||||
|
T i = 0; |
||||||
|
T d = max - min; |
||||||
|
T items = (T)(shape().items() - 1); |
||||||
|
for (auto it = begin(); it != end(); ++it) { |
||||||
|
*it = min + d * (i / items); |
||||||
|
i += 1; |
||||||
|
} |
||||||
|
}; |
||||||
|
|
||||||
|
/**
|
||||||
|
* Returns array of given shape with constant values. This is restricted to numerical types. |
||||||
|
* \brief Host only |
||||||
|
*/ |
||||||
|
static Array constant(const Shape& shape, const T value) { |
||||||
|
static_assert(IS_NUM(T), "Function only available on numeric types."); |
||||||
|
Array<T> arr(shape); |
||||||
|
arr.setConstant(value); |
||||||
|
return arr; |
||||||
|
}; |
||||||
|
|
||||||
|
/**
|
||||||
|
* Returns array of given shape with random values in given interval. This is restricted to |
||||||
|
* numerical types. |
||||||
|
* \brief Host only |
||||||
|
*/ |
||||||
|
static Array random(const Shape& shape, const T min, const T max) { |
||||||
|
static_assert(IS_NUM(T), "Function only available on numeric types."); |
||||||
|
Array<T> arr(shape); |
||||||
|
arr.setRandom(min, max); |
||||||
|
return arr; |
||||||
|
}; |
||||||
|
|
||||||
|
/**
|
||||||
|
* Returns evenly spaced values within a given interval. This is restricted to numerical types. |
||||||
|
* \brief Host only |
||||||
|
*/ |
||||||
|
static Array range(const T min, const T max, const T step = 1) { |
||||||
|
static_assert(IS_NUM(T), "Function only available on numeric types."); |
||||||
|
CT_ERROR_IF(max, <, min, "Upper bound of range cannot be smaller than lower bound");
||||||
|
Array<T> arr({(uint32_t)((max - min) / step)}); |
||||||
|
arr.setRange(min, step); |
||||||
|
return arr; |
||||||
|
} |
||||||
|
|
||||||
|
/**
|
||||||
|
* Returns evenly spaced values within a given interval. This is restricted to floating point |
||||||
|
* types. |
||||||
|
* \brief Host only |
||||||
|
*/ |
||||||
|
static Array linspace(const T min, const T max, const uint32_t size) { |
||||||
|
static_assert(IS_FLOAT(T), "Function only available on numeric floating types."); |
||||||
|
Array<T> arr({size}); |
||||||
|
arr.setLinspace(min, max); |
||||||
|
return arr; |
||||||
|
} |
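// Illustrative usage sketch (not part of the original header): the static factories build and
// fill an owning Array in one call.
//
//   auto r = CudaTools::Array<int>::range(0, 10);             // 0, 1, ..., 9
//   auto l = CudaTools::Array<double>::linspace(0.0, 1.0, 5); // 5 evenly spaced values
//   auto u = CudaTools::Array<float>::random({2, 2}, -1.0f, 1.0f);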
||||||
|
|
||||||
|
/**
|
||||||
|
* Transposes the internal data and returns the corresponding new Array. |
||||||
|
* Its self assigning version is transpose. This is restricted to numerical types. |
||||||
|
* \brief Host only |
||||||
|
*/ |
||||||
|
Array transposed() const { |
||||||
|
static_assert(IS_NUM(T), "Function only available on numeric types."); |
||||||
|
CT_ERROR_IF(shape().axes(), !=, 2, "Tranpose can only occur on two-dimensional arrays"); |
||||||
|
Array<T> new_arr({mShape.cols(), mShape.rows()});
||||||
|
new_arr.eigenMap() = this->eigenMap().transpose().eval(); |
||||||
|
return new_arr; |
||||||
|
}; |
||||||
|
|
||||||
|
/**
|
||||||
|
* Transposes the internal data in place. Its non-mutating version is transposed.
||||||
|
* This is restricted to numerical types. |
||||||
|
* \brief Host only |
||||||
|
*/ |
||||||
|
void transpose() { |
||||||
|
static_assert(IS_NUM(T), "Function only available on numeric types."); |
||||||
|
CT_ERROR_IF(shape().axes(), !=, 2, "Tranpose can only occur on two-dimensional arrays"); |
||||||
|
Array<T> new_arr(*this, {mShape.cols(), mShape.rows()}); |
||||||
|
new_arr.eigenMap() = this->eigenMap().transpose().eval(); |
||||||
|
mShape = Shape({mShape.cols(), mShape.rows()}); |
||||||
|
}; |
||||||
|
|
||||||
|
/**
* Returns the inverse of this Array, computed with Eigen on the host. This is restricted
* to floating numeric types.
* \brief Host only
*/
Array<T> inverse() const {
||||||
|
static_assert(IS_FLOAT(T), "Function only available on floating numeric types."); |
||||||
|
CT_ERROR_IF(shape().axes(), !=, 2, "Inverse can only occur on two-dimensional arrays"); |
||||||
|
CT_ERROR_IF(shape().rows(), !=, shape().cols(), |
||||||
|
"Inverse can only occur on square matrices"); |
||||||
|
Array<T> inv(shape()); |
||||||
|
inv.eigenMap() = this->eigenMap().inverse();
return inv;
};
||||||
|
|
||||||
|
/**
|
||||||
|
* Pins the memory (page locks) for faster memory transfer in concurrent |
||||||
|
* transfers. |
||||||
|
* \brief Host only |
||||||
|
*/ |
||||||
|
void pinMemory() const { CudaTools::pin(pHost, mShape.items() * sizeof(T)); }; |
||||||
|
|
||||||
|
/**
|
||||||
|
* Updates the host copy by copying the device data back to the host. |
||||||
|
* \brief Host only |
||||||
|
*/ |
||||||
|
StreamID updateHost(const StreamID& stream = DEF_MEM_STREAM) const { |
||||||
|
CT_ERROR(mIsView, "Cannot update host on a view"); |
||||||
|
CudaTools::pull(pHost, pDevice, mShape.items() * sizeof(T), stream); |
||||||
|
return stream; |
||||||
|
}; |
||||||
|
|
||||||
|
/**
|
||||||
|
* Updates the device copy by copying the host data to the device. |
||||||
|
* \brief Host only |
||||||
|
*/ |
||||||
|
StreamID updateDevice(const StreamID& stream = DEF_MEM_STREAM) const { |
||||||
|
CT_ERROR(mIsView, "Cannot update device on a view"); |
||||||
|
CudaTools::push(pHost, pDevice, mShape.items() * sizeof(T), stream); |
||||||
|
return stream; |
||||||
|
}; |
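// Illustrative usage sketch (not part of the original header): host and device copies are kept
// in sync explicitly; updateDevice()/updateHost() return the StreamID so the transfer can be
// waited on.
//
//   CudaTools::Array<float> A({256});
//   A.setConstant(1.0f);
//   A.updateDevice().wait(); // push host data to the device
//   // ... run device code that modifies A ...
//   A.updateHost().wait();   // pull the results back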
||||||
|
}; |
||||||
|
|
||||||
|
template <typename T> |
||||||
|
void printAxis(std::ostream& out, const Array<T>& arr, const uint32_t axis, size_t width) { |
||||||
|
std::string space = std::string(2 * axis, ' '); |
||||||
|
if (arr.shape().axes() == 1) { |
||||||
|
out << "["; |
||||||
|
for (uint32_t i = 0; i < arr.shape().items(); ++i) { |
||||||
|
if constexpr (std::is_floating_point<T>::value) { |
||||||
|
out << std::scientific << std::setprecision(6); |
||||||
|
} |
||||||
|
if (width == 0) { |
||||||
|
out << ((i == 0) ? "" : " "); |
||||||
|
} else { |
||||||
|
out << std::setw((i == 0) ? width - 1 : width); |
||||||
|
} |
||||||
|
out << (T)arr[i] << ((i == arr.shape().items() - 1) ? "]" : ","); |
||||||
|
} |
||||||
|
} else if (arr.shape().axes() == 2) { |
||||||
|
for (uint32_t i = 0; i < arr.shape().dim(0); ++i) { |
||||||
|
out << space << ((i == 0) ? "[" : " "); |
||||||
|
printAxis(out, arr[i], axis + 1, width); |
||||||
|
out << ((i == arr.shape().dim(0) - 1) ? "]" : ",\n"); |
||||||
|
} |
||||||
|
} else { |
||||||
|
out << space << "[\n"; |
||||||
|
for (uint32_t i = 0; i < arr.shape().dim(0); ++i) { |
||||||
|
printAxis(out, arr[i], axis + 1, width); |
||||||
|
out << ((i == arr.shape().dim(0) - 1) ? "\n" : ",\n\n"); |
||||||
|
} |
||||||
|
out << space << "]"; |
||||||
|
} |
||||||
|
} |
||||||
|
|
||||||
|
template <typename T> std::ostream& operator<<(std::ostream& out, const Array<T>& arr) { |
||||||
|
size_t width = 0; |
||||||
|
if constexpr (IS_INT(T)) {
||||||
|
T max_val = 0; |
||||||
|
bool negative = false; |
||||||
|
for (auto it = arr.begin(); it != arr.end(); ++it) { |
||||||
|
if (*it < 0) negative = true; |
||||||
|
max_val = (abs(*it) > max_val) ? abs(*it) : max_val; |
||||||
|
} |
||||||
|
width = std::to_string(max_val).size() + 1; |
||||||
|
width += (negative) ? 1 : 0; |
||||||
|
} else if constexpr (IS_FLOAT(T)) { |
||||||
|
T max_val = 0; |
||||||
|
bool negative = false; |
||||||
|
for (auto it = arr.begin(); it != arr.end(); ++it) { |
||||||
|
if (*it < 0) negative = true; |
||||||
|
int exp = 0; |
||||||
|
frexp(*it, &exp); |
||||||
|
max_val = (exp > max_val) ? exp : max_val; |
||||||
|
} |
||||||
|
width = std::to_string(max_val).size() + 5; |
||||||
|
width += (negative) ? 1 : 0; |
||||||
|
} |
||||||
|
|
||||||
|
printAxis<T>(out, arr, 0, (arr.shape().axes() == 1) ? 0 : width); |
||||||
|
return out; |
||||||
|
} |
||||||
|
|
||||||
|
}; // namespace CudaTools
|
||||||
|
|
||||||
|
#endif // ARRAY_H

@@ -0,0 +1,600 @@
#ifndef BLAS_H |
||||||
|
#define BLAS_H |
||||||
|
|
||||||
|
#include "Array.h" |
||||||
|
#include "Core.h" |
||||||
|
#include "Macros.h" |
||||||
|
|
||||||
|
namespace CudaTools { |
||||||
|
|
||||||
|
namespace BLAS { |
||||||
|
|
||||||
|
struct BatchInfo { |
||||||
|
uint32_t strideA, strideB, strideC; |
||||||
|
uint32_t size; |
||||||
|
}; |
||||||
|
|
||||||
|
template <typename T> struct Check { |
||||||
|
static void isAtLeast2D(const Array<T>& arr, const std::string& name = "Array") { |
||||||
|
CT_ERROR_IF(arr.shape().axes(), <, 2, (name + " needs to be at least 2D").c_str()); |
||||||
|
}; |
||||||
|
|
||||||
|
static void isSquare(const Array<T>& arr, const std::string& name = "Array") { |
||||||
|
isAtLeast2D(arr, name); |
||||||
|
CT_ERROR_IF(arr.shape().rows(), !=, arr.shape().cols(), (name + " is not square").c_str()) |
||||||
|
}; |
||||||
|
|
||||||
|
static void isValidMatmul(const Array<T>& A, const Array<T>& B, const Array<T>& C, |
||||||
|
const std::string& nameA = "A", const std::string& nameB = "B", |
||||||
|
const std::string nameC = "C") { |
||||||
|
isAtLeast2D(A, nameA); |
||||||
|
isAtLeast2D(B, nameB); |
||||||
|
isAtLeast2D(C, nameC);
||||||
|
CT_ERROR_IF(A.shape().cols(), !=, B.shape().rows(), |
||||||
|
(nameA + nameB + " is not a valid matrix multiplication").c_str()); |
||||||
|
|
||||||
|
Shape ABshape({A.shape().rows(), B.shape().cols()}); |
||||||
|
Shape Cshape({C.shape().rows(), C.shape().cols()}); |
||||||
|
|
||||||
|
CT_ERROR_IF( |
||||||
|
ABshape, !=, Cshape, |
||||||
|
("The shape of " + nameA + nameB + " does not match the shape of " + nameC).c_str()); |
||||||
|
}; |
||||||
|
|
||||||
|
static uint32_t getUpperItems(const Array<T>& arr) { |
||||||
|
uint32_t upperItems = 1; |
||||||
|
for (uint32_t iAxis = 0; iAxis < arr.shape().axes() - 2; ++iAxis) { |
||||||
|
upperItems *= arr.shape().dim(iAxis); |
||||||
|
} |
||||||
|
return upperItems; |
||||||
|
}; |
||||||
|
|
||||||
|
static void matchUpperShape(const Array<T>& A, const Array<T>& B, |
||||||
|
const std::string& nameA = "A", const std::string& nameB = "B") { |
||||||
|
CT_ERROR_IF(A.shape().axes(), !=, B.shape().axes(), |
||||||
|
(nameA + " and " + nameB + " shapes do not match for broadcasting").c_str()); |
||||||
|
for (uint32_t iAxis = 0; iAxis < A.shape().axes() - 2; ++iAxis) { |
||||||
|
uint32_t Adim = A.shape().dim(iAxis); |
||||||
|
uint32_t Bdim = B.shape().dim(iAxis); |
||||||
|
CT_ERROR_IF( |
||||||
|
Adim, !=, Bdim, |
||||||
|
(nameA + " and " + nameB + " shapes do not match for broadcasting").c_str()); |
||||||
|
} |
||||||
|
}; |
||||||
|
|
||||||
|
static BatchInfo isBroadcastable(const Array<T>& A, const Array<T>& B, const Array<T>& C, |
||||||
|
const std::string& nameA = "A", const std::string& nameB = "B", |
||||||
|
const std::string nameC = "C") { |
||||||
|
isValidMatmul(A, B, C, nameA, nameB, nameC); |
||||||
|
uint32_t itemsA = getUpperItems(A); |
||||||
|
uint32_t itemsB = getUpperItems(B); |
||||||
|
uint32_t itemsC = getUpperItems(C); |
||||||
|
|
||||||
|
uint32_t Asize = A.shape().rows() * A.shape().cols(); |
||||||
|
uint32_t Bsize = B.shape().rows() * B.shape().cols(); |
||||||
|
uint32_t Csize = C.shape().rows() * C.shape().cols(); |
||||||
|
|
||||||
|
if (itemsA == itemsB) { |
||||||
|
CT_ERROR_IF(itemsA, !=, itemsC, |
||||||
|
("Incorrect dimensions to broadcast to output " + nameC).c_str()); |
||||||
|
matchUpperShape(A, B, nameA, nameB); |
||||||
|
matchUpperShape(A, C, nameA, nameC); |
||||||
|
return BatchInfo{Asize, Bsize, Csize, itemsC}; |
||||||
|
} else if (itemsA > itemsB) { |
||||||
|
CT_ERROR_IF( |
||||||
|
itemsB, !=, 1, |
||||||
|
("Cannot broadcast operation to " + nameB + " with non-matching " + nameA).c_str()); |
||||||
|
CT_ERROR_IF(itemsA, !=, itemsC, |
||||||
|
("Incorrect dimensions to broadcast to output " + nameC).c_str()); |
||||||
|
matchUpperShape(A, C, nameA, nameC); |
||||||
|
return BatchInfo{Asize, 0, Csize, itemsC}; |
||||||
|
} else { |
||||||
|
CT_ERROR_IF( |
||||||
|
itemsA, !=, 1, |
||||||
|
("Cannot broadcast operation to " + nameA + " with non-matching " + nameB).c_str()); |
||||||
|
CT_ERROR_IF(itemsB, !=, itemsC,
||||||
|
("Incorrect dimensions to broadcast to output " + nameC).c_str()); |
||||||
|
matchUpperShape(B, C, nameB, nameC); |
||||||
|
return BatchInfo{0, Bsize, Csize, itemsC}; |
||||||
|
} |
||||||
|
}; |
||||||
|
}; |
||||||
|
|
||||||
|
/**
|
||||||
|
* Represents a Batch of Arrays with the same shape. Mainly used for cuBLAS functions. |
||||||
|
*/ |
||||||
|
template <typename T> class Batch { |
||||||
|
protected: |
||||||
|
Array<T*> mBatch; |
||||||
|
Shape mShape; |
||||||
|
|
||||||
|
uint32_t mCount = 0; |
||||||
|
uint32_t mBatchSize; |
||||||
|
|
||||||
|
public: |
||||||
|
Batch() = delete; |
||||||
|
|
||||||
|
/**
|
||||||
|
* Constructs a batch from a given size. |
||||||
|
*/ |
||||||
|
Batch(const uint32_t size) : mBatchSize(size){}; |
||||||
|
|
||||||
|
/**
|
||||||
|
* Constructs a batch from a non-view Array. |
||||||
|
*/ |
||||||
|
Batch(const Array<T>& arr) { |
||||||
|
CT_ERROR(arr.isView(), "Array cannot be a view"); |
||||||
|
mShape = Shape({arr.shape().rows(), arr.shape().cols()}); |
||||||
|
mBatchSize = mCount = Check<T>::getUpperItems(arr); |
||||||
|
|
||||||
|
mBatch = Array<T*>({mBatchSize}); |
||||||
|
|
||||||
|
Array<T> batch = arr.reshaped({mBatchSize, mShape.rows(), mShape.cols()}); |
||||||
|
for (uint32_t i = 0; i < mBatchSize; ++i) { |
||||||
|
#ifdef CUDA |
||||||
|
mBatch[i] = batch[i].dataDevice(); |
||||||
|
#else |
||||||
|
mBatch[i] = batch[i].data(); |
||||||
|
#endif |
||||||
|
} |
||||||
|
|
||||||
|
mBatch.updateDevice().wait(); |
||||||
|
}; |
||||||
|
|
||||||
|
/**
|
||||||
|
* Adds a matrix to the batch. Array must be a view. |
||||||
|
*/ |
||||||
|
void add(const Array<T>& arr) { |
||||||
|
CT_ERROR(not arr.isView(), "Cannot add non-view Arrays"); |
||||||
|
CT_ERROR_IF(mCount, ==, mBatchSize, "Batch is full, cannot add more arrays"); |
||||||
|
#ifdef CUDA |
||||||
|
mBatch[mCount] = arr.dataDevice(); |
||||||
|
#else |
||||||
|
mBatch[mCount] = arr.data(); |
||||||
|
#endif |
||||||
|
if (mCount == 0) { |
||||||
|
mShape = arr.shape(); |
||||||
|
||||||
|
} else { |
||||||
|
CT_ERROR_IF(arr.shape(), !=, mShape, "Cannot add matrix of different shape to batch"); |
||||||
|
} |
||||||
|
++mCount; |
||||||
|
|
||||||
|
if (mCount == mBatchSize) { |
||||||
|
mBatch.updateDevice().wait(); |
||||||
|
} |
||||||
|
}; |
||||||
|
|
||||||
|
/**
|
||||||
|
* Indexing operator which returns a view of the Array in the Batch at the given index. |
||||||
|
*/ |
||||||
|
Array<T> operator[](const uint32_t index) const { |
||||||
|
CT_ERROR_IF(index, >=, mBatchSize, "Index exceeds batch size"); |
||||||
|
return Array<T>(mBatch[index], {mShape.rows(), mShape.cols()}); |
||||||
|
}; |
||||||
|
|
||||||
|
/**
|
||||||
|
* Returns the batch Array of pointers. |
||||||
|
*/ |
||||||
|
Array<T*> batch() const { return mBatch.view(); }; |
||||||
|
Shape shape() const { return mShape; } /**< Gets the shape of the matrices in the batch. */ |
||||||
|
uint32_t size() const { return mBatchSize; } /**< Gets the batch size.*/ |
||||||
|
bool full() const { return mBatchSize == mCount; }; /**< Gets if the batch is full. */ |
||||||
|
}; |
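// Illustrative usage sketch (not part of the original header): a Batch collects device (or
// host) pointers to equally shaped matrices, here taken from one higher-dimensional Array.
//
//   CudaTools::Array<double> stack({8, 4, 4});   // 8 matrices of shape 4 x 4
//   CudaTools::BLAS::Batch<double> batch(stack); // batch size 8
//   auto first = batch[0];                       // view of the first 4 x 4 matrix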
||||||
|
|
||||||
|
////////////////
|
||||||
|
// cuBLAS API //
|
||||||
|
////////////////
|
||||||
|
|
||||||
|
template <typename T, typename F1, typename F2, typename... Args> |
||||||
|
constexpr void invoke(F1 f1, F2 f2, Args&&... args) { |
||||||
|
if constexpr (std::is_same<T, float>::value) { |
||||||
|
CUBLAS_CHECK(f1(args...)); |
||||||
|
} else if constexpr (std::is_same<T, double>::value) { |
||||||
|
CUBLAS_CHECK(f2(args...)); |
||||||
|
} else { |
||||||
|
CT_ERROR(true, "BLAS functions are not callable with that type"); |
||||||
|
} |
||||||
|
} |
||||||
|
|
||||||
|
/**
|
||||||
|
* Computes the matrix-vector product: \f$ y = \alpha Ax + \beta y \f$. It will automatically |
||||||
|
* broadcast the operation if applicable. |
||||||
|
*/ |
||||||
|
template <typename T> |
||||||
|
StreamID GEMV(const T alpha, const Array<T>& A, const Array<T>& x, const T beta, const Array<T>& y, |
||||||
|
const StreamID& stream = DEF_CUBLAS_STREAM) { |
||||||
|
|
||||||
|
BatchInfo bi = Check<T>::isBroadcastable(A, x, y, "A", "x", "y"); |
||||||
|
CT_ERROR_IF(x.shape().cols(), !=, 1, "x must be a column vector"); |
||||||
|
CT_ERROR_IF(y.shape().cols(), !=, 1, "y must be a column vector");
||||||
|
|
||||||
|
uint32_t rows = A.shape().rows(); |
||||||
|
uint32_t cols = A.shape().cols(); |
||||||
|
T a = alpha, b = beta; |
||||||
|
#ifdef CUDA |
||||||
|
CUBLAS_CHECK( |
||||||
|
cublasSetStream(Manager::get()->cublasHandle(), Manager::get()->stream(stream.id))); |
||||||
|
if (bi.size == 1) { |
||||||
|
invoke<T>(cublasSgemv, cublasDgemv, Manager::get()->cublasHandle(), CUBLAS_OP_N, rows, cols, |
||||||
|
&a, A.dataDevice(), rows, x.dataDevice(), 1, &b, y.dataDevice(), 1); |
||||||
|
|
||||||
|
} else { // Batched case (size > 1), so broadcast.
|
||||||
|
invoke<T>(cublasSgemvStridedBatched, cublasDgemvStridedBatched, |
||||||
|
Manager::get()->cublasHandle(), CUBLAS_OP_N, rows, cols, &a, A.dataDevice(), rows, |
||||||
|
bi.strideA, x.dataDevice(), 1, bi.strideB, &b, y.dataDevice(), 1, bi.strideC, |
||||||
|
bi.size); |
||||||
|
} |
||||||
|
|
||||||
|
#else |
||||||
|
if (bi.size == 1) { |
||||||
|
y.eigenMap() = a * (A.eigenMap() * x.eigenMap()) + b * y.eigenMap(); |
||||||
|
} else { // Batched case (size > 1), so broadcast.
|
||||||
|
#pragma omp parallel for |
||||||
|
for (uint32_t i = 0; i < bi.size; ++i) { |
||||||
|
auto Ai = Array<T>(A, {rows, cols}, i * bi.strideA).eigenMap(); |
||||||
|
auto xi = Array<T>(x, {cols, 1}, i * bi.strideB).eigenMap(); |
||||||
|
auto yi = Array<T>(y, {rows, 1}, i * bi.strideC).eigenMap(); |
||||||
|
yi = a * (Ai * xi) + b * yi; |
||||||
|
} |
||||||
|
} |
||||||
|
#endif |
||||||
|
return StreamID{stream}; |
||||||
|
} |
||||||
|
|
||||||
|
/**
|
||||||
|
* Computes the matrix-matrix product: \f$ C = \alpha AB + \beta C \f$. It will automatically |
||||||
|
* broadcast the operation if applicable. |
||||||
|
*/ |
||||||
|
template <typename T> |
||||||
|
StreamID GEMM(const T alpha, const Array<T>& A, const Array<T>& B, const T beta, const Array<T>& C, |
||||||
|
const StreamID& stream = DEF_CUBLAS_STREAM) { |
||||||
|
|
||||||
|
BatchInfo bi = Check<T>::isBroadcastable(A, B, C, "A", "B", "C"); |
||||||
|
// A is m x k, B is k x n.
|
||||||
|
uint32_t m = A.shape().rows(); |
||||||
|
uint32_t k = A.shape().cols(); |
||||||
|
uint32_t n = B.shape().cols(); |
||||||
|
|
||||||
|
T a = alpha, b = beta; |
||||||
|
#ifdef CUDA |
||||||
|
CUBLAS_CHECK( |
||||||
|
cublasSetStream(Manager::get()->cublasHandle(), Manager::get()->stream(stream.id))); |
||||||
|
if (bi.size == 1) { |
||||||
|
invoke<T>(cublasSgemm, cublasDgemm, Manager::get()->cublasHandle(), CUBLAS_OP_N, |
||||||
|
CUBLAS_OP_N, m, n, k, &a, A.dataDevice(), m, B.dataDevice(), k, &b, |
||||||
|
C.dataDevice(), m); |
||||||
|
|
||||||
|
} else { // Batched case (size > 1), so broadcast.
|
||||||
|
invoke<T>(cublasSgemmStridedBatched, cublasDgemmStridedBatched, |
||||||
|
Manager::get()->cublasHandle(), CUBLAS_OP_N, CUBLAS_OP_N, m, n, k, &a, |
||||||
|
A.dataDevice(), m, bi.strideA, B.dataDevice(), k, bi.strideB, &b, C.dataDevice(), |
||||||
|
m, bi.strideC, bi.size); |
||||||
|
} |
||||||
|
|
||||||
|
#else |
||||||
|
if (bi.size == 1) { |
||||||
|
C.eigenMap() = a * (A.eigenMap() * B.eigenMap()) + b * C.eigenMap(); |
||||||
|
} else { // Batched case (size > 1), so broadcast.
|
||||||
|
#pragma omp parallel for |
||||||
|
for (uint32_t i = 0; i < bi.size; ++i) { |
||||||
|
auto Ai = Array<T>(A, {m, k}, i * bi.strideA).eigenMap(); |
||||||
|
auto Bi = Array<T>(B, {k, n}, i * bi.strideB).eigenMap(); |
||||||
|
auto Ci = Array<T>(C, {m, n}, i * bi.strideC).eigenMap(); |
||||||
|
Ci = a * (Ai * Bi) + b * Ci; |
||||||
|
} |
||||||
|
} |
||||||
|
#endif |
||||||
|
return StreamID{stream}; |
||||||
|
} |
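// Illustrative usage sketch (not part of the original header): GEMM computes
// C = alpha * A * B + beta * C and dispatches to cuBLAS under CUDA or to Eigen/OpenMP on the
// CPU; with CUDA the device copies would need to be updated before the call.
//
//   CudaTools::Array<float> A({2, 3}), B({3, 4}), C({2, 4});
//   A.setRandom(-1.0f, 1.0f); B.setRandom(-1.0f, 1.0f); C.setConstant(0.0f);
//   CudaTools::BLAS::GEMM(1.0f, A, B, 0.0f, C).wait();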
||||||
|
|
||||||
|
/**
|
||||||
|
* Computes the diagonal matrix multiplication: \f$ C = A\mathrm{diag}(X) \f$, or \f$ C = |
||||||
|
* \mathrm{diag}(X)A \f$ if left = true. |
||||||
|
*/ |
||||||
|
template <typename T> |
||||||
|
StreamID DGMM(const Array<T>& A, const Array<T>& X, const Array<T>& C, const bool left = false, |
||||||
|
const StreamID& stream = DEF_CUBLAS_STREAM) { |
||||||
|
CT_ERROR_IF(X.shape().cols(), !=, 1, "'x' must be a column vector."); |
||||||
|
if (left) { |
||||||
|
CT_ERROR_IF(A.shape().rows(), !=, X.shape().rows(), |
||||||
|
"Rows of 'A' and length of 'x' need to match."); |
||||||
|
} else { |
||||||
|
CT_ERROR_IF(A.shape().cols(), !=, X.shape().rows(), |
||||||
|
"Columns of 'A' and length of 'x' need to match."); |
||||||
|
} |
||||||
|
CT_ERROR_IF(A.shape().rows(), !=, C.shape().rows(), |
||||||
|
"Rows of 'A' and rows() of 'C' need to match."); |
||||||
|
CT_ERROR_IF(A.shape().cols(), !=, C.shape().cols(), |
||||||
|
"Rows of 'A' and columns of 'C' need to match."); |
||||||
|
|
||||||
|
#ifdef CUDA |
||||||
|
uint32_t m = C.shape().rows(); |
||||||
|
uint32_t n = C.shape().cols(); |
||||||
|
auto mode = (left) ? CUBLAS_SIDE_LEFT : CUBLAS_SIDE_RIGHT; |
||||||
|
CUBLAS_CHECK( |
||||||
|
cublasSetStream(Manager::get()->cublasHandle(), Manager::get()->stream(stream.id))); |
||||||
|
invoke<T>(cublasSdgmm, cublasDdgmm, Manager::get()->cublasHandle(), m, n, A.dataDevice(), |
||||||
|
A.shape().rows(), X.dataDevice(), 1, C.dataDevice(), m); |
||||||
|
#else |
||||||
|
if (left) { |
||||||
|
C.eigenMap() = X.eigenMap().asDiagonal() * A.eigenMap(); |
||||||
|
} else { |
||||||
|
C.eigenMap() = A.eigenMap() * X.eigenMap().asDiagonal(); |
||||||
|
} |
||||||
|
#endif |
||||||
|
return StreamID{stream}; |
||||||
|
} |
||||||
|
|
||||||
|
//////////////////////////////
|
||||||
|
// PLUArray Related Objects //
|
||||||
|
//////////////////////////////
|
||||||
|
|
||||||
|
///////////////////////////
|
||||||
|
// PartialPivLU Wrapper //
|
||||||
|
///////////////////////////
|
||||||
|
|
||||||
|
// This class is just a workaround to use Eigen's internals directly.
|
||||||
|
template <typename T> class PartialPivLU; |
||||||
|
namespace internal { |
||||||
|
template <typename T> static Array<T> empty({1, 1}); |
||||||
|
template <typename T> static EigenMapMat<T> empty_map = empty<T>.eigenMap(); |
||||||
|
}; // namespace internal
|
||||||
|
|
||||||
|
template <typename T, ENABLE_IF(IS_FLOAT(T)) = true> class PLUArray; |
||||||
|
// This is a wrapper class for Eigen's class so we have more controlled access to
|
||||||
|
// the underlying data.
|
||||||
|
template <typename T> class PartialPivLU : public Eigen::PartialPivLU<Eigen::Ref<EigenMat<T>>> { |
||||||
|
private: |
||||||
|
using Base = Eigen::PartialPivLU<Eigen::Ref<EigenMat<T>>>; |
||||||
|
template <typename U, ENABLE_IF(IS_FLOAT(U))> friend class PLUArray; |
||||||
|
|
||||||
|
EigenMapMat<T> mMapLU; |
||||||
|
EigenMapMat<int32_t> mMapPivots; |
||||||
|
|
||||||
|
public: |
||||||
|
PartialPivLU() |
||||||
|
: Base(internal::empty_map<T>), mMapLU(internal::empty_map<T>), |
||||||
|
mMapPivots(internal::empty_map<int32_t>){}; |
||||||
|
|
||||||
|
void make(const Array<T>& lu, const Array<int32_t>& pivots) { |
||||||
|
|
||||||
|
new (&mMapLU) EigenMapMat<T>(lu.eigenMap()); |
||||||
|
new (&mMapPivots) EigenMapMat<int32_t>(pivots.atLeast2D().eigenMap()); |
||||||
|
|
||||||
|
new (&this->m_lu) decltype(Base::m_lu)(mMapLU.derived()); |
||||||
|
new (&this->m_p) decltype(Base::m_p)(mMapPivots.derived()); |
||||||
|
|
||||||
|
// new (&this->m_rowsTranspositions) decltype(Base::m_rowsTranspositions)(
|
||||||
|
// mMapPivots.derived());
|
||||||
|
|
||||||
|
this->m_l1_norm = 0; |
||||||
|
this->m_det_p = 0; |
||||||
|
this->m_isInitialized = true; |
||||||
|
}; |
||||||
|
}; |
||||||
|
|
||||||
|
namespace internal { |
||||||
|
// We only create one and copy-construct to avoid the re-initialization.
|
||||||
|
template <typename T> static PartialPivLU<T> BlankPPLU = PartialPivLU<T>(); |
||||||
|
}; // namespace internal
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Class for storing the PLU decomposition of an Array. This is restricted to floating point types.
||||||
|
*/ |
||||||
|
template <typename T, ENABLE_IF(IS_FLOAT(T))> class PLUArray { |
||||||
|
private: |
||||||
|
Array<T> mLU; |
||||||
|
Array<int32_t> mPivots; |
||||||
|
PartialPivLU<T> mPPLU = internal::BlankPPLU<T>; |
||||||
|
|
||||||
|
public: |
||||||
|
PLUArray() = delete; |
||||||
|
|
||||||
|
/**
|
||||||
|
* Constructor for a PLUArray given the matrix dimension. |
||||||
|
*/ |
||||||
|
PLUArray(const uint32_t n) : mLU({n, n}), mPivots({n}) { mPPLU.make(mLU, mPivots); }; |
||||||
|
|
||||||
|
/**
|
||||||
|
* Constructor for a PLUArray given an existing array. |
||||||
|
*/ |
||||||
|
PLUArray(const Array<T>& arr) |
||||||
|
: mLU((arr.isView()) ? arr.view() : arr), mPivots({arr.shape().rows()}) { |
||||||
|
CT_ERROR_IF(mLU.shape().axes(), !=, 2, "Array must be a 2D matrix"); |
||||||
|
CT_ERROR_IF(mLU.shape().rows(), !=, mLU.shape().cols(), "Matrix must be square"); |
||||||
|
mPPLU.make(mLU, mPivots); |
||||||
|
}; |
||||||
|
|
||||||
|
/**
|
||||||
|
* Constructor for a PLUArray given an existing location in memory for both the matrix and |
||||||
|
* the pivots. |
||||||
|
*/ |
||||||
|
PLUArray(const Array<T>& arr, const Array<int32_t> pivots) |
||||||
|
: mLU(arr.view()), mPivots(pivots.view()) { |
||||||
|
CT_ERROR_IF(mLU.shape().axes(), !=, 2, "Array must be a 2D matrix"); |
||||||
|
CT_ERROR_IF(mLU.shape().rows(), !=, mLU.shape().cols(), "Matrix must be square"); |
||||||
|
mPPLU.make(mLU, mPivots); |
||||||
|
}; |
||||||
|
|
||||||
|
uint32_t rank() { return mLU.shape().rows(); }; /**< Gets the dimension of the square LU matrix. */
||||||
|
Array<T> LU() const { return mLU.view(); }; /**< Gets the LU matrix. */ |
||||||
|
Array<int32_t> pivots() const { return mPivots.view(); }; /**< Gets the pivots. */
||||||
|
|
||||||
|
/**
|
||||||
|
* Computes the in-place LU factorization for this array on the CPU.
||||||
|
*/ |
||||||
|
void computeLU() { |
||||||
|
mPPLU.compute(); |
||||||
|
mPPLU.mMapPivots = mPPLU.permutationP().indices(); |
||||||
|
}; |
||||||
|
|
||||||
|
/**
|
||||||
|
* Solves the system \f$ LUx = b \f$ and returns \f$x\f$. |
||||||
|
*/ |
||||||
|
Array<T> solve(const Array<T>& b) { |
||||||
|
Array<T> x(b.shape()); |
||||||
|
x.eigenMap() = mPPLU.solve(b.eigenMap()); |
||||||
|
return x; |
||||||
|
}; |
||||||
|
}; |
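// Illustrative usage sketch (not part of the original header): a PLUArray factorizes the
// wrapped square matrix on the CPU and then solves linear systems against it.
//
//   CudaTools::Array<double> A({3, 3}), b({3, 1});
//   A.setRandom(-1.0, 1.0); b.setConstant(1.0);
//   CudaTools::BLAS::PLUArray<double> plu(A);
//   plu.computeLU();
//   auto x = plu.solve(b); // A * x ~= b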
||||||
|
|
||||||
|
/**
|
||||||
|
* This is a batch version of PLUArray, to enable usage of the cuBLAS API. This is restricted to |
||||||
|
* floating point types. |
||||||
|
*/ |
||||||
|
template <typename T, std::enable_if_t<std::is_floating_point<T>::value, bool> = true> |
||||||
|
class PLUBatch : public Batch<T> { |
||||||
|
private: |
||||||
|
Array<int32_t> mPivotsBatch; |
||||||
|
Array<int32_t> mInfoLU; |
||||||
|
int32_t mInfoSolve; |
||||||
|
|
||||||
|
bool mInitialized = false; |
||||||
|
|
||||||
|
public: |
||||||
|
/**
|
||||||
|
* Constructor of a PLUBatch from a given batch size. |
||||||
|
*/ |
||||||
|
PLUBatch(const uint32_t size) : Batch<T>(size), mInfoLU({size}){}; |
||||||
|
|
||||||
|
/**
|
||||||
|
* Constructor of a PLUBatch from a multi-dimensional array, batched across upper dimensions. |
||||||
|
*/ |
||||||
|
PLUBatch(const Array<T>& arr) : Batch<T>(arr) { |
||||||
|
Check<T>::isSquare(arr, "LU Array"); |
||||||
|
|
||||||
|
mPivotsBatch = Array<int32_t>({this->mBatchSize * this->mShape.rows()}); |
||||||
|
mInfoLU = Array<int32_t>({this->mBatchSize}); |
||||||
|
}; |
||||||
|
|
||||||
|
/**
|
||||||
|
* Indexing operator which returns the PLUArray in the PLUBatch at the given index. |
||||||
|
*/ |
||||||
|
PLUArray<T> operator[](const uint32_t index) const { |
||||||
|
CT_ERROR_IF(index, >=, this->mBatchSize, "Index exceeds batch size"); |
||||||
|
Array<T> lu(this->mBatch[index], {this->mShape.rows(), this->mShape.cols()}); |
||||||
|
Array<int32_t> pivots(mPivotsBatch.data() + index * this->mShape.rows(), |
||||||
|
{this->mShape.rows()}); |
||||||
|
return PLUArray<T>(lu, pivots); |
||||||
|
}; |
||||||
|
|
||||||
|
/**
|
||||||
|
* Computes the in-place PLU decomposition of the batch of arrays.
||||||
|
*/ |
||||||
|
StreamID computeLU(const StreamID& stream = DEF_CUBLAS_STREAM) { |
||||||
|
#ifdef CUDA |
||||||
|
uint32_t n = this->mShape.rows(); |
||||||
|
CUBLAS_CHECK( |
||||||
|
cublasSetStream(Manager::get()->cublasHandle(), Manager::get()->stream(stream.id))); |
||||||
|
invoke<T>(cublasSgetrfBatched, cublasDgetrfBatched, Manager::get()->cublasHandle(), n, |
||||||
|
this->mBatch.dataDevice(), n, mPivotsBatch.dataDevice(), mInfoLU.dataDevice(), |
||||||
|
this->mBatchSize); |
||||||
|
|
||||||
|
#else |
||||||
|
#pragma omp parallel for |
||||||
|
for (uint32_t i = 0; i < this->mBatchSize; ++i) { |
||||||
|
(*this)[i].computeLU(); |
||||||
|
} |
||||||
|
#endif |
||||||
|
mInitialized = true; |
||||||
|
return stream; |
||||||
|
}; |
||||||
|
|
||||||
|
/**
|
||||||
|
* Solves the batched system \f$LUx = b\f$ inplace. The solution \f$x\f$ is written back into |
||||||
|
* \f$b\f$. |
||||||
|
*/ |
||||||
|
StreamID solve(const Batch<T>& b, const StreamID& stream = DEF_CUBLAS_STREAM) { |
||||||
|
CT_ERROR(not mInitialized, |
||||||
|
"Cannot solve system if PLUBatch has not yet computed its LU decomposition"); |
||||||
|
CT_ERROR_IF(b.size(), !=, this->mBatchSize, |
||||||
|
"Upper dimensions of b do not match batch size"); |
||||||
|
CT_ERROR_IF(b.shape().rows(), !=, this->mShape.rows(), |
||||||
|
"The length of each column of b must match the matrix rank"); |
||||||
|
|
||||||
|
#ifdef CUDA |
||||||
|
uint32_t n = b.shape().rows(); |
||||||
|
uint32_t nrhs = b.shape().cols(); |
||||||
|
CUBLAS_CHECK( |
||||||
|
cublasSetStream(Manager::get()->cublasHandle(), Manager::get()->stream(stream.id))); |
||||||
|
invoke<T>(cublasSgetrsBatched, cublasDgetrsBatched, Manager::get()->cublasHandle(), |
||||||
|
CUBLAS_OP_N, n, nrhs, this->mBatch.dataDevice(), n, mPivotsBatch.dataDevice(), |
||||||
|
b.batch().dataDevice(), n, &mInfoSolve, this->mBatchSize); |
||||||
|
|
||||||
|
#else |
||||||
|
#pragma omp parallel for |
||||||
|
for (uint32_t i = 0; i < this->mBatchSize; ++i) { |
||||||
|
b[i] = (*this)[i].solve(b[i]); |
||||||
|
} |
||||||
|
#endif |
||||||
|
return stream; |
||||||
|
}; |
||||||
|
|
||||||
|
/**
|
||||||
|
* Gets the pivots data from the device to the host. Does nothing for CPU. |
||||||
|
*/ |
||||||
|
StreamID getPivots(const StreamID& stream = DEF_MEM_STREAM) const { |
||||||
|
mPivotsBatch.updateHost(stream); |
||||||
|
return stream; |
||||||
|
}; |
||||||
|
|
||||||
|
/**
|
||||||
|
* Gets the info array for the LU decomposition from the device to the host. Does not
||||||
|
* return useful information for CPU. |
||||||
|
*/ |
||||||
|
Array<int32_t> getLUInfo() const { |
||||||
|
mInfoLU.updateHost().wait(); |
||||||
|
return mInfoLU; |
||||||
|
}; |
||||||
|
|
||||||
|
/**
|
||||||
|
* Checks validity of the solve operation. Does not return useful information for CPU. |
||||||
|
*/ |
||||||
|
int32_t validSolve() const { return mInfoSolve == 0; } |
||||||
|
}; |
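// Illustrative usage sketch (not part of the original header): PLUBatch runs the batched
// getrf/getrs path under CUDA and falls back to per-matrix Eigen solves on the CPU; with CUDA
// the device copies would need to be updated before factorizing.
//
//   CudaTools::Array<double> As({16, 4, 4}), bs({16, 4, 1});
//   CudaTools::BLAS::PLUBatch<double> plu(As); // batched over the leading axis
//   plu.computeLU().wait();
//   CudaTools::BLAS::Batch<double> rhs(bs);
//   plu.solve(rhs).wait();                     // solutions written back into bs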
||||||
|
|
||||||
|
// /**
|
||||||
|
// * Gets the inverse of each A[i], using an already PLU factorized A[i].
|
||||||
|
// * Only available if compiling with CUDA.
|
||||||
|
// */
|
||||||
|
// template <typename T>
|
||||||
|
// void inverseBatch(const Array<T*>& batchA, const Array<T*>& batchC, const Array<int>&
|
||||||
|
// pivots,
|
||||||
|
// const Array<int>& info, const Shape shapeA, const Shape shapeC,
|
||||||
|
// const uint stream = 0) {
|
||||||
|
// #ifdef CUDA
|
||||||
|
// CT_ERROR_IF(shapeA.rows(), !=, shapeA.cols(),
|
||||||
|
// "'A' needs to be square, rows() and column need to match.");
|
||||||
|
// CT_ERROR_IF(shapeA.rows(), !=, shapeC.cols(), "'A' needs to be the same shape as
|
||||||
|
// 'C'."); CT_ERROR_IF(shapeA.rows(), !=, shapeC.rows(), "'A' needs to be the same shape
|
||||||
|
// as 'C'.");
|
||||||
|
|
||||||
|
// CT_ERROR_IF(shapeA.rows(), !=, pivots.shape().rows(),
|
||||||
|
// "Rows()/columns of 'A' and rows() of pivots need to match.");
|
||||||
|
// CT_ERROR_IF(batchA.shape().rows(), !=, pivots.shape().cols(),
|
||||||
|
// "Batch size and columns of pivots need to match.");
|
||||||
|
// CT_ERROR_IF(info.shape().cols(), !=, 1, "Info needs to be a column vector.")
|
||||||
|
// CT_ERROR_IF(batchA.shape().rows(), !=, info.shape().rows(),
|
||||||
|
// "Batch size and length of info need to match.");
|
||||||
|
// CT_ERROR_IF(batchA.shape().rows(), !=, batchC.shape().rows(),
|
||||||
|
// "Batches 'A[i]' and 'C[i]' need to match.");
|
||||||
|
|
||||||
|
// std::string s = "cublas" + std::to_string(stream);
|
||||||
|
// CUBLAS_CHECK(
|
||||||
|
// cublasSetStream(Manager::get()->cublasHandle(),
|
||||||
|
// Manager::get()->stream(s)));
|
||||||
|
// invoke<T>(cublasSgetriBatched, cublasDgetriBatched,
|
||||||
|
// Manager::get()->cublasHandle(),
|
||||||
|
// shapeA.rows(), batchA.dataDevice(), shapeA.rows(), pivots.dataDevice(),
|
||||||
|
// batchC.dataDevice(), shapeC.rows(), info.dataDevice(),
|
||||||
|
// batchA.shape().rows());
|
||||||
|
// #else
|
||||||
|
// CT_ERROR_IF(true, ==, true, "inverseBatch is not callable without CUDA.");
|
||||||
|
// #endif
|
||||||
|
// }
|
||||||
|
|
||||||
|
}; // namespace BLAS
|
||||||
|
}; // namespace CudaTools
|
||||||
|
|
||||||
|
#endif // BLAS_H

@@ -0,0 +1,544 @@
#ifndef CUDATOOLS_H |
||||||
|
#define CUDATOOLS_H |
||||||
|
|
||||||
|
#include "Macros.h" |
||||||
|
#include <iostream> |
||||||
|
#include <string> |
||||||
|
#include <unordered_map> |
||||||
|
#include <vector> |
||||||
|
|
||||||
|
namespace CudaTools { |
||||||
|
|
||||||
|
/**
|
||||||
|
* Simple wrapper for the name of a stream. Its purpose is to allow
* 'streams' to be passed around in host code, and allows for simple syntax
||||||
|
* for waiting. |
||||||
|
*/ |
||||||
|
struct StreamID { |
||||||
|
public: |
||||||
|
std::string id; |
||||||
|
StreamID() : id(""){}; |
||||||
|
/**
|
||||||
|
* The constructor for a StreamID. |
||||||
|
*/ |
||||||
|
StreamID(const std::string& id_) : id(id_){}; |
||||||
|
StreamID(const char* id_) : id(id_){}; |
||||||
|
|
||||||
|
/**
|
||||||
|
* Waits for the stream with this stream ID. |
||||||
|
*/ |
||||||
|
void wait() const; |
||||||
|
}; |
||||||
|
|
||||||
|
static const StreamID DEF_MEM_STREAM = StreamID{"defaultMemory"}; |
||||||
|
static const StreamID DEF_CUBLAS_STREAM = StreamID{"defaultCublas"}; |
||||||
|
static const StreamID DEF_KERNEL_STREAM = StreamID{"defaultKernel"}; |
||||||
|
|
||||||
|
/**
|
||||||
|
* Allocates memory on the device. |
||||||
|
*/ |
||||||
|
void* malloc(const size_t size); |
||||||
|
|
||||||
|
/**
|
||||||
|
* Pins memory on the host. |
||||||
|
*/ |
||||||
|
void pin(void* const pHost, const size_t size); |
||||||
|
|
||||||
|
/**
|
||||||
|
 * Pushes memory from the host to the device. |
||||||
|
*/ |
||||||
|
StreamID push(void* const pHost, void* const pDevice, const size_t size, |
||||||
|
const StreamID& stream = DEF_MEM_STREAM); |
||||||
|
/**
|
||||||
|
* Pulls memory from the device back to the host. |
||||||
|
*/ |
||||||
|
StreamID pull(void* const pHost, void* const pDevice, const size_t size, |
||||||
|
const StreamID& stream = DEF_MEM_STREAM); |
||||||
|
/**
|
||||||
|
* Copies memory on the device to another location on the device. |
||||||
|
*/ |
||||||
|
StreamID deviceCopy(void* const pSrc, void* const pDest, const size_t size, |
||||||
|
const StreamID& stream = DEF_MEM_STREAM); |
||||||
|
|
||||||
|
/**
|
||||||
|
* Frees memory on the device. |
||||||
|
*/ |
||||||
|
void free(void* const pDevice); |
||||||
|
|
||||||
|
#ifdef CUDACC |
||||||
|
cudaDeviceProp getDeviceProp(); |
||||||
|
static cudaDeviceProp DeviceProperties = getDeviceProp(); |
||||||
|
const char* cublasGetErrorString(cublasStatus_t status); |
||||||
|
#endif |
||||||
|
|
||||||
|
/**
|
||||||
|
* A class that manages various CUDA Runtime components, such as |
||||||
|
* streams, events, and handles. |
||||||
|
*/ |
||||||
|
class Manager { |
||||||
|
private: |
||||||
|
static Manager mManagerInstance; |
||||||
|
Manager(const std::vector<std::string>& names); |
||||||
|
~Manager(); |
||||||
|
#ifdef CUDACC |
||||||
|
std::unordered_map<std::string, cudaStream_t> mStreams; |
||||||
|
cublasHandle_t mCublas; |
||||||
|
#endif |
||||||
|
public: |
||||||
|
/**
|
||||||
|
* Used to get the global CudaTools::Manager instance. |
||||||
|
*/ |
||||||
|
static Manager* get() { return &mManagerInstance; }; |
||||||
|
|
||||||
|
void waitFor(const StreamID& stream) const; /**< Waits for the stream provided. */ |
||||||
|
void sync() const; /**< Waits until all device code has finished. */ |
||||||
|
void addStream(const std::string& name); /**< Creates a stream with the given name. */ |
||||||
|
#ifdef CUDACC |
||||||
|
cudaStream_t stream(const StreamID& stream) const; |
||||||
|
cublasHandle_t cublasHandle() const; |
||||||
|
#endif |
||||||
|
}; |
||||||
|
|
||||||
|
namespace Kernel { |
||||||
|
|
||||||
|
/**
|
||||||
|
* A struct that contains the kernel launch parameters. |
||||||
|
*/ |
||||||
|
struct Settings { |
||||||
|
public: |
||||||
|
#ifdef CUDACC |
||||||
|
dim3 blockGrid; |
||||||
|
dim3 threadBlock; |
||||||
|
size_t sharedMemoryBytes = 0; |
||||||
|
#else |
||||||
|
size_t threads; |
||||||
|
#endif |
||||||
|
StreamID stream; |
||||||
|
|
||||||
|
Settings() = default; |
||||||
|
|
||||||
|
void setGridDim(const size_t x); /**< Sets the Grid dimensions. */ |
||||||
|
void setGridDim(const size_t x, const size_t y); /**< Sets the Grid dimensions. */ |
||||||
|
void setGridDim(const size_t x, const size_t y, |
||||||
|
const size_t z); /**< Sets the Grid dimensions. */ |
||||||
|
void setBlockDim(const size_t x); /**< Sets the Thread Block dimensions. */ |
||||||
|
void setBlockDim(const size_t x, const size_t y); /**< Sets the Thread Block dimensions. */ |
||||||
|
void setBlockDim(const size_t x, const size_t y, |
||||||
|
const size_t z); /**< Sets the Thread Block dimensions. */ |
||||||
|
|
||||||
|
void setSharedMemSize(const size_t bytes); /**< Sets the static shared memory size. */ |
||||||
|
void setStream(const StreamID& stream); /**< Sets the stream. */ |
||||||
|
}; |
||||||
|
|
||||||
|
/**
|
||||||
|
 * Returns kernel launch parameters based on the number of threads, and optionally |
||||||
|
 * a stream. Should only be used for 'embarrassingly parallel' situations, or where |
||||||
|
 * each thread corresponds to some sort of index. |
||||||
|
*/ |
||||||
|
Settings basic(const size_t threads, const StreamID& stream = DEF_KERNEL_STREAM); |
||||||
|
|
||||||
|
}; // namespace Kernel
|
||||||
|
|
||||||
|
template <typename T> class Array; |
||||||
|
|
||||||
|
/**
|
||||||
|
* A class that holds information about an Array. |
||||||
|
*/ |
||||||
|
class Shape { |
||||||
|
private: |
||||||
|
template <typename T> friend class Array; |
||||||
|
uint32_t mAxes; |
||||||
|
uint32_t mItems; |
||||||
|
uint32_t mAxisDim[CUDATOOLS_ARRAY_MAX_AXES] = {0}; |
||||||
|
uint32_t mStride[CUDATOOLS_ARRAY_MAX_AXES] = {0}; |
||||||
|
|
||||||
|
public: |
||||||
|
HD Shape() : mAxes(0), mItems(1){}; |
||||||
|
/**
|
||||||
|
* The constructor for a Shape. |
||||||
|
* \param dims an initializer list of the dimensions. |
||||||
|
*/ |
||||||
|
HD Shape(const std::initializer_list<uint32_t> dims); |
||||||
|
|
||||||
|
HD uint32_t axes() const; /**< Gets the number of axes. */ |
||||||
|
HD uint32_t items() const; /**< Gets the total number of items. */ |
||||||
|
|
||||||
|
HD uint32_t length() const; /**< For 1D shapes, gets the length. In general, gets the dimension
|
||||||
|
of the last axis. */ |
||||||
|
HD uint32_t rows() const; /**< For 2D shapes, gets the number of rows. In general, gets the
|
||||||
|
dimension of the second to last axis. */ |
||||||
|
HD uint32_t cols() const; /**< For 2D shapes, gets the number of columns. In general, gets the
|
||||||
|
dimension of the last axis. */ |
||||||
|
|
||||||
|
HD uint32_t |
||||||
|
dim(const uint32_t axis) const; /**< Gets the dimension size of the specified axis. */ |
||||||
|
HD uint32_t stride(const uint32_t axis) const; /**< Gets the stride of the specified axis. */ |
||||||
|
|
||||||
|
/**
|
||||||
|
* Gets the shape at a specific axis of this shape. |
||||||
|
 * \param axis the axis where the new shape starts. |
||||||
|
*/ |
||||||
|
HD Shape subshape(const uint32_t axis) const; |
||||||
|
|
||||||
|
HD bool operator==(const Shape& s) const; /**< Equals operator. */ |
||||||
|
HD bool operator!=(const Shape& s) const; /**< Not equals operator. */ |
||||||
|
}; |
||||||
|
|
||||||
|
std::ostream& operator<<(std::ostream& out, const Shape& s); |
||||||
|
|
||||||
|
}; // namespace CudaTools
|
||||||
|
|
||||||
|
#ifdef CUDATOOLS_IMPLEMENTATION |
||||||
|
|
||||||
|
namespace CudaTools { |
||||||
|
|
||||||
|
template <typename T, typename... Args> |
||||||
|
StreamID runKernel(T func, const Kernel::Settings& sett, Args... args) { |
||||||
|
#ifdef CUDA |
||||||
|
func<<<sett.blockGrid, sett.threadBlock, sett.sharedMemoryBytes, |
||||||
|
Manager::get()->stream(sett.stream.id)>>>(args...); |
||||||
|
#else |
||||||
|
func(args...); |
||||||
|
#endif |
||||||
|
return sett.stream; |
||||||
|
} |
||||||
|
|
||||||
|
////////////////////
|
||||||
|
// Memory Methods //
|
||||||
|
////////////////////
|
||||||
|
|
||||||
|
void StreamID::wait() const { Manager::get()->waitFor(id); } |
||||||
|
|
||||||
|
void* malloc(const size_t size) { |
||||||
|
#ifdef CUDACC |
||||||
|
void* pDevice; |
||||||
|
CUDA_CHECK(cudaMalloc(&pDevice, size)); |
||||||
|
return pDevice; |
||||||
|
#else |
||||||
|
return nullptr; |
||||||
|
#endif |
||||||
|
} |
||||||
|
|
||||||
|
void free(void* const pDevice) { |
||||||
|
#ifdef CUDACC |
||||||
|
if (pDevice != nullptr) CUDA_CHECK(cudaFree(pDevice)); |
||||||
|
#endif |
||||||
|
} |
||||||
|
|
||||||
|
StreamID push(void* const pHost, void* const pDevice, const size_t size, const StreamID& stream) { |
||||||
|
#ifdef CUDACC |
||||||
|
CUDA_CHECK(cudaMemcpyAsync(pDevice, pHost, size, cudaMemcpyHostToDevice, |
||||||
|
Manager::get()->stream(stream.id))); |
||||||
|
#endif |
||||||
|
return stream; |
||||||
|
} |
||||||
|
|
||||||
|
StreamID pull(void* const pHost, void* const pDevice, const size_t size, const StreamID& stream) { |
||||||
|
#ifdef CUDACC |
||||||
|
CUDA_CHECK(cudaMemcpyAsync(pHost, pDevice, size, cudaMemcpyDeviceToHost, |
||||||
|
Manager::get()->stream(stream.id))); |
||||||
|
#endif |
||||||
|
return stream; |
||||||
|
} |
||||||
|
|
||||||
|
StreamID deviceCopy(void* const pSrc, void* const pDest, const size_t size, |
||||||
|
const StreamID& stream) { |
||||||
|
#ifdef CUDACC |
||||||
|
CUDA_CHECK(cudaMemcpyAsync(pDest, pSrc, size, cudaMemcpyDeviceToDevice, |
||||||
|
Manager::get()->stream(stream.id))); |
||||||
|
#endif |
||||||
|
return stream; |
||||||
|
} |
||||||
|
|
||||||
|
void pin(void* const pHost, const size_t size) { |
||||||
|
#ifdef CUDACC |
||||||
|
CUDA_CHECK(cudaHostRegister(pHost, size, cudaHostRegisterDefault)); |
||||||
|
#endif |
||||||
|
} |
||||||
|
|
||||||
|
#ifdef CUDACC |
||||||
|
cudaDeviceProp getDeviceProp() { |
||||||
|
cudaSetDevice(0); |
||||||
|
cudaDeviceProp deviceProp; |
||||||
|
cudaGetDeviceProperties(&deviceProp, 0); |
||||||
|
return deviceProp; |
||||||
|
} |
||||||
|
#endif |
||||||
|
|
||||||
|
/////////////////////
|
||||||
|
// Manager Methods //
|
||||||
|
/////////////////////
|
||||||
|
|
||||||
|
Manager::Manager(const std::vector<std::string>& names) { |
||||||
|
#ifdef CUDACC |
||||||
|
for (auto name : names) { |
||||||
|
addStream(name); |
||||||
|
} |
||||||
|
CUBLAS_CHECK(cublasCreate(&mCublas)); |
||||||
|
#endif |
||||||
|
} |
||||||
|
|
||||||
|
Manager::~Manager() { |
||||||
|
#ifdef CUDACC |
||||||
|
for (auto& it : mStreams) { |
||||||
|
CUDA_CHECK(cudaStreamDestroy(it.second)); |
||||||
|
} |
||||||
|
CUBLAS_CHECK(cublasDestroy(mCublas)); |
||||||
|
#endif |
||||||
|
} |
||||||
|
|
||||||
|
void Manager::waitFor(const StreamID& stream) const { |
||||||
|
#ifdef CUDACC |
||||||
|
auto it = mStreams.find(stream.id); |
||||||
|
if (it != mStreams.end()) { |
||||||
|
CUDA_CHECK(cudaStreamSynchronize(it->second)); |
||||||
|
} else { |
||||||
|
CT_ERROR(true, ("Invalid stream " + stream.id).c_str()); |
||||||
|
} |
||||||
|
#endif |
||||||
|
} |
||||||
|
|
||||||
|
void Manager::sync() const { |
||||||
|
#ifdef CUDACC |
||||||
|
CUDA_CHECK(cudaDeviceSynchronize()); |
||||||
|
#endif |
||||||
|
} |
||||||
|
|
||||||
|
void Manager::addStream(const std::string& name) { |
||||||
|
#ifdef CUDACC |
||||||
|
cudaStream_t s; |
||||||
|
CUDA_CHECK(cudaStreamCreate(&s)); |
||||||
|
mStreams[name] = s; |
||||||
|
#endif |
||||||
|
} |
||||||
|
|
||||||
|
#ifdef CUDACC |
||||||
|
cudaStream_t Manager::stream(const StreamID& stream) const { |
||||||
|
auto it = mStreams.find(stream.id); |
||||||
|
if (it != mStreams.end()) { |
||||||
|
return it->second; |
||||||
|
} else { |
||||||
|
CT_ERROR(true, ("Invalid stream " + stream.id).c_str()); |
||||||
|
} |
||||||
|
} |
||||||
|
|
||||||
|
cublasHandle_t Manager::cublasHandle() const { return mCublas; }; |
||||||
|
|
||||||
|
Manager Manager::mManagerInstance = Manager({"defaultMemory", "defaultCublas", "defaultKernel"}); |
||||||
|
#else |
||||||
|
Manager Manager::mManagerInstance = Manager({""}); |
||||||
|
#endif |
||||||
|
|
||||||
|
////////////////////
|
||||||
|
// Kernel Methods //
|
||||||
|
////////////////////
|
||||||
|
|
||||||
|
namespace Kernel { |
||||||
|
|
||||||
|
void Settings::setGridDim(const size_t x) { |
||||||
|
#ifdef CUDACC |
||||||
|
CT_ERROR_IF(x, >, DeviceProperties.maxGridSize[0], "Total grid size too large."); |
||||||
|
blockGrid.x = x; |
||||||
|
blockGrid.y = 1; |
||||||
|
blockGrid.z = 1; |
||||||
|
#endif |
||||||
|
} |
||||||
|
|
||||||
|
void Settings::setGridDim(const size_t x, const size_t y) { |
||||||
|
#ifdef CUDACC |
||||||
|
CT_ERROR_IF(x * y, >, DeviceProperties.maxGridSize[0], "Total grid size too large."); |
||||||
|
CT_ERROR_IF(x, >, DeviceProperties.maxGridSize[0], "Grid dimension 'x' too large."); |
||||||
|
CT_ERROR_IF(y, >, DeviceProperties.maxGridSize[1], "Grid dimension 'y' too large."); |
||||||
|
blockGrid.x = x; |
||||||
|
blockGrid.y = y; |
||||||
|
blockGrid.z = 1; |
||||||
|
#endif |
||||||
|
} |
||||||
|
|
||||||
|
void Settings::setGridDim(const size_t x, const size_t y, const size_t z) { |
||||||
|
#ifdef CUDACC |
||||||
|
CT_ERROR_IF(x * y * z, >, DeviceProperties.maxGridSize[0], "Total grid size too large."); |
||||||
|
CT_ERROR_IF(x, >, DeviceProperties.maxGridSize[0], "Grid dimension 'x' too large."); |
||||||
|
CT_ERROR_IF(y, >, DeviceProperties.maxGridSize[1], "Grid dimension 'y' too large."); |
||||||
|
CT_ERROR_IF(z, >, DeviceProperties.maxGridSize[2], "Grid dimension 'z' too large."); |
||||||
|
blockGrid.x = x; |
||||||
|
blockGrid.y = y; |
||||||
|
blockGrid.z = z; |
||||||
|
#endif |
||||||
|
} |
||||||
|
|
||||||
|
void Settings::setBlockDim(const size_t x) { |
||||||
|
#ifdef CUDACC |
||||||
|
CT_ERROR_IF(x, >, DeviceProperties.maxThreadsDim[0], "Total block size too large."); |
||||||
|
threadBlock.x = x; |
||||||
|
threadBlock.y = 1; |
||||||
|
threadBlock.z = 1; |
||||||
|
#endif |
||||||
|
} |
||||||
|
|
||||||
|
void Settings::setBlockDim(const size_t x, const size_t y) { |
||||||
|
#ifdef CUDACC |
||||||
|
CT_ERROR_IF(x * y, >, DeviceProperties.maxThreadsDim[0], "Total block size too large."); |
||||||
|
CT_ERROR_IF(x, >, DeviceProperties.maxThreadsDim[0], "Block dimension 'x' too large."); |
||||||
|
CT_ERROR_IF(y, >, DeviceProperties.maxThreadsDim[1], "Block dimension 'y' too large."); |
||||||
|
threadBlock.x = x; |
||||||
|
threadBlock.y = y; |
||||||
|
threadBlock.z = 1; |
||||||
|
#endif |
||||||
|
} |
||||||
|
|
||||||
|
void Settings::setBlockDim(const size_t x, const size_t y, const size_t z) { |
||||||
|
#ifdef CUDACC |
||||||
|
CT_ERROR_IF(x * y * z, >, DeviceProperties.maxThreadsDim[0], "Total block size too large."); |
||||||
|
CT_ERROR_IF(x, >, DeviceProperties.maxThreadsDim[0], "Block dimension 'x' too large."); |
||||||
|
CT_ERROR_IF(y, >, DeviceProperties.maxThreadsDim[1], "Block dimension 'y' too large."); |
||||||
|
CT_ERROR_IF(z, >, DeviceProperties.maxThreadsDim[2], "Block dimension 'z' too large."); |
||||||
|
threadBlock.x = x; |
||||||
|
threadBlock.y = y; |
||||||
|
threadBlock.z = z; |
||||||
|
#endif |
||||||
|
} |
||||||
|
|
||||||
|
void Settings::setSharedMemSize(const size_t bytes) { |
||||||
|
#ifdef CUDACC |
||||||
|
sharedMemoryBytes = bytes; |
||||||
|
#endif |
||||||
|
} |
||||||
|
|
||||||
|
void Settings::setStream(const StreamID& stream_) { |
||||||
|
#ifdef CUDACC |
||||||
|
stream.id = stream_.id; |
||||||
|
#endif |
||||||
|
} |
||||||
|
|
||||||
|
Settings basic(const size_t threads, const StreamID& stream) { |
||||||
|
Settings sett; |
||||||
|
#ifdef CUDACC |
||||||
|
auto max_threads = DeviceProperties.maxThreadsPerBlock; |
||||||
|
size_t grid_blocks = (threads + max_threads - 1) / max_threads; // ceil(threads / max_threads)
|
||||||
|
size_t block_threads = (threads + grid_blocks - 1) / grid_blocks; // ceil(threads / grid_blocks)
|
||||||
|
sett.setGridDim(grid_blocks); |
||||||
|
sett.setBlockDim(block_threads); |
||||||
|
sett.setStream(stream); |
||||||
|
#else |
||||||
|
sett.threads = threads; |
||||||
|
#endif |
||||||
|
return sett; |
||||||
|
} |
||||||
|
} // namespace Kernel
|
||||||
|
|
||||||
|
/////////////////////
|
||||||
|
// Shape Functions //
|
||||||
|
/////////////////////
|
||||||
|
|
||||||
|
HD Shape::Shape(const std::initializer_list<uint32_t> dims) : mAxes(dims.size()), mItems(1) { |
||||||
|
CT_ERROR_IF(dims.size(), >, CUDATOOLS_ARRAY_MAX_AXES, "Number of axes exceeds max axes"); |
||||||
|
mAxes = dims.size(); |
||||||
|
if (mAxes == 0) return; |
||||||
|
|
||||||
|
auto it = dims.end() - 1; |
||||||
|
mItems = 1; |
||||||
|
for (uint32_t iAxis = mAxes - 1; iAxis < mAxes; --iAxis) { |
||||||
|
uint32_t dim = *it; |
||||||
|
CT_ERROR_IF(dim, ==, 0, "Axis dimension cannot be 0"); |
||||||
|
|
||||||
|
mAxisDim[iAxis] = dim; |
||||||
|
mStride[iAxis] = mItems; |
||||||
|
mItems *= dim; |
||||||
|
--it; |
||||||
|
} |
||||||
|
|
||||||
|
if (mAxes == 1) return; |
||||||
|
// Swap last two, for column major storage.
|
||||||
|
mStride[mAxes - 2] = 1; |
||||||
|
mStride[mAxes - 1] = mAxisDim[mAxes - 2]; |
||||||
|
} |
||||||
|
|
||||||
|
HD uint32_t Shape::axes() const { return mAxes; }; |
||||||
|
HD uint32_t Shape::items() const { return mItems; }; |
||||||
|
HD uint32_t Shape::length() const { return mAxisDim[mAxes - 1]; } |
||||||
|
|
||||||
|
HD uint32_t Shape::rows() const { return mAxisDim[mAxes - 2]; } |
||||||
|
|
||||||
|
HD uint32_t Shape::cols() const { return mAxisDim[mAxes - 1]; } |
||||||
|
|
||||||
|
HD uint32_t Shape::dim(const uint32_t axis) const { return mAxisDim[axis]; } |
||||||
|
HD uint32_t Shape::stride(const uint32_t axis) const { return mStride[axis]; } |
||||||
|
|
||||||
|
HD bool Shape::operator==(const Shape& s) const { |
||||||
|
if (mAxes != s.mAxes) { |
||||||
|
return false; |
||||||
|
} |
||||||
|
for (uint32_t iAxis = 0; iAxis < mAxes; ++iAxis) { |
||||||
|
if (mAxisDim[iAxis] != s.mAxisDim[iAxis]) { |
||||||
|
return false; |
||||||
|
} |
||||||
|
} |
||||||
|
return true; |
||||||
|
} |
||||||
|
|
||||||
|
HD bool Shape::operator!=(const Shape& s) const { return not(*this == s); } |
||||||
|
|
||||||
|
HD Shape Shape::subshape(const uint32_t axis) const { |
||||||
|
CT_ERROR_IF(axis, >, mAxes, "Axis number exceeds number of axes."); |
||||||
|
if (axis == mAxes) return Shape({1}); |
||||||
|
|
||||||
|
Shape new_shape({}); |
||||||
|
new_shape.mAxes = mAxes - axis; |
||||||
|
new_shape.mItems = mItems; |
||||||
|
|
||||||
|
for (uint32_t iAxis = 0; iAxis < axis; iAxis++) { |
||||||
|
new_shape.mItems /= mAxisDim[iAxis]; |
||||||
|
} |
||||||
|
for (uint32_t iAxis = axis; iAxis < mAxes; iAxis++) { |
||||||
|
new_shape.mAxisDim[iAxis - axis] = mAxisDim[iAxis]; |
||||||
|
new_shape.mStride[iAxis - axis] = mStride[iAxis]; |
||||||
|
} |
||||||
|
return new_shape; |
||||||
|
} |
||||||
|
|
||||||
|
std::ostream& operator<<(std::ostream& out, const Shape& s) { |
||||||
|
out << "("; |
||||||
|
if (s.axes() == 0) return out << ")"; |
||||||
|
for (uint32_t iAxis = 0; iAxis < s.axes() - 1; ++iAxis) { |
||||||
|
out << s.dim(iAxis) << ", "; |
||||||
|
} |
||||||
|
return out << s.dim(s.axes() - 1) << ")"; |
||||||
|
} |
||||||
|
|
||||||
|
#ifdef CUDACC |
||||||
|
const char* cublasGetErrorString(cublasStatus_t error) { |
||||||
|
switch (error) { |
||||||
|
case CUBLAS_STATUS_SUCCESS: |
||||||
|
return "CUBLAS_STATUS_SUCCESS"; |
||||||
|
|
||||||
|
case CUBLAS_STATUS_NOT_INITIALIZED: |
||||||
|
return "CUBLAS_STATUS_NOT_INITIALIZED"; |
||||||
|
|
||||||
|
case CUBLAS_STATUS_ALLOC_FAILED: |
||||||
|
return "CUBLAS_STATUS_ALLOC_FAILED"; |
||||||
|
|
||||||
|
case CUBLAS_STATUS_INVALID_VALUE: |
||||||
|
return "CUBLAS_STATUS_INVALID_VALUE"; |
||||||
|
|
||||||
|
case CUBLAS_STATUS_ARCH_MISMATCH: |
||||||
|
return "CUBLAS_STATUS_ARCH_MISMATCH"; |
||||||
|
|
||||||
|
case CUBLAS_STATUS_MAPPING_ERROR: |
||||||
|
return "CUBLAS_STATUS_MAPPING_ERROR"; |
||||||
|
|
||||||
|
case CUBLAS_STATUS_EXECUTION_FAILED: |
||||||
|
return "CUBLAS_STATUS_EXECUTION_FAILED"; |
||||||
|
|
||||||
|
case CUBLAS_STATUS_INTERNAL_ERROR: |
||||||
|
return "CUBLAS_STATUS_INTERNAL_ERROR"; |
||||||
|
} |
||||||
|
|
||||||
|
return "<unknown>"; |
||||||
|
} |
||||||
|
#endif |
||||||
|
|
||||||
|
}; // namespace CudaTools
|
||||||
|
#endif // CUDATOOLS_IMPLEMENTATION
|
||||||
|
|
||||||
|
#endif // CUDATOOLS_H
|
@ -0,0 +1,297 @@ |
|||||||
|
#ifndef MACROS_H |
||||||
|
#define MACROS_H |
||||||
|
|
||||||
|
#include <exception> |
||||||
|
#include <sstream> |
||||||
|
#include <stdarg.h> |
||||||
|
|
||||||
|
#if defined(CUDA) && defined(__CUDACC__) |
||||||
|
#define CUDACC |
||||||
|
#endif |
||||||
|
|
||||||
|
#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ > 0) |
||||||
|
#define DEVICE |
||||||
|
#endif |
||||||
|
|
||||||
|
#ifdef CUDATOOLS_DOXYGEN |
||||||
|
/**
|
||||||
|
* \def CUDACC |
||||||
|
* This macro is defined when this code is being compiled by nvcc and the CUDA compilation |
||||||
|
* flag is set. This should be used to enclose code where CUDA specific libraries and syntax are |
||||||
|
* being used. |
||||||
|
*/ |
||||||
|
#define CUDACC |
||||||
|
|
||||||
|
/**
|
||||||
|
* \def DEVICE |
||||||
|
* This macro is defined when this code is being compiled for the device. The difference between |
||||||
|
 * this and CUDACC is that this should exclusively be used to decide if code is being compiled |
||||||
|
 * to execute on the device. CUDACC only determines which compiler is being used. |
||||||
|
*/ |
||||||
|
#define DEVICE |
||||||
|
|
||||||
|
/**
|
||||||
|
* \def HD |
||||||
|
* Mark a function in front with this if it needs to be callable on both the |
||||||
|
* CPU and CUDA device. |
||||||
|
*/ |
||||||
|
#define HD |
||||||
|
|
||||||
|
/**
|
||||||
|
* \def SHARED |
||||||
|
* Mark a variable as static shared memory. |
||||||
|
*/ |
||||||
|
#define SHARED |
||||||
|
|
||||||
|
/**
|
||||||
|
* \def DECLARE_KERNEL(call, ...) |
||||||
|
* Used to declare (in header) a CUDA kernel. |
||||||
|
* \param call the name of the kernel |
||||||
|
* \param ... the arguments of the kernel |
||||||
|
*/ |
||||||
|
#define DECLARE_KERNEL(call, ...) |
||||||
|
|
||||||
|
/**
|
||||||
|
* \def DEFINE_KERNEL(call, ...) |
||||||
|
* Used to define (in implementation) a CUDA kernel. |
||||||
|
* \param call the name of the kernel |
||||||
|
* \param ... the arguments of the kernel |
||||||
|
*/ |
||||||
|
#define DEFINE_KERNEL(call, ...) |
||||||
|
|
||||||
|
/**
|
||||||
|
* \def KERNEL(call, settings, ...) |
||||||
|
* Used to call a CUDA kernel. |
||||||
|
* \param call the name of the kernel |
||||||
|
* \param settings the associated CudaTools::Kernel::Settings to initialize the kernel with |
||||||
|
* \param ... the arguments of the kernel |
||||||
|
*/ |
||||||
|
#define KERNEL(call, settings, ...) |
||||||
|
|
||||||
|
/**
|
||||||
|
* \def BASIC_LOOP(N) |
||||||
|
 * Can be used in conjunction with CudaTools::Kernel::basic, which is mainly used for embarrassingly |
||||||
|
* parallel situations. Exposes the loop/thread number as iThread. |
||||||
|
* \param N number of iterations |
||||||
|
*/ |
||||||
|
#define BASIC_LOOP(N) |
||||||
|
|
||||||
|
/**
|
||||||
|
* \def DEVICE_CLASS(name) |
||||||
|
* Can be used inside a class declaration (header) which generates boilerplate code to allow this |
||||||
|
* class to be used on the device. |
||||||
|
* |
||||||
|
* This macro creates a few functions:\n |
||||||
|
* name* that(): returns the pointer to this instance on the device. |
||||||
|
* |
||||||
|
* void allocateDevice(): allocates the memory on the device for this class instance. |
||||||
|
* |
||||||
|
* CudaTools::StreamID updateHost(const CudaTools::StreamID& stream): updates the host instance |
||||||
|
* of the class. |
||||||
|
* |
||||||
|
* CudaTools::StreamID updateDevice(const CudaTools::StreamID& stream): updates |
||||||
|
* the device instance of the class. |
||||||
|
* \param name the name of the class |
||||||
|
*/ |
||||||
|
#define DEVICE_CLASS(name) |
||||||
|
|
||||||
|
/**
|
||||||
|
* \def CT_ERROR_IF(a, op, b, msg) |
||||||
|
* Used for throwing runtime errors given a condition with an operator. |
||||||
|
*/ |
||||||
|
#define CT_ERROR_IF(a, op, b, msg) |
||||||
|
|
||||||
|
/**
|
||||||
|
* \def CT_ERROR(a, msg) |
||||||
|
* Used for throwing runtime errors given a bool. |
||||||
|
*/ |
||||||
|
#define CT_ERROR(a, msg) |
||||||
|
|
||||||
|
/**
|
||||||
|
* \def CUDA_CHECK(call) |
||||||
|
* Gets the error generated by a CUDA function call if there is one. |
||||||
|
* \param call CUDA function to check if there are errors when running. |
||||||
|
*/ |
||||||
|
#define CUDA_CHECK(call) |
||||||
|
|
||||||
|
/**
|
||||||
|
* \def CUBLAS_CHECK(call) |
||||||
|
* Gets the error generated by a cuBLAS function call if there is one. |
||||||
|
* \param call cuBLAS function to check if there are errors when running. |
||||||
|
*/ |
||||||
|
#define CUBLAS_CHECK(call) |
||||||
|
|
||||||
|
/**
|
||||||
|
 * \def CUDA_MEM(call, name) |
||||||
|
 * Reports the GPU memory used by a function call. |
||||||
|
* \param call function to measure memory usage. |
||||||
|
* \param name an identifier to use as a variable and when printing. Must satisfy variable naming. |
||||||
|
*/ |
||||||
|
#define CUDA_MEM(call, name) |
||||||
|
#endif |
||||||
|
|
||||||
|
///////////////////
|
||||||
|
// KERNEL MACROS //
|
||||||
|
///////////////////
|
||||||
|
|
||||||
|
#ifdef CUDACC |
||||||
|
|
||||||
|
#include <cublas_v2.h> |
||||||
|
#include <cuda_runtime.h> |
||||||
|
|
||||||
|
#define HD __host__ __device__ |
||||||
|
#define SHARED __shared__ |
||||||
|
|
||||||
|
#define DECLARE_KERNEL(call, ...) __global__ void call(__VA_ARGS__) |
||||||
|
|
||||||
|
#define DEFINE_KERNEL(call, ...) \ |
||||||
|
template CudaTools::StreamID CudaTools::runKernel( \
|
||||||
|
void (*)(__VA_ARGS__), const CudaTools::Kernel::Settings&, __VA_ARGS__); \
|
||||||
|
__global__ void call(__VA_ARGS__) |
||||||
|
|
||||||
|
#else |
||||||
|
#define HD |
||||||
|
#define SHARED |
||||||
|
|
||||||
|
#define DECLARE_KERNEL(call, ...) void call(__VA_ARGS__) |
||||||
|
|
||||||
|
#define DEFINE_KERNEL(call, ...) \ |
||||||
|
template CudaTools::StreamID CudaTools::runKernel( \
|
||||||
|
void (*)(__VA_ARGS__), const CudaTools::Kernel::Settings&, __VA_ARGS__); \
|
||||||
|
void call(__VA_ARGS__) |
||||||
|
|
||||||
|
#endif // CUDACC
|
||||||
|
|
||||||
|
#define KERNEL(call, settings, ...) CudaTools::runKernel(call, settings, __VA_ARGS__) |
||||||
|
|
||||||
|
///////////////////
|
||||||
|
// DEVICE MACROS //
|
||||||
|
///////////////////
|
||||||
|
|
||||||
|
#ifdef DEVICE |
||||||
|
|
||||||
|
#define BASIC_LOOP(N) \ |
||||||
|
uint32_t iThread = blockIdx.x * blockDim.x + threadIdx.x; \
|
||||||
|
if (iThread < N) |
||||||
|
#else |
||||||
|
#define BASIC_LOOP(N) _Pragma("omp parallel for") for (uint32_t iThread = 0; iThread < N; ++iThread) |
||||||
|
|
||||||
|
#endif |
||||||
|
|
||||||
|
//////////////////
|
||||||
|
// CLASS MACROS //
|
||||||
|
//////////////////
|
||||||
|
|
||||||
|
#define UPDATE_FUNC(name) \ |
||||||
|
inline CudaTools::StreamID updateHost(const CudaTools::StreamID& stream = \
|
||||||
|
CudaTools::DEF_MEM_STREAM) { \
|
||||||
|
return CudaTools::pull(this, that(), sizeof(name), stream); \
|
||||||
|
}; \
|
||||||
|
inline CudaTools::StreamID updateDevice(const CudaTools::StreamID& stream = \
|
||||||
|
CudaTools::DEF_MEM_STREAM) { \
|
||||||
|
return CudaTools::push(this, that(), sizeof(name), stream); \
|
||||||
|
} |
||||||
|
|
||||||
|
#ifdef CUDA |
||||||
|
|
||||||
|
#define DEVICE_CLASS(name) \ |
||||||
|
private: \
|
||||||
|
name* __deviceInstance__ = nullptr; \
|
||||||
|
\
|
||||||
|
public: \
|
||||||
|
inline name* that() { return __deviceInstance__; } \
|
||||||
|
inline void allocateDevice() { __deviceInstance__ = (name*)CudaTools::malloc(sizeof(name)); }; \
|
||||||
|
UPDATE_FUNC(name) |
||||||
|
|
||||||
|
#else |
||||||
|
#define DEVICE_CLASS(name) \ |
||||||
|
public: \
|
||||||
|
inline name* that() { return this; }; \
|
||||||
|
inline void allocateDevice(){}; \
|
||||||
|
UPDATE_FUNC(name) |
||||||
|
|
||||||
|
#endif |
||||||
|
|
||||||
|
#ifndef CUDATOOLS_ARRAY_MAX_AXES |
||||||
|
/**
|
||||||
|
* \def CUDATOOLS_ARRAY_MAX_AXES |
||||||
|
 * The maximum number of axes/dimensions a CudaTools::Array can have. The default is |
||||||
|
 * set to 4, but can be manually set to fit the program's needs. |
||||||
|
*/ |
||||||
|
#define CUDATOOLS_ARRAY_MAX_AXES 4 |
||||||
|
#endif |
||||||
|
|
||||||
|
////////////////////
|
||||||
|
// Error Checking //
|
||||||
|
////////////////////
|
||||||
|
|
||||||
|
#ifndef NO_DIMENSION_CHECK |
||||||
|
#ifdef DEVICE |
||||||
|
#define CT_ERROR_IF(a, op, b, msg) \ |
||||||
|
if (a op b) { \
|
||||||
|
printf("[ERROR] %s:%d\n | %s: (" #a ") " #op " (" #b ").\n", __FILE__, __LINE__, msg); \
|
||||||
|
} |
||||||
|
|
||||||
|
#define CT_ERROR(a, msg) \ |
||||||
|
if (a) { \
|
||||||
|
printf("[ERROR] %s:%d\n | %s: " #a ".\n", __FILE__, __LINE__, msg); \
|
||||||
|
} |
||||||
|
#else |
||||||
|
|
||||||
|
#define CT_ERROR_IF(a, op, b, msg) \ |
||||||
|
if (a op b) { \
|
||||||
|
std::ostringstream os_a; \
|
||||||
|
std::ostringstream os_b; \
|
||||||
|
os_a << a; \
|
||||||
|
os_b << b; \
|
||||||
|
printf("[ERROR] %s:%d\n | %s: (" #a ")%s " #op " (" #b ")%s.\n", __FILE__, __LINE__, msg, \
|
||||||
|
os_a.str().c_str(), os_b.str().c_str()); \
|
||||||
|
throw std::exception(); \
|
||||||
|
} |
||||||
|
|
||||||
|
#define CT_ERROR(a, msg) \ |
||||||
|
if (a) { \
|
||||||
|
printf("[ERROR] %s:%d\n | %s: " #a ".\n", __FILE__, __LINE__, msg); \
|
||||||
|
throw std::exception(); \
|
||||||
|
} |
||||||
|
#endif |
||||||
|
|
||||||
|
#endif // NO_DIMENSION_CHECK
|
||||||
|
|
||||||
|
#if defined(CUDACC) && !defined(NO_CUDA_CHECK) |
||||||
|
|
||||||
|
#define CUDA_CHECK(call) \ |
||||||
|
do { \
|
||||||
|
cudaError_t err = (call); \
|
||||||
|
if (err != cudaSuccess) { \
|
||||||
|
printf("[CUDA] %s:%d\n | %s\n", __FILE__, __LINE__, cudaGetErrorString(err)); \
|
||||||
|
throw std::exception(); \
|
||||||
|
} \
|
||||||
|
} while (0) |
||||||
|
|
||||||
|
#define CUBLAS_CHECK(call) \ |
||||||
|
do { \
|
||||||
|
cublasStatus_t err = (call); \
|
||||||
|
if (err != CUBLAS_STATUS_SUCCESS) { \
|
||||||
|
printf("[cuBLAS] %s:%d\n | %s\n", __FILE__, __LINE__, \
|
||||||
|
CudaTools::cublasGetErrorString(err)); \
|
||||||
|
throw std::exception(); \
|
||||||
|
} \
|
||||||
|
} while (0) |
||||||
|
|
||||||
|
#define CUDA_MEM(call, name) \ |
||||||
|
size_t free_bef_##name, free_aft_##name; \
|
||||||
|
cudaMemGetInfo(&free_bef_##name, NULL); \
|
||||||
|
call; \
|
||||||
|
CudaTools::Manager::get()->sync(); \
|
||||||
|
cudaMemGetInfo(&free_aft_##name, NULL); \
|
||||||
|
printf("[%s] GPU Memory Usage: %iMiB\n", #name, \
|
||||||
|
(free_bef_##name - free_aft_##name) / (1024 * 1024)); |
||||||
|
|
||||||
|
#else |
||||||
|
#define CUDA_CHECK(call) (call) |
||||||
|
#define CUBLAS_CHECK(call) (call) |
||||||
|
#define CUDA_MEM(call, name) (call) |
||||||
|
#endif |
||||||
|
|
||||||
|
#endif // MACROS_H
|
@ -0,0 +1,95 @@ |
|||||||
|
CC := g++-10
|
||||||
|
NVCC := nvcc
|
||||||
|
CFLAGS := -Wall -std=c++17 -fopenmp -MMD
|
||||||
|
NVCC_FLAGS := -MMD -w -Xcompiler
|
||||||
|
|
||||||
|
INCLUDE :=
|
||||||
|
LIBS_DIR :=
|
||||||
|
LIBS_DIR_GPU := /usr/local/cuda/lib64
|
||||||
|
LIBS :=
|
||||||
|
LIBS_GPU := cuda cudart cublas
|
||||||
|
|
||||||
|
TARGET = tests
|
||||||
|
SRC_DIR = .
|
||||||
|
BUILD_DIR = build
|
||||||
|
|
||||||
|
# Should not need to modify below.
|
||||||
|
|
||||||
|
CPU_BUILD_DIR = $(BUILD_DIR)/cpu
|
||||||
|
GPU_BUILD_DIR = $(BUILD_DIR)/gpu
|
||||||
|
|
||||||
|
SRC = $(wildcard $(SRC_DIR)/*/*.cpp) $(wildcard $(SRC_DIR)/*.cpp)
|
||||||
|
|
||||||
|
# Get source files and object files.
|
||||||
|
GCC_SRC = $(filter-out %.cu.cpp ,$(SRC))
|
||||||
|
NVCC_SRC = $(filter %.cu.cpp, $(SRC))
|
||||||
|
GCC_OBJ = $(GCC_SRC:$(SRC_DIR)/%.cpp=%.o)
|
||||||
|
NVCC_OBJ = $(NVCC_SRC:$(SRC_DIR)/%.cpp=%.o)
|
||||||
|
|
||||||
|
# If compiling for CPU, all go to GCC. Otherwise, they are split.
|
||||||
|
CPU_OBJ = $(addprefix $(CPU_BUILD_DIR)/,$(GCC_OBJ)) $(addprefix $(CPU_BUILD_DIR)/,$(NVCC_OBJ))
|
||||||
|
GPU_GCC_OBJ = $(addprefix $(GPU_BUILD_DIR)/,$(GCC_OBJ))
|
||||||
|
GPU_NVCC_OBJ = $(addprefix $(GPU_BUILD_DIR)/,$(NVCC_OBJ))
|
||||||
|
|
||||||
|
# $(info $$GCC_SRC is [${GCC_SRC}])
|
||||||
|
# $(info $$NVCC_SRC is [${NVCC_SRC}])
|
||||||
|
# $(info $$GCC_OBJ is [${GCC_OBJ}])
|
||||||
|
# $(info $$NVCC_OBJ is [${NVCC_OBJ}])
|
||||||
|
|
||||||
|
# $(info $$CPU_OBJ is [${CPU_OBJ}])
|
||||||
|
# $(info $$GPU_GCC_OBJ is [${GPU_GCC_OBJ}])
|
||||||
|
# $(info $$GPU_NVCC_OBJ is [${GPU_NVCC_OBJ}])
|
||||||
|
|
||||||
|
HEADER = $(wildcard $(SRC_DIR)/*/*.h) $(wildcard $(SRC_DIR)/*.h)
|
||||||
|
CPU_DEPS = $(wildcard $(CPU_BUILD_DIR)/*.d)
|
||||||
|
GPU_DEPS = $(wildcard $(GPU_BUILD_DIR)/*.d)
|
||||||
|
|
||||||
|
INC := $(INCLUDE:%=-I%)
|
||||||
|
LIB := $(LIBS_DIR:%=-L%)
|
||||||
|
LIB_GPU := $(LIBS_DIR_GPU:%=-L%)
|
||||||
|
LD := $(LIBS:%=-l%)
|
||||||
|
LD_GPU := $(LIBS_GPU:%=-l%)
|
||||||
|
|
||||||
|
# Reminder:
|
||||||
|
# $< = first prerequisite
|
||||||
|
# $@ = the target which matched the rule
|
||||||
|
# $^ = all prerequisites
|
||||||
|
|
||||||
|
.PHONY: all clean |
||||||
|
|
||||||
|
all : cpu gpu |
||||||
|
|
||||||
|
cpu: $(TARGET)CPU |
||||||
|
gpu: $(TARGET)GPU |
||||||
|
|
||||||
|
$(TARGET)CPU: $(CPU_OBJ) |
||||||
|
$(CC) $(CFLAGS) $^ -o $@ $(INC) $(LIB) $(LDFLAGS)
|
||||||
|
|
||||||
|
$(CPU_BUILD_DIR)/%.o $(CPU_BUILD_DIR)/%.cu.o: $(SRC_DIR)/%.cpp | $(CPU_BUILD_DIR) |
||||||
|
$(CC) $(CFLAGS) -c -o $@ $< $(INC)
|
||||||
|
|
||||||
|
# For GPU, we need to build the NVCC objects, the NVCC linked object, and the
|
||||||
|
# regular ones. Then, we link them all together.
|
||||||
|
$(TARGET)GPU: $(GPU_BUILD_DIR)/link.o $(GPU_GCC_OBJ) | $(GPU_BUILD_DIR) |
||||||
|
$(CC) -g -DCUDA $(CFLAGS) $(GPU_NVCC_OBJ) $^ -o $@ $(INC) $(LIB) $(LIB_GPU) $(LD) $(LD_GPU)
|
||||||
|
|
||||||
|
$(GPU_BUILD_DIR)/link.o: $(GPU_NVCC_OBJ) | $(GPU_BUILD_DIR) |
||||||
|
$(NVCC) --device-link $^ -o $@
|
||||||
|
|
||||||
|
$(GPU_BUILD_DIR)/%.cu.o: $(SRC_DIR)/%.cu.cpp | $(GPU_BUILD_DIR) |
||||||
|
$(NVCC) $(NVCC_FLAGS) -DCUDA -x cu --device-c -o $@ $< $(INC)
|
||||||
|
|
||||||
|
$(GPU_BUILD_DIR)/%.o: $(SRC_DIR)/%.cpp | $(GPU_BUILD_DIR) |
||||||
|
$(CC) $(CFLAGS) -g -DCUDA -c -o $@ $< $(INC)
|
||||||
|
|
||||||
|
-include $(CPU_DEPS) |
||||||
|
-include $(GPU_DEPS) |
||||||
|
|
||||||
|
$(CPU_BUILD_DIR): |
||||||
|
mkdir -p $@
|
||||||
|
|
||||||
|
$(GPU_BUILD_DIR): |
||||||
|
mkdir -p $@
|
||||||
|
|
||||||
|
clean: |
||||||
|
rm -Rf $(BUILD_DIR) $(TARGET)CPU $(TARGET)GPU
|
@ -0,0 +1,40 @@ |
|||||||
|
========= |
||||||
|
CudaTools |
||||||
|
========= |
||||||
|
This is the documentation for CudaTools, a header-only library and framework |
||||||
|
for the development of CPU-CUDA compatible applications. Using CudaTools enables |
||||||
|
the creation of a single unified codebase that has both CPU and CUDA compilation targets, with minimal need to |
||||||
|
introduce ``#ifdef`` statements when code is essentially identical between the targets. |
||||||
|
|
||||||
|
For information on the library itself and its usage, view the `documentation <https://acem.ece.illinois.edu/code/CudaTools>`__. The small code snippets and samples |
||||||
|
seen in the documentation are in the folder ``samples``. |
||||||
|
|
||||||
|
Dependencies |
||||||
|
============ |
||||||
|
- Eigen |
||||||
|
|
||||||
|
In the future, we will make this dependency optional, but still provide support |
||||||
|
for it. As of now, it is necessary. |
||||||
|
|
||||||
|
Building the Documentation |
||||||
|
========================== |
||||||
|
The documentation is built with `Doxygen <https://doxygen.nl/>`__ and `Sphinx <https://www.sphinx-doc.org/en>`__. |
||||||
|
First, make sure you have Doxygen installed on your system and that it is added |
||||||
|
to your system path. Then, you will have to create a Python virtual environment |
||||||
|
in the repository folder |
||||||
|
|
||||||
|
.. code-block:: bash |
||||||
|
|
||||||
|
$ python3 -m venv .venv |
||||||
|
|
||||||
|
After activating the environment (e.g. with ``source .venv/bin/activate`` in a POSIX shell) and installing the required Python packages |
||||||
|
|
||||||
|
.. code-block:: bash |
||||||
|
|
||||||
|
$ pip install -r requirements |
||||||
|
|
||||||
|
you can now run the script |
||||||
|
|
||||||
|
.. code-block:: bash |
||||||
|
|
||||||
|
$ ./build_docs |
@ -0,0 +1,2 @@ |
|||||||
|
doxygen docs/Doxyfile |
||||||
|
sphinx-build -b html docs/source docs/build/html |
File diff suppressed because it is too large
@ -0,0 +1,20 @@ |
|||||||
|
# Minimal makefile for Sphinx documentation
|
||||||
|
#
|
||||||
|
|
||||||
|
# You can set these variables from the command line, and also
|
||||||
|
# from the environment for the first two.
|
||||||
|
SPHINXOPTS ?=
|
||||||
|
SPHINXBUILD ?= sphinx-build
|
||||||
|
SOURCEDIR = source
|
||||||
|
BUILDDIR = build
|
||||||
|
|
||||||
|
# Put it first so that "make" without argument is like "make help".
|
||||||
|
help: |
||||||
|
@$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
|
||||||
|
|
||||||
|
.PHONY: help Makefile |
||||||
|
|
||||||
|
# Catch-all target: route all unknown targets to Sphinx using the new
|
||||||
|
# "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS).
|
||||||
|
%: Makefile |
||||||
|
@$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
|
@ -0,0 +1,35 @@ |
|||||||
|
@ECHO OFF |
||||||
|
|
||||||
|
pushd %~dp0 |
||||||
|
|
||||||
|
REM Command file for Sphinx documentation |
||||||
|
|
||||||
|
if "%SPHINXBUILD%" == "" ( |
||||||
|
set SPHINXBUILD=sphinx-build |
||||||
|
) |
||||||
|
set SOURCEDIR=source |
||||||
|
set BUILDDIR=build |
||||||
|
|
||||||
|
if "%1" == "" goto help |
||||||
|
|
||||||
|
%SPHINXBUILD% >NUL 2>NUL |
||||||
|
if errorlevel 9009 ( |
||||||
|
echo. |
||||||
|
echo.The 'sphinx-build' command was not found. Make sure you have Sphinx |
||||||
|
echo.installed, then set the SPHINXBUILD environment variable to point |
||||||
|
echo.to the full path of the 'sphinx-build' executable. Alternatively you |
||||||
|
echo.may add the Sphinx directory to PATH. |
||||||
|
echo. |
||||||
|
echo.If you don't have Sphinx installed, grab it from |
||||||
|
echo.http://sphinx-doc.org/ |
||||||
|
exit /b 1 |
||||||
|
) |
||||||
|
|
||||||
|
%SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% |
||||||
|
goto end |
||||||
|
|
||||||
|
:help |
||||||
|
%SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% |
||||||
|
|
||||||
|
:end |
||||||
|
popd |
@ -0,0 +1,39 @@ |
|||||||
|
const sections = { |
||||||
|
"mesh_prep": 1, |
||||||
|
"matrix_assembly": 2, |
||||||
|
"bc_calc": 3, |
||||||
|
"timestep": 4, |
||||||
|
} |
||||||
|
|
||||||
|
|
||||||
|
window.MathJax = { |
||||||
|
loader: {load: ['[tex]/tagformat', '[tex]/ams']}, |
||||||
|
tex: { |
||||||
|
packages: {'[+]': ['tagformat', 'ams']}, |
||||||
|
macros: { |
||||||
|
dd: "{\\, \\mathrm{d}}", |
||||||
|
E: "{\\mathbf{E}}", |
||||||
|
H: "{\\mathbf{H}}", |
||||||
|
J: "{\\mathbf{J}}", |
||||||
|
D: "{\\mathbf{D}}", |
||||||
|
B: "{\\mathbf{B}}", |
||||||
|
M: "{\\mathbf{M}}", |
||||||
|
tbE: "{\\tilde{\\E}}", |
||||||
|
tbH: "{\\tilde{\\H}}", |
||||||
|
tE: "{\\tilde{E}}", |
||||||
|
tH: "{\\tilde{H}}", |
||||||
|
tphi: "{\\tilde{\\phi}}", |
||||||
|
curl: ["{\\nabla \\times {#1}}", 1], |
||||||
|
div: ["{\\nabla \\cdot {#1}}", 1], |
||||||
|
tens: ["{\\bar{\\bar{{#1}}}}", 1], |
||||||
|
}, |
||||||
|
tags: 'ams', |
||||||
|
tagformat: { |
||||||
|
number: (n) => sections[window.location.pathname.split("/").pop().split(".")[0]] + '.' + n, |
||||||
|
}, |
||||||
|
ams: { |
||||||
|
multilineWidth: '100%', |
||||||
|
multilineIndent: '50em' |
||||||
|
} |
||||||
|
}, |
||||||
|
} |
@ -0,0 +1,26 @@ |
|||||||
|
======= |
||||||
|
Array.h |
||||||
|
======= |
||||||
|
|
||||||
|
The ``Array.h`` header file contains the Array class, and its related classes. For this |
||||||
|
file only, assume that every function is callable on both host and device unless |
||||||
|
explicitly mentioned otherwise. |
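
As a small, illustrative sketch of the kinds of objects documented here, a ``CudaTools::Shape``
describes the dimensions of an ``Array``. The snippet below only uses members shown in this
documentation; the values in the comments follow from the column-major layout described for
``Shape``.

.. code-block:: cpp

    CudaTools::Shape s({3, 4}); // A 2D shape with 3 rows and 4 columns.
    s.items();                  // 12 total items.
    s.rows();                   // 3
    s.cols();                   // 4
    s.subshape(1);              // The 1D shape of the last axis, with length 4.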
||||||
|
|
||||||
|
CudaTools::Shape |
||||||
|
---------------- |
||||||
|
.. doxygenclass:: CudaTools::Shape |
||||||
|
:members: |
||||||
|
:allow-dot-graphs: |
||||||
|
|
||||||
|
CudaTools::ArrayIterator<T> |
||||||
|
--------------------------- |
||||||
|
.. doxygenclass:: CudaTools::ArrayIterator |
||||||
|
:members: |
||||||
|
:allow-dot-graphs: |
||||||
|
|
||||||
|
CudaTools::Array<T> |
||||||
|
------------------- |
||||||
|
.. doxygenclass:: CudaTools::Array |
||||||
|
:members: |
||||||
|
:private-members: |
||||||
|
:allow-dot-graphs: |
@ -0,0 +1,45 @@ |
|||||||
|
====== |
||||||
|
BLAS.h |
||||||
|
====== |
||||||
|
|
||||||
|
The ``BLAS.h`` header file contains some BLAS functions, and some related |
||||||
|
classes for those functions. |
||||||
|
|
||||||
|
BLAS Functions |
||||||
|
============== |
||||||
|
Currently, these are the supported BLAS functions. They are inherited mainly |
||||||
|
from the cuBLAS API, and condensed into unified functions. The plan is to |
||||||
|
add more as necessary. |
||||||
|
|
||||||
|
CudaTools::BLAS::GEMV<T> |
||||||
|
------------------------ |
||||||
|
.. doxygenfunction:: CudaTools::BLAS::GEMV |
||||||
|
|
||||||
|
CudaTools::BLAS::GEMM<T> |
||||||
|
------------------------ |
||||||
|
.. doxygenfunction:: CudaTools::BLAS::GEMM |
||||||
|
|
||||||
|
CudaTools::BLAS::DGMM<T> |
||||||
|
------------------------ |
||||||
|
.. doxygenfunction:: CudaTools::BLAS::DGMM |
||||||
|
|
||||||
|
BLAS Classes |
||||||
|
============ |
||||||
|
|
||||||
|
These classes also inherit functions from the cuBLAS API, but are packaged |
||||||
|
into classes that are more intuitive and hide external details. |
||||||
|
|
||||||
|
CudaTools::BLAS::Batch<T> |
||||||
|
------------------------- |
||||||
|
.. doxygenclass:: CudaTools::BLAS::Batch |
||||||
|
:members: |
||||||
|
|
||||||
|
CudaTools::BLAS::PLUArray<T> |
||||||
|
---------------------------- |
||||||
|
.. doxygenclass:: CudaTools::BLAS::PLUArray |
||||||
|
:members: |
||||||
|
|
||||||
|
CudaTools::BLAS::PLUBatch<T> |
||||||
|
---------------------------- |
||||||
|
.. doxygenclass:: CudaTools::BLAS::PLUBatch |
||||||
|
:members: |
@ -0,0 +1,53 @@ |
|||||||
|
# Configuration file for the Sphinx documentation builder. |
||||||
|
|
||||||
|
# -- Project information |
||||||
|
|
||||||
|
project = 'DGEMS' |
||||||
|
copyright = '2022' |
||||||
|
author = 'Kenneth Jao, Qi Jian Lim' |
||||||
|
|
||||||
|
release = '0.1' |
||||||
|
version = '0.1.0' |
||||||
|
|
||||||
|
# -- General configuration |
||||||
|
|
||||||
|
html_static_path = ["_static"] |
||||||
|
html_js_files = ["js/mathjax-config.js"] |
||||||
|
|
||||||
|
extensions = [ |
||||||
|
'sphinx.ext.duration', |
||||||
|
'sphinx.ext.doctest', |
||||||
|
'sphinx.ext.autodoc', |
||||||
|
'sphinx.ext.autosummary', |
||||||
|
'sphinx.ext.autosectionlabel', |
||||||
|
'sphinx.ext.intersphinx', |
||||||
|
'sphinx.ext.mathjax', |
||||||
|
'sphinx.ext.graphviz', |
||||||
|
'sphinxcontrib.bibtex', |
||||||
|
'breathe', |
||||||
|
] |
||||||
|
|
||||||
|
breathe_projects = {"DGEMS": "../build/xml"} |
||||||
|
breathe_default_project = "DGEMS" |
||||||
|
|
||||||
|
bibtex_bibfiles = ['refs.bib'] |
||||||
|
|
||||||
|
mathjax_path = "https://cdn.jsdelivr.net/npm/mathjax@3/es5/tex-mml-chtml.js" |
||||||
|
|
||||||
|
intersphinx_mapping = { |
||||||
|
'python': ('https://docs.python.org/3/', None), |
||||||
|
'sphinx': ('https://www.sphinx-doc.org/en/master/', None), |
||||||
|
} |
||||||
|
intersphinx_disabled_domains = ['std'] |
||||||
|
|
||||||
|
templates_path = ['_templates'] |
||||||
|
|
||||||
|
# -- Options for HTML output |
||||||
|
|
||||||
|
html_theme = 'sphinx_rtd_theme' |
||||||
|
html_theme_options = { |
||||||
|
'collapse_navigation': False, |
||||||
|
} |
||||||
|
|
||||||
|
# -- Options for EPUB output |
||||||
|
epub_show_urls = 'footnote' |
@ -0,0 +1,67 @@ |
|||||||
|
====== |
||||||
|
Core.h |
||||||
|
====== |
||||||
|
|
||||||
|
The ``Core.h`` header file defines several compiler flags and macros along with |
||||||
|
a few core classes. |
||||||
|
|
||||||
|
Flags |
||||||
|
===== |
||||||
|
|
||||||
|
Device Indicators |
||||||
|
----------------- |
||||||
|
.. doxygendefine:: CUDACC |
||||||
|
.. doxygendefine:: DEVICE |
||||||
|
|
||||||
|
Host-Device Automation |
||||||
|
---------------------- |
||||||
|
.. doxygendefine:: HD |
||||||
|
.. doxygendefine:: SHARED |
||||||
|
|
||||||
|
Compilation Options |
||||||
|
------------------- |
||||||
|
.. doxygendefine:: CUDATOOLS_ARRAY_MAX_AXES |
||||||
|
|
||||||
|
Macros |
||||||
|
====== |
||||||
|
|
||||||
|
Kernel |
||||||
|
------ |
||||||
|
.. doxygendefine:: DECLARE_KERNEL |
||||||
|
.. doxygendefine:: DEFINE_KERNEL |
||||||
|
.. doxygendefine:: KERNEL |
||||||
|
|
||||||
|
Device Helpers |
||||||
|
-------------- |
||||||
|
|
||||||
|
.. doxygendefine:: BASIC_LOOP |
||||||
|
|
||||||
|
Device Class |
||||||
|
------------ |
||||||
|
|
||||||
|
.. doxygendefine:: DEVICE_CLASS |
||||||
|
|
||||||
|
|
||||||
|
Classes and Structs |
||||||
|
=================== |
||||||
|
|
||||||
|
CudaTools::StreamID |
||||||
|
------------------- |
||||||
|
|
||||||
|
.. doxygenstruct:: CudaTools::StreamID |
||||||
|
|
||||||
|
CudaTools::Manager |
||||||
|
------------------ |
||||||
|
|
||||||
|
.. doxygenclass:: CudaTools::Manager |
||||||
|
:members: |
||||||
|
|
||||||
|
CudaTools::Kernel::Settings |
||||||
|
--------------------------- |
||||||
|
|
||||||
|
.. doxygenstruct:: CudaTools::Kernel::Settings |
||||||
|
:members: |
||||||
|
|
||||||
|
CudaTools::Kernel::Basic |
||||||
|
------------------------ |
||||||
|
.. doxygenfunction:: CudaTools::Kernel::basic |
@ -0,0 +1,25 @@ |
|||||||
|
========= |
||||||
|
CudaTools |
||||||
|
========= |
||||||
|
This is the documentation for CudaTools, a header-only library and framework |
||||||
|
for the development of CPU-CUDA compatible applications. Using CudaTools enables |
||||||
|
the creation of a single unified codebase that has both CPU and CUDA compilation targets, with minimal need to |
||||||
|
introduce ``#ifdef`` statements when code is essentially identical between the targets. |
||||||
|
|
||||||
|
To get started, please head over to the :doc:`usage` section. For more detail on the |
||||||
|
machinery underneath, please refer to the other sections. |
||||||
|
|
||||||
|
.. note:: |
||||||
|
|
||||||
|
If you would like to contribute, please visit the `git page <https://git.acem.ece.illinois.edu/kjao/CudaTools>`__. |
||||||
|
|
||||||
|
Contents |
||||||
|
======== |
||||||
|
|
||||||
|
.. toctree:: |
||||||
|
:maxdepth: 2 |
||||||
|
|
||||||
|
usage |
||||||
|
core |
||||||
|
array |
||||||
|
blas |
@ -0,0 +1,128 @@ |
|||||||
|
================== |
||||||
|
Usage and Examples |
||||||
|
================== |
||||||
|
|
||||||
|
|
||||||
|
This library is broken up into three main parts, as well as a |
||||||
|
compilation and linking framework: |
||||||
|
|
||||||
|
#. :ref:`Core Examples` |
||||||
|
#. :ref:`Array Examples` |
||||||
|
#. :ref:`BLAS Examples` |
||||||
|
#. :ref:`Compilation and Linking` |
||||||
|
|
||||||
|
The ``Core.h`` header contains the necessary macros, flags and objects for interfacing with |
||||||
|
basic kernel launching and the CUDA Runtime API. The ``Array.h`` header contains the ``CudaTools::Array`` |
||||||
|
class, which provides a device-compatible Array-like class with easy memory management. Next, |
||||||
|
the ``BLAS.h`` header provides BLAS functions through the cuBLAS library for the GPU, |
||||||
|
and Eigen for the CPU. Lastly, a templated Makefile is provided which can be used |
||||||
|
for your own project, after following a few rules. |
||||||
|
|
||||||
|
The usage of this library will be illustrated through examples, and further details |
||||||
|
can be found in the other sections. The examples are given in the `samples <https://git.acem.ece.illinois.edu/kjao/CudaTools/src/branch/main/samples>`__ folder. |
||||||
|
Throughout this documentation, there are a few common terms that may appear. First, we refer to the CPU as the host, and the GPU as the device. So, a host function refers |
||||||
|
to a function runnable on the CPU, and a device function refers to a function that is runnable |
||||||
|
on a device. A kernel is a specific function that the host can call to be run on the device. |
||||||
|
|
||||||
|
Core Examples |
||||||
|
============= |
||||||
|
This file mainly introduces compiler macros and a few classes that are used to improve the |
||||||
|
syntax between host and device code. To define and call a kernel, there are a few |
||||||
|
macros provided. For example, |
||||||
|
|
||||||
|
.. code-block:: cpp |
||||||
|
|
||||||
|
DEFINE_KERNEL(add, int x, int y) { |
||||||
|
printf("Kernel: %i\n", x + y); |
||||||
|
} |
||||||
|
|
||||||
|
int main() { |
||||||
|
KERNEL(add, CudaTools::Kernel::basic(1), 1, 1); // Prints 2. |
||||||
|
return 0; |
||||||
|
} |
||||||
|
|
||||||
|
The ``DEFINE_KERNEL(name, ...)`` macro takes in the function name and its arguments. |
||||||
|
The second argument of the ``KERNEL()`` macro gives the launch parameters for |
||||||
|
the kernel. The launch parameters have several items, but for 'embarrassingly parallel' |
||||||
|
cases, we can simply generate the settings from the number of threads. More detail on |
||||||
|
creating launch parameters can be found :ref:`here <CudaTools::Kernel::Settings>`. In the above example, |
||||||
|
there is only one thread. The rest of the arguments are just the kernel arguments. For more detail, |
||||||
|
see :ref:`here <Macros>`. |
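
For finer control, the launch parameters can also be filled in by hand. The following is a
minimal sketch that reuses the ``add`` kernel from above; the explicit grid and block sizes
are chosen so that it behaves exactly like ``Kernel::basic(1)``, and on the CPU target these
settings are simply ignored and the kernel runs as a regular function call.

.. code-block:: cpp

    CudaTools::Kernel::Settings sett;
    sett.setGridDim(1);  // One block in the grid ...
    sett.setBlockDim(1); // ... with a single thread, equivalent to Kernel::basic(1) here.
    sett.setStream(CudaTools::DEF_KERNEL_STREAM);

    KERNEL(add, sett, 40, 2).wait(); // Prints "Kernel: 42".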
||||||
|
|
||||||
|
.. warning:: |
||||||
|
These kernel definitions must be in a file that will be compiled by ``nvcc``. Also, |
||||||
|
for header files, there is an additional macro ``DECLARE_KERNEL(name, ...)`` to declare it |
||||||
|
and make it available to other files. |
||||||
|
|
||||||
|
Since many applications use classes, a macro is provided to 'convert' a class into |
||||||
|
a device-compatible one. In the same spirit as the previous example, |
||||||
|
|
||||||
|
.. code-block:: cpp |
||||||
|
|
||||||
|
class intPair { |
||||||
|
DEVICE_CLASS(intPair) |
||||||
|
public: |
||||||
|
int x, y; |
||||||
|
|
||||||
|
intPair(const int x_, const int y_) : x(x_), y(y_) { |
||||||
|
allocateDevice(); // Allocates memory for this intPair on the device. |
||||||
|
updateDevice().wait(); // Copies the memory on the host to the device and waits until finished. |
||||||
|
}; |
||||||
|
|
||||||
|
HD void swap() { |
||||||
|
int swap = x; |
||||||
|
x = y; |
||||||
|
y = swap; |
||||||
|
}; |
||||||
|
}; |
||||||
|
|
||||||
|
DEFINE_KERNEL(swap, intPair* const pair) { pair->swap(); } |
||||||
|
|
||||||
|
int main() { |
||||||
|
intPair pair(1, 2); |
||||||
|
printf("Before: %u, %u\n", pair.x, pair.y); // Prints 1, 2. |
||||||
|
|
||||||
|
KERNEL(swap, CudaTools::Kernel::basic(1), pair.that()).wait(); |
||||||
|
pair.updateHost().wait(); // Copies the memory from the device back to the host and waits until finished. |
||||||
|
|
||||||
|
printf("After: %u, %u\n", pair.x, pair.y); // Prints 2, 1. |
||||||
|
return 0; |
||||||
|
} |
||||||
|
|
||||||
|
In this example, we create a class called ``intPair``, which is then made available on the device through |
||||||
|
the ``DEVICE_CLASS(name)`` macro. Specifically, that macro introduces a few functions, like |
||||||
|
``allocateDevice()``, ``updateDevice()``, ``updateHost()``, and ``that()``. That last function |
||||||
|
returns a pointer to the copy on the device. For more details, see :ref:`here <Device Class>`. If we were to pass in the host pointer of the ``intPair`` to the kernel, there would be an illegal memory access. |
||||||
|
|
||||||
|
The kernel argument list **must** consist of pointers to objects, or non-reference objects. |
||||||
|
Otherwise, compilation will fail. In general this is safer, as it forces the programmer to |
||||||
|
acknowledge that the device copy is being passed. For the latter case of a non-reference object, |
||||||
|
you should only do this if there is no issue in creating a copy of the original object. In the above |
||||||
|
example, we could have done this, but for more complicated classes it may result in unwanted behavior. |
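
For instance, since ``intPair`` is small and safe to copy, a by-value variant of the kernel
above could look like the following sketch (the kernel name ``swapByValue`` is made up for
this illustration). Note that the device then operates on its own copy, so the host never
sees the swapped values.

.. code-block:: cpp

    // The intPair argument is copied when the kernel is launched, so the swap
    // only affects that copy and is lost once the kernel finishes.
    DEFINE_KERNEL(swapByValue, intPair pair) { pair.swap(); }

    int main() {
        intPair pair(1, 2);
        KERNEL(swapByValue, CudaTools::Kernel::basic(1), pair).wait();
        printf("After: %i, %i\n", pair.x, pair.y); // Still prints 1, 2.
        return 0;
    }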
||||||
|
|
||||||
|
Lastly, since the point of classes is usually to have member functions, to make them |
||||||
|
available on the device, you must mark them with the compiler macro ``HD`` in front. |
||||||
|
|
||||||
|
We also introduce the ``wait()`` function, which waits for the command to complete before |
||||||
|
continuing. Most calls that involve the device are asynchronous, so without proper blocking, |
||||||
|
operations dependent on a previous command are not guaranteed to run correctly. If the code is |
||||||
|
compiled for CPU, then everything will run synchronously, as per usual. |
||||||
|
|
||||||
|
.. note:: |
||||||
|
Almost all functions that are asynchronous provide an optional 'stream' argument, |
||||||
|
where you can give the name of the stream you wish to use. Different streams run |
||||||
|
asynchronously, but operations on the same stream are FIFO. To define a stream to use |
||||||
|
later, you must call ``CudaTools::Manager::get()->addStream("myStream")`` at some point |
||||||
|
before you use it. For more details, see :ref:`here <CudaTools::Manager>`. |
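
As a rough sketch of how a named stream might be used with the pieces shown above (the
stream name ``"myStream"`` is arbitrary):

.. code-block:: cpp

    CudaTools::Manager::get()->addStream("myStream"); // Create the stream once, up front.

    intPair pair(1, 2);
    // The kernel launch and the copy back are queued on the same stream, so they
    // run in order; only the final wait() blocks the host.
    KERNEL(swap, CudaTools::Kernel::basic(1, "myStream"), pair.that());
    pair.updateHost("myStream").wait();
    printf("After: %i, %i\n", pair.x, pair.y); // Prints 2, 1.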
||||||
|
|
||||||
|
|
||||||
|
Array Examples |
||||||
|
============== |
||||||
|
|
||||||
|
|
||||||
|
BLAS Examples |
||||||
|
============= |
||||||
|
|
||||||
|
|
||||||
|
Compilation and Linking |
||||||
|
======================= |
@ -0,0 +1,4 @@ |
|||||||
|
Sphinx>=5.1.1 |
||||||
|
sphinx-rtd-theme>=1.0.0 |
||||||
|
sphinxcontrib-bibtex>=2.5.0 |
||||||
|
breathe>=4.34.0 |
@ -0,0 +1,95 @@ |
|||||||
|
CC := g++-10
|
||||||
|
NVCC := nvcc
|
||||||
|
CFLAGS := -Wall -std=c++17 -fopenmp -MMD
|
||||||
|
NVCC_FLAGS := -MMD -w -Xcompiler
|
||||||
|
|
||||||
|
INCLUDE := ../../
|
||||||
|
LIBS_DIR :=
|
||||||
|
LIBS_DIR_GPU := /usr/local/cuda/lib64
|
||||||
|
LIBS :=
|
||||||
|
LIBS_GPU := cuda cudart cublas
|
||||||
|
|
||||||
|
TARGET = coreKernel
|
||||||
|
SRC_DIR = .
|
||||||
|
BUILD_DIR = build
|
||||||
|
|
||||||
|
# Should not need to modify below.
|
||||||
|
|
||||||
|
CPU_BUILD_DIR = $(BUILD_DIR)/cpu
|
||||||
|
GPU_BUILD_DIR = $(BUILD_DIR)/gpu
|
||||||
|
|
||||||
|
SRC = $(wildcard $(SRC_DIR)/*/*.cpp) $(wildcard $(SRC_DIR)/*.cpp)
|
||||||
|
|
||||||
|
# Get source files and object files.
|
||||||
|
GCC_SRC = $(filter-out %.cu.cpp ,$(SRC))
|
||||||
|
NVCC_SRC = $(filter %.cu.cpp, $(SRC))
|
||||||
|
GCC_OBJ = $(GCC_SRC:$(SRC_DIR)/%.cpp=%.o)
|
||||||
|
NVCC_OBJ = $(NVCC_SRC:$(SRC_DIR)/%.cpp=%.o)
|
||||||
|
|
||||||
|
# If compiling for CPU, all go to GCC. Otherwise, they are split.
|
||||||
|
CPU_OBJ = $(addprefix $(CPU_BUILD_DIR)/,$(GCC_OBJ)) $(addprefix $(CPU_BUILD_DIR)/,$(NVCC_OBJ))
|
||||||
|
GPU_GCC_OBJ = $(addprefix $(GPU_BUILD_DIR)/,$(GCC_OBJ))
|
||||||
|
GPU_NVCC_OBJ = $(addprefix $(GPU_BUILD_DIR)/,$(NVCC_OBJ))
|
||||||
|
|
||||||
|
# $(info $$GCC_SRC is [${GCC_SRC}])
|
||||||
|
# $(info $$NVCC_SRC is [${NVCC_SRC}])
|
||||||
|
# $(info $$GCC_OBJ is [${GCC_OBJ}])
|
||||||
|
# $(info $$NVCC_OBJ is [${NVCC_OBJ}])
|
||||||
|
|
||||||
|
# $(info $$CPU_OBJ is [${CPU_OBJ}])
|
||||||
|
# $(info $$GPU_GCC_OBJ is [${GPU_GCC_OBJ}])
|
||||||
|
# $(info $$GPU_NVCC_OBJ is [${GPU_NVCC_OBJ}])
|
||||||
|
|
||||||
|
HEADER = $(wildcard $(SRC_DIR)/*/*.h) $(wildcard $(SRC_DIR)/*.h)
|
||||||
|
CPU_DEPS = $(wildcard $(CPU_BUILD_DIR)/*.d)
|
||||||
|
GPU_DEPS = $(wildcard $(GPU_BUILD_DIR)/*.d)
|
||||||
|
|
||||||
|
INC := $(INCLUDE:%=-I%)
|
||||||
|
LIB := $(LIBS_DIR:%=-L%)
|
||||||
|
LIB_GPU := $(LIBS_DIR_GPU:%=-L%)
|
||||||
|
LD := $(LIBS:%=-l%)
|
||||||
|
LD_GPU := $(LIBS_GPU:%=-l%)
|
||||||
|
|
||||||
|
# Reminder:
|
||||||
|
# $< = first prerequisite
|
||||||
|
# $@ = the target which matched the rule
|
||||||
|
# $^ = all prerequisites
|
||||||
|
|
||||||
|
.PHONY: all clean |
||||||
|
|
||||||
|
all : cpu gpu |
||||||
|
|
||||||
|
cpu: $(TARGET)CPU |
||||||
|
gpu: $(TARGET)GPU |
||||||
|
|
||||||
|
$(TARGET)CPU: $(CPU_OBJ) |
||||||
|
$(CC) $(CFLAGS) $^ -o $@ $(INC) $(LIB) $(LDFLAGS)
|
||||||
|
|
||||||
|
$(CPU_BUILD_DIR)/%.o $(CPU_BUILD_DIR)/%.cu.o: $(SRC_DIR)/%.cpp | $(CPU_BUILD_DIR) |
||||||
|
$(CC) $(CFLAGS) -c -o $@ $< $(INC)
|
||||||
|
|
||||||
|
# For GPU, we need to build the NVCC objects, the NVCC linked object, and the
|
||||||
|
# regular ones. Then, we link them all together.
|
||||||
|
$(TARGET)GPU: $(GPU_BUILD_DIR)/link.o $(GPU_GCC_OBJ) | $(GPU_BUILD_DIR) |
||||||
|
$(CC) -g -DCUDA $(CFLAGS) $(GPU_NVCC_OBJ) $^ -o $@ $(INC) $(LIB) $(LIB_GPU) $(LD) $(LD_GPU)
|
||||||
|
|
||||||
|
$(GPU_BUILD_DIR)/link.o: $(GPU_NVCC_OBJ) | $(GPU_BUILD_DIR) |
||||||
|
$(NVCC) --device-link $^ -o $@
|
||||||
|
|
||||||
|
$(GPU_BUILD_DIR)/%.cu.o: $(SRC_DIR)/%.cu.cpp | $(GPU_BUILD_DIR) |
||||||
|
$(NVCC) $(NVCC_FLAGS) -DCUDA -x cu --device-c -o $@ $< $(INC)
|
||||||
|
|
||||||
|
$(GPU_BUILD_DIR)/%.o: $(SRC_DIR)/%.cpp | $(GPU_BUILD_DIR) |
||||||
|
$(CC) $(CFLAGS) -g -DCUDA -c -o $@ $< $(INC)
|
||||||
|
|
||||||
|
-include $(CPU_DEPS) |
||||||
|
-include $(GPU_DEPS) |
||||||
|
|
||||||
|
$(CPU_BUILD_DIR): |
||||||
|
mkdir -p $@
|
||||||
|
|
||||||
|
$(GPU_BUILD_DIR): |
||||||
|
mkdir -p $@
|
||||||
|
|
||||||
|
clean: |
||||||
|
rm -Rf $(BUILD_DIR) $(TARGET)CPU $(TARGET)GPU
|
@ -0,0 +1,12 @@
#define CUDATOOLS_IMPLEMENTATION
#include <Core.h>

DEFINE_KERNEL(add, int x, int y) {
    printf("Kernel: %i\n", x + y);
}

int main() {
    KERNEL(add, CudaTools::Kernel::basic(1), 1, 1); // Prints 2.
    return 0;
}
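
// A slightly larger launch, as a sketch of the same macros: basic(8) should request
// eight threads, so each one would run the kernel body, and .wait() (used in the
// other examples here) blocks until the device work has finished.
// KERNEL(add, CudaTools::Kernel::basic(8), 2, 3).wait(); // Each thread prints 5.
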
@ -0,0 +1,95 @@
CC := g++-10
NVCC := nvcc
CFLAGS := -Wall -std=c++17 -fopenmp -MMD
NVCC_FLAGS := -MMD -w -Xcompiler

INCLUDE := ../../
LIBS_DIR :=
LIBS_DIR_GPU := /usr/local/cuda/lib64
LIBS :=
LIBS_GPU := cuda cudart cublas

TARGET = coreClass
SRC_DIR = .
BUILD_DIR = build

# Should not need to modify below.

CPU_BUILD_DIR = $(BUILD_DIR)/cpu
GPU_BUILD_DIR = $(BUILD_DIR)/gpu

SRC = $(wildcard $(SRC_DIR)/*/*.cpp) $(wildcard $(SRC_DIR)/*.cpp)

# Get source files and object files.
GCC_SRC = $(filter-out %.cu.cpp ,$(SRC))
NVCC_SRC = $(filter %.cu.cpp, $(SRC))
GCC_OBJ = $(GCC_SRC:$(SRC_DIR)/%.cpp=%.o)
NVCC_OBJ = $(NVCC_SRC:$(SRC_DIR)/%.cpp=%.o)

# If compiling for CPU, all go to GCC. Otherwise, they are split.
CPU_OBJ = $(addprefix $(CPU_BUILD_DIR)/,$(GCC_OBJ)) $(addprefix $(CPU_BUILD_DIR)/,$(NVCC_OBJ))
GPU_GCC_OBJ = $(addprefix $(GPU_BUILD_DIR)/,$(GCC_OBJ))
GPU_NVCC_OBJ = $(addprefix $(GPU_BUILD_DIR)/,$(NVCC_OBJ))

# $(info $$GCC_SRC is [${GCC_SRC}])
# $(info $$NVCC_SRC is [${NVCC_SRC}])
# $(info $$GCC_OBJ is [${GCC_OBJ}])
# $(info $$NVCC_OBJ is [${NVCC_OBJ}])

# $(info $$CPU_OBJ is [${CPU_OBJ}])
# $(info $$GPU_GCC_OBJ is [${GPU_GCC_OBJ}])
# $(info $$GPU_NVCC_OBJ is [${GPU_NVCC_OBJ}])

HEADER = $(wildcard $(SRC_DIR)/*/*.h) $(wildcard $(SRC_DIR)/*.h)
CPU_DEPS = $(wildcard $(CPU_BUILD_DIR)/*.d)
GPU_DEPS = $(wildcard $(GPU_BUILD_DIR)/*.d)

INC := $(INCLUDE:%=-I%)
LIB := $(LIBS_DIR:%=-L%)
LIB_GPU := $(LIBS_DIR_GPU:%=-L%)
LD := $(LIBS:%=-l%)
LD_GPU := $(LIBS_GPU:%=-l%)

# Reminder:
# $< = first prerequisite
# $@ = the target which matched the rule
# $^ = all prerequisites

.PHONY: all clean

all: cpu gpu

cpu: $(TARGET)CPU
gpu: $(TARGET)GPU

$(TARGET)CPU: $(CPU_OBJ)
	$(CC) $(CFLAGS) $^ -o $@ $(INC) $(LIB) $(LDFLAGS)

$(CPU_BUILD_DIR)/%.o $(CPU_BUILD_DIR)/%.cu.o: $(SRC_DIR)/%.cpp | $(CPU_BUILD_DIR)
	$(CC) $(CFLAGS) -c -o $@ $< $(INC)

# For GPU, we need to build the NVCC objects, the NVCC linked object, and the
# regular ones. Then, we link them all together.
$(TARGET)GPU: $(GPU_BUILD_DIR)/link.o $(GPU_GCC_OBJ) | $(GPU_BUILD_DIR)
	$(CC) -g -DCUDA $(CFLAGS) $(GPU_NVCC_OBJ) $^ -o $@ $(INC) $(LIB) $(LIB_GPU) $(LD) $(LD_GPU)

$(GPU_BUILD_DIR)/link.o: $(GPU_NVCC_OBJ) | $(GPU_BUILD_DIR)
	$(NVCC) --device-link $^ -o $@

$(GPU_BUILD_DIR)/%.cu.o: $(SRC_DIR)/%.cu.cpp | $(GPU_BUILD_DIR)
	$(NVCC) $(NVCC_FLAGS) -DCUDA -x cu --device-c -o $@ $< $(INC)

$(GPU_BUILD_DIR)/%.o: $(SRC_DIR)/%.cpp | $(GPU_BUILD_DIR)
	$(CC) $(CFLAGS) -g -DCUDA -c -o $@ $< $(INC)

-include $(CPU_DEPS)
-include $(GPU_DEPS)

$(CPU_BUILD_DIR):
	mkdir -p $@

$(GPU_BUILD_DIR):
	mkdir -p $@

clean:
	rm -Rf $(BUILD_DIR) $(TARGET)CPU $(TARGET)GPU
@ -0,0 +1,34 @@
#define CUDATOOLS_IMPLEMENTATION
#include <Core.h>

class intPair {
    DEVICE_CLASS(intPair)
  public:
    int x, y;

    intPair(const int x_, const int y_) : x(x_), y(y_) {
        allocateDevice();      // Allocates memory for this intPair on the device.
        updateDevice().wait(); // Copies the memory on the host to the device and waits until finished.
    };

    HD void swap() {
        int swap = x;
        x = y;
        y = swap;
    };
};

DEFINE_KERNEL(swap, intPair* const pair) { pair->swap(); }
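
// Note: pair.that() in main() below hands the kernel the device-side copy of the
// object (presumably generated by DEVICE_CLASS alongside allocateDevice/updateDevice);
// after the kernel has modified that copy, updateHost() copies the values back.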

int main() {
    intPair pair(1, 2);
    printf("Before: %i, %i\n", pair.x, pair.y); // Prints 1, 2.

    KERNEL(swap, CudaTools::Kernel::basic(1), pair.that()).wait();
    pair.updateHost().wait(); // Copies the memory from the device back to the host and waits until finished.

    printf("After: %i, %i\n", pair.x, pair.y); // Prints 2, 1.
    return 0;
}
@ -0,0 +1,494 @@
#define CUDATOOLS_IMPLEMENTATION
#define CUDATOOLS_ARRAY_MAX_AXES 8
#include "Array.h"
#include "BLAS.h"
#include "Core.h"

#include <Eigen/Core>
#include <chrono>
#include <complex>

namespace CT = CudaTools;

/////////////
// Helpers //
/////////////

#define TIME_START(name) auto begin_##name = std::chrono::steady_clock::now()

#define TIME_END(name) \
    auto end_##name = std::chrono::steady_clock::now(); \
    auto time_ms_##name = \
        std::chrono::duration_cast<std::chrono::milliseconds>(end_##name - begin_##name).count(); \
    auto time_mus_##name = \
        std::chrono::duration_cast<std::chrono::microseconds>(end_##name - begin_##name).count(); \
    if (time_ms_##name == 0) { \
        printf("[%s] Time Elapsed: %ld[µs]\n", #name, time_mus_##name); \
    } else { \
        printf("[%s] Time Elapsed: %ld[ms]\n", #name, time_ms_##name); \
    }

#define TIME(call, name) \
    TIME_START(name); \
    call; \
    TIME_END(name);
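
// Usage sketch for the timing macro above, with a hypothetical label:
//   TIME(doArrayTests<double>(), ArrayDouble);
// expands to TIME_START/TIME_END around the call and prints something like
// "[ArrayDouble] Time Elapsed: 12[ms]".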
#define TEST(predicate, name, msg) \
    failed += (predicate) ? 0 : 1; \
    printf("[%s] ", (predicate) ? "\033[1;32mPASS\033[0m" : "\033[1;31mFAIL\033[0m"); \
    printf("%s | %s.\n", name, msg);

template <typename T> struct Type;

#define REGISTER_PARSE_TYPE(X) \
    template <> struct Type<X> { static const std::string name; }; \
    const std::string Type<X>::name = #X

REGISTER_PARSE_TYPE(uint8_t);
REGISTER_PARSE_TYPE(int16_t);
REGISTER_PARSE_TYPE(int32_t);
REGISTER_PARSE_TYPE(float);
REGISTER_PARSE_TYPE(double);

std::string box(std::string str) {
    std::string tops(str.size() + 6, '#');
    return tops + "\n## " + str + " ##\n" + tops;
}

std::string box2(std::string str) {
    std::string tops(str.size() - 5, '-');
    return tops + "\n|| " + str + " ||\n" + tops;
}

std::string boxSmall(std::string str) {
    std::string tops(6, '-');
    return tops + "[ " + str + " ]" + tops;
}

std::string separator() {
    std::string line(40, '=');
    return "\n" + line + "\n";
}

template <typename T> std::string type() { return "\033[1;96m" + Type<T>::name + "\033[0m"; }

CT::Shape makeRandom2DShape() {
    std::random_device rd;
    std::mt19937 mt(rd());
    std::uniform_int_distribution<uint32_t> dist(1, 15);
    return CT::Shape({dist(mt), dist(mt)});
}

///////////
// Tests //
///////////

class TestClass {
    DEVICE_CLASS(TestClass);

  public:
    int x;
    TestClass(const int x) : x(x) {
        allocateDevice();
        updateDevice().wait();
    };
};

DEFINE_KERNEL(times, const CT::Array<int> arr) {
    BASIC_LOOP(arr.shape().length()) { arr[iThread] *= 2; }
}
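
// BASIC_LOOP(n) appears to come from the CudaTools macros: it provides the
// per-thread index iThread and guards it against n, so the kernel above doubles
// each of the n array entries exactly once.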

DEFINE_KERNEL(classTest, TestClass* const test) { test->x = 100; }

struct MacroTests {
    static uint32_t Kernel() {
        uint32_t failed = 0;
        CT::Array<int> A = CT::Array<int>::constant({10}, 1);
        A.updateDevice().wait();
        KERNEL(times, CT::Kernel::basic(A.shape().items()), A.view()).wait();
        A.updateHost().wait();

        uint32_t errors = 0;
        for (auto it = A.begin(); it != A.end(); ++it) {
            if (*it != 2) ++errors;
        }

        std::ostringstream msg;
        msg << "Errors: " << errors;
        TEST(errors == 0, "Kernel", msg.str().c_str());
        return failed;
    };

    static uint32_t Class() {
        uint32_t failed = 0;
        TestClass test(1);
        KERNEL(classTest, CT::Kernel::basic(1), test.that()).wait();
        test.updateHost().wait();

        TEST(test.x == 100, "Class", "Errors: 0");
        return failed;
    }
};

template <typename T> struct ArrayTests {
    static uint32_t Indexing() {
        uint32_t failed = 0;
        CT::Array<T> A = CT::Array<T>::range(0, 240);
        A.reshape({5, 3, 1, 4, 2, 1, 1, 2});

        uint32_t errors = 0;
        for (uint32_t i = 0; i < 5; ++i) {
            for (uint32_t j = 0; j < 3; ++j) {
                for (uint32_t k = 0; k < 4; ++k) {
                    for (uint32_t l = 0; l < 2; ++l) {
                        for (uint32_t m = 0; m < 2; ++m) {
                            if ((T)A[i][j][0][k][l][0][0][m] != (T)A[{i, j, 0, k, l, 0, 0, m}]) {
                                ++errors;
                            }
                        }
                    }
                }
            }
        }

        std::ostringstream msg;
        msg << "Errors: " << errors;
        TEST(errors == 0, "Element", msg.str().c_str());

        errors = 0;
        CT::Array<T> ApartGroup_1 = A[{2, 2}];
        CT::Array<T> ApartIndiv_1 = A[2][2];
        for (uint32_t k = 0; k < 4; ++k) {
            for (uint32_t l = 0; l < 2; ++l) {
                for (uint32_t m = 0; m < 2; ++m) {
                    if ((T)ApartIndiv_1[0][k][l][0][0][m] != (T)ApartGroup_1[{0, k, l, 0, 0, m}]) {
                        ++errors;
                    }
                }
            }
        }

        msg.str("");
        msg << "Errors: " << errors;
        TEST(errors == 0, "Axis (1/2)", msg.str().c_str());

        errors = 0;
        CT::Array<T> ApartGroup_2 = A[{3, 2, 0, 3}];
        CT::Array<T> ApartIndiv_2 = A[3][2][0][3];

        for (uint32_t l = 0; l < 2; ++l) {
            for (uint32_t m = 0; m < 2; ++m) {
                if ((T)ApartIndiv_2[l][0][0][m] != (T)ApartGroup_2[{l, 0, 0, m}]) {
                    ++errors;
                }
            }
        }

        msg.str("");
        msg << "Errors: " << errors;
        TEST(errors == 0, "Axis (2/2)", msg.str().c_str());
        return failed;
    };

    static uint32_t Slicing() {
        uint32_t failed = 0;
        CT::Array<T> A = CT::Array<T>::constant({4, 5, 5}, 0);

        CT::Array<T> Aslice = A.slice({{0, 3}, {1, 4}, {1, 4}});
        T num = (T)1;
        for (auto it = Aslice.begin(); it != Aslice.end(); ++it) {
            *it = num;
            ++num;
        }

        CT::Array<T> Aslice2 = A[3].slice({{0, 5}, {0, 1}});
        num = (T)-1;
        for (auto it = Aslice2.begin(); it != Aslice2.end(); ++it) {
            *it = num;
            --num;
        }

        uint32_t errors = 0;
        for (int i = 0; i < 3; ++i) {
            for (int j = 0; j < 3; ++j) {
                for (int k = 0; k < 3; ++k) {
                    if ((T)A[i][1 + j][1 + k] != (T)(9 * i + 3 * j + k + 1)) {
                        ++errors;
                    }
                }
            }
        }
        std::ostringstream msg;
        msg << "Errors: " << errors;
        TEST(errors == 0, "Block", msg.str().c_str());

        errors = 0;
        for (int i = 0; i < 5; ++i) {
            if ((T)A[3][i][0] != (T)(-(i + 1))) {
                ++errors;
            }
        }

        msg.str("");
        msg << "Errors: " << errors;
        TEST(errors == 0, "Column", msg.str().c_str());
        return failed;
    }
};

template <typename T> struct BLASTests {
    static double thres;
    static uint32_t GEMV(int attempts) {
        uint32_t failed = 0;
        for (int i = 0; i < attempts; i++) {
            CT::Shape Ashape = makeRandom2DShape();
            CT::Shape xshape = CT::Shape({Ashape.cols(), 1});
            CT::Shape yshape = CT::Shape({Ashape.rows(), 1});

            CT::Array<T> A(Ashape);
            CT::Array<T> x(xshape);
            CT::Array<T> y(yshape);

            A.setRandom(-100, 100);
            x.setRandom(-100, 100);

            A.updateDevice();
            x.updateDevice().wait();

            CT::BLAS::GEMV<T>(1.0, A, x, 0.0, y).wait();
            y.updateHost().wait();

            CT::Array<T> yTest(yshape, true);
            yTest.eigenMap() = A.eigenMap() * x.eigenMap();

            double norm = (yTest.eigenMap() - y.eigenMap()).norm();

            std::ostringstream name;
            name << "GEMV (" << i + 1 << "/" << attempts << ")";
            std::ostringstream msg;
            msg << "Matrix Shape: " << Ashape << ", "
                << "Residual: " << norm;
            TEST(norm < thres, name.str().c_str(), msg.str().c_str());
        }
        return failed;
    };

    static uint32_t GEMVBroadcast() {
        uint32_t failed = 0;
        CT::Shape Ashape = makeRandom2DShape();
        CT::Shape xshape = CT::Shape({Ashape.cols(), 1});
        CT::Shape yshape = CT::Shape({Ashape.rows(), 1});

        CT::Array<T> A({2, 3, Ashape.rows(), Ashape.cols()});
        CT::Array<T> x({2, 3, xshape.rows(), xshape.cols()});
        CT::Array<T> y({2, 3, yshape.rows(), yshape.cols()});

        A.setRandom(-100, 100);
        x.setRandom(-100, 100);

        A.updateDevice();
        x.updateDevice().wait();

        CT::BLAS::GEMV<T>(1.0, A, x, 0.0, y).wait();
        y.updateHost().wait();

        double norm = 0;
        CT::Array<T> yTest(yshape, true);
        for (int i = 0; i < 2; ++i) {
            for (int j = 0; j < 3; ++j) {
                yTest.eigenMap() = A[i][j].eigenMap() * x[i][j].eigenMap();
                norm += (yTest.eigenMap() - y[i][j].eigenMap()).norm();
            }
        }

        std::ostringstream msg;
        msg << "Matrix Shape: " << Ashape << ", "
            << "Residual: " << norm;
        TEST(norm < thres, "GEMV Broadcast", msg.str().c_str());
        return failed;
    };

    static uint32_t GEMM(int attempts) {
        uint32_t failed = 0;
        for (int i = 0; i < attempts; i++) {
            CT::Shape Ashape = makeRandom2DShape();
            CT::Shape Bshape = makeRandom2DShape();
            Bshape = CT::Shape({Ashape.cols(), Bshape.cols()});

            CT::Shape Cshape = CT::Shape({Ashape.rows(), Bshape.cols()});

            CT::Array<T> A(Ashape);
            CT::Array<T> B(Bshape);
            CT::Array<T> C(Cshape);

            A.setRandom(-100, 100);
            B.setRandom(-100, 100);
            C.setRandom(-100, 100);

            A.updateDevice();
            B.updateDevice();
            C.updateDevice().wait();

            CT::BLAS::GEMM<T>(1.0, A, B, 0.0, C).wait();
            C.updateHost().wait();

            CT::Array<T> CTest(Cshape, true);
            CTest.eigenMap() = A.eigenMap() * B.eigenMap();

            double norm = (CTest.eigenMap() - C.eigenMap()).norm();

            std::ostringstream name;
            name << "GEMM (" << i + 1 << "/" << attempts << ")";
            std::ostringstream msg;
            msg << "Matrix Shapes: " << Ashape << Bshape << ", "
                << "Residual: " << norm;
            TEST(norm < thres, name.str().c_str(), msg.str().c_str());
        }
        return failed;
    };

    static uint32_t GEMMBroadcast() {
        uint32_t failed = 0;
        CT::Shape Ashape = makeRandom2DShape();
        CT::Shape Bshape = makeRandom2DShape();
        Bshape = CT::Shape({Ashape.cols(), Bshape.cols()});

        CT::Shape Cshape = CT::Shape({Ashape.rows(), Bshape.cols()});

        CT::Array<T> A({2, 3, Ashape.rows(), Ashape.cols()});
        CT::Array<T> B({2, 3, Bshape.rows(), Bshape.cols()});
        CT::Array<T> C({2, 3, Cshape.rows(), Cshape.cols()});

        A.setRandom(-100, 100);
        B.setRandom(-100, 100);

        A.updateDevice();
        B.updateDevice();
        C.updateDevice().wait();

        CT::BLAS::GEMM<T>(1.0, A, B, 0.0, C).wait();
        C.updateHost().wait();

        double norm = 0;
        CT::Array<T> CTest(Cshape, true);
        for (int i = 0; i < 2; ++i) {
            for (int j = 0; j < 3; ++j) {
                CTest.eigenMap() = A[i][j].eigenMap() * B[i][j].eigenMap();
                norm += (CTest.eigenMap() - C[i][j].eigenMap()).norm();
            }
        }

        std::ostringstream msg;
        msg << "Matrix Shapes: " << Ashape << Bshape << ", "
            << "Residual: " << norm;
        TEST(norm < thres, "GEMM Broadcast", msg.str().c_str());
        return failed;
    };

    static uint32_t PLU() {
        uint32_t failed = 0;
        CT::Shape Ashape = makeRandom2DShape();
        CT::Shape xshape = makeRandom2DShape();
        Ashape = CT::Shape({Ashape.rows(), Ashape.rows()});
        xshape = CT::Shape({Ashape.rows(), xshape.cols()});

        CT::Array<T> A({2, 3, Ashape.rows(), Ashape.rows()});
        CT::Array<T> x({2, 3, xshape.rows(), xshape.cols()});
        CT::Array<T> b({2, 3, xshape.rows(), xshape.cols()});
        CT::Array<T> Ax({2, 3, xshape.rows(), xshape.cols()});

        A.setRandom(-100, 100);
        b.setRandom(-100, 100);

        CT::Array<T> LU(A.copy());
        x = b;

        A.updateDevice();
        LU.updateDevice();
        x.updateDevice().wait();

        CT::BLAS::PLUBatch<T> luBatch(LU);
        CT::BLAS::Batch<T> xBatch(x);
        luBatch.computeLU().wait();
        luBatch.solve(xBatch).wait();

        // Compute Ax and compare difference.
        CT::BLAS::GEMM<T>(1.0, A, x, 0.0, Ax).wait();
        Ax.updateHost().wait();

        double norm = 0;
        for (int i = 0; i < 2; ++i) {
            for (int j = 0; j < 3; ++j) {
                norm += (Ax[i][j].eigenMap() - b[i][j].eigenMap()).norm();
            }
        }

        std::ostringstream msg;
        msg << "Matrix Shape: " << Ashape << xshape << ", "
            << "Residual: " << norm;
        TEST(norm < thres, "PLU/Solve", msg.str().c_str());
        return failed;
    }
};

template <> double BLASTests<float>::thres = 10e-1;
template <> double BLASTests<double>::thres = 10e-8;
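
// The float tolerance is much looser than the double one: the test matrices hold
// values up to 100 with inner dimensions up to 15, so single-precision GEMM/GEMV
// accumulation error can plausibly reach the 1e-1 range, while the double-precision
// residual stays orders of magnitude smaller.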

uint32_t doMacroTests() {
    uint32_t failed = 0;
    failed += MacroTests::Kernel();
    failed += MacroTests::Class();
    std::cout << "\n";
    return failed;
}

template <typename T> uint32_t doArrayTests() {
    uint32_t failed = 0;
    std::cout << boxSmall("Index Tests : " + type<T>()) << "\n";
    failed += ArrayTests<T>::Indexing();
    std::cout << "\n" << boxSmall("Slice Tests : " + type<T>()) << "\n";
    failed += ArrayTests<T>::Slicing();
    std::cout << "\n";
    return failed;
}

template <typename T> uint32_t doBLASTests() {
    uint32_t failed = 0;
    std::cout << boxSmall("GEMV Tests : " + type<T>()) << "\n";
    failed += BLASTests<T>::GEMV(5);
    failed += BLASTests<T>::GEMVBroadcast();

    std::cout << "\n" << boxSmall("GEMM Tests : " + type<T>()) << "\n";
    failed += BLASTests<T>::GEMM(5);
    failed += BLASTests<T>::GEMMBroadcast();

    std::cout << "\n" << boxSmall("PLU Tests : " + type<T>()) << "\n";
    failed += BLASTests<T>::PLU();
    std::cout << "\n";
    return failed;
}

int main() {
    uint32_t failed = 0;
    std::cout << box("Macro Tests") << "\n";
    failed += doMacroTests();

    std::cout << box("Array Tests") << "\n";
    // Test different sizes.
    failed += doArrayTests<uint8_t>();
    failed += doArrayTests<int16_t>();
    failed += doArrayTests<int32_t>();
    failed += doArrayTests<double>();

    std::cout << box("BLAS Tests") << "\n";
    failed += doBLASTests<float>();
    failed += doBLASTests<double>();

    constexpr uint32_t tests = 2 + 4 * 5 + 13 * 2;
    std::ostringstream msg;
    msg << ((failed == 0) ? "\033[1;32mPASS \033[0m(" : "\033[1;31mFAIL \033[0m(")
        << (tests - failed) << "/" << tests << ")";
    std::cout << box2(msg.str()) << "\n";

    return 0;
}