commit
b4e4a49d44
26 changed files with 6120 additions and 0 deletions
@ -0,0 +1,10 @@ |
||||
--- |
||||
BasedOnStyle: LLVM |
||||
IndentWidth: 4 |
||||
ColumnLimit: 100 |
||||
AllowShortIfStatementsOnASingleLine: true |
||||
--- |
||||
Language: Cpp |
||||
DerivePointerAlignment: false |
||||
PointerAlignment: Left |
||||
--- |
@ -0,0 +1,4 @@ |
||||
build |
||||
*CPU |
||||
*GPU |
||||
.venv |
@ -0,0 +1,777 @@ |
||||
#ifndef ARRAY_H |
||||
#define ARRAY_H |
||||
|
||||
#include "Core.h" |
||||
#include "Macros.h" |
||||
#include <Eigen/Dense> |
||||
#include <iomanip> |
||||
#include <math.h> |
||||
#include <random> |
||||
#include <type_traits> |
||||
|
||||
#ifdef DEVICE |
||||
#define POINTER pDevice |
||||
#else |
||||
#define POINTER pHost |
||||
#endif |
||||
|
||||
namespace CudaTools { |
||||
|
||||
template <typename T> |
||||
using EigenMat = Eigen::Matrix<T, Eigen::Dynamic, Eigen::Dynamic, Eigen::ColMajor>; |
||||
template <typename T> using EigenMapMat = Eigen::Map<EigenMat<T>>; |
||||
template <typename T> using ConstEigenMapMat = Eigen::Map<const EigenMat<T>>; |
||||
|
||||
template <typename T> struct EigenAdaptConst { typedef EigenMapMat<T> type; }; |
||||
template <typename T> struct EigenAdaptConst<const T> { typedef ConstEigenMapMat<T> type; }; |
||||
|
||||
#define ENABLE_IF(X) std::enable_if_t<X, bool> |
||||
#define IS_INT(T) std::is_integral<T>::value |
||||
#define IS_FLOAT(T) std::is_floating_point<T>::value |
||||
#define IS_NUM(T) IS_INT(T) or IS_FLOAT(T) |
||||
|
||||
template <typename T> class Array; |
||||
using Slice = std::pair<uint32_t, uint32_t>; |
||||
|
||||
template <typename T> class ArrayIterator { |
||||
private: |
||||
template <typename U> |
||||
friend std::ostream& operator<<(std::ostream& out, const ArrayIterator<U>& it); |
||||
T* pData; |
||||
Shape mShape; |
||||
uint32_t mIndices[CUDATOOLS_ARRAY_MAX_AXES] = {0}; |
||||
|
||||
public: |
||||
HD ArrayIterator(T* p, const Shape& shape) : pData(p), mShape(shape){}; |
||||
|
||||
/**
|
||||
* Moves the iterator to the next value. |
||||
*/ |
||||
HD void next() { |
||||
bool carry = false; |
||||
uint32_t offset = 0; |
||||
for (uint32_t iAxis = mShape.axes() - 1; iAxis < mShape.axes(); --iAxis) { |
||||
if (mIndices[iAxis] == mShape.dim(iAxis) - 1) { |
||||
mIndices[iAxis] = 0; |
||||
offset += mShape.stride(iAxis) * (mShape.dim(iAxis) - 1); |
||||
carry = true; |
||||
} else { |
||||
pData += mShape.stride(iAxis); |
||||
mIndices[iAxis] += 1; |
||||
carry = false; |
||||
} |
||||
|
||||
if (not carry) { |
||||
pData -= offset; |
||||
return; |
||||
} |
||||
} |
||||
pData += 1; // "Overflow" occured, so we reached end of array.
|
||||
} |
||||
|
||||
/**
|
||||
* Moves the iterator to the previous value. |
||||
*/ |
||||
HD void prev() { |
||||
bool carry = false; |
||||
uint32_t offset = 0; |
||||
for (uint32_t iAxis = mShape.axes() - 1; iAxis < mShape.axes(); --iAxis) { |
||||
if (mIndices[iAxis] == 0) { |
||||
mIndices[iAxis] = mShape.dim(iAxis) - 1; |
||||
offset += mShape.stride(iAxis) * (mShape.dim(iAxis) - 1); |
||||
carry = true; |
||||
} else { |
||||
pData -= mShape.stride(iAxis); |
||||
mIndices[iAxis] += 1; |
||||
carry = false; |
||||
} |
||||
if (not carry) { |
||||
pData += offset; |
||||
return; |
||||
} |
||||
} |
||||
pData -= 1; |
||||
} |
||||
|
||||
/**
|
||||
* Moves the iterator a specified value away. |
||||
* \param amount the amount to advance by |
||||
*/ |
||||
HD void advance(const int32_t amount) { |
||||
if (amount < 0) { |
||||
for (uint32_t i = 0; i < abs(amount); ++i) { |
||||
prev(); |
||||
} |
||||
} else { |
||||
for (uint32_t i = 0; i < abs(amount); ++i) { |
||||
next(); |
||||
} |
||||
} |
||||
} |
||||
|
||||
HD void operator++() { next(); }; /**< Prefix increment operator. */ |
||||
HD void operator--() { prev(); }; /**< Prefix decrement operator. */ |
||||
|
||||
/**< Addition operator. */ |
||||
HD ArrayIterator<T> operator+(const int32_t v) const { |
||||
ArrayIterator<T> it = *this; |
||||
it.advance(v); |
||||
return it; |
||||
}; |
||||
|
||||
/** Subtraction operator.*/ |
||||
HD ArrayIterator<T> operator-(const int32_t v) const { |
||||
ArrayIterator<T> it = *this; |
||||
it.advance(-v); |
||||
return it; |
||||
}; |
||||
HD void operator+=(const int32_t v) { advance(v); }; |
||||
HD void operator-=(const int32_t v) { advance(-v); }; |
||||
|
||||
HD T& operator*() { return *pData; }; /**< Dereference operator. */ |
||||
HD const T& operator*() const { return *pData; }; /**< Const dereference operator. */ |
||||
|
||||
/**
|
||||
* Equals operator. |
||||
*/ |
||||
HD bool operator==(const ArrayIterator<T>& it) { return pData == it.pData; } |
||||
|
||||
/**
|
||||
* Not equals operator. |
||||
*/ |
||||
HD bool operator!=(const ArrayIterator<T>& it) { return pData != it.pData; } |
||||
}; |
||||
|
||||
template <typename T> std::ostream& operator<<(std::ostream& out, const ArrayIterator<T>& it) { |
||||
return out << it.pData; |
||||
} |
||||
|
||||
/**
 * Helper returned by Array's operator<< so that comma-separated values can be
 * streamed into an Array (e.g. `arr << 1, 2, 3;`). Each comma writes one
 * value at the cursor and advances it.
 */
template <typename T> class ArrayLoader {
  private:
    ArrayIterator<T> mIterator;    // Next position to be written.
    ArrayIterator<T> mIteratorEnd; // One past the last writable position.

  public:
    HD ArrayLoader(const ArrayIterator<T>& it, const ArrayIterator<T>& it_end)
        : mIterator(it), mIteratorEnd(it_end){};
    // Comma operator: writes `value` at the cursor, advances, and returns
    // *this so further commas keep appending. Errors once the end of the
    // Array is reached.
    HD ArrayLoader &operator,(const T value) {
        CT_ERROR_IF(mIterator, ==, mIteratorEnd, "Cannot assign more values than Array size");
        *mIterator = value;
        ++mIterator;
        return *this;
    }
};
||||
|
||||
/**
|
||||
* A container that holds a N-dimensional array, stored column major. To set the |
||||
* maximum N, there is a compiler macro CUDATOOLS_ARRAY_MAX_DIM whose default value is 4. |
||||
* It adapts to operations between host and device to ease memory management. |
||||
*/ |
||||
template <typename T> class Array { |
||||
private: |
||||
template <typename U> friend std::ostream& operator<<(std::ostream&, const Array<U>&); |
||||
|
||||
Shape mShape; |
||||
T* pHost = nullptr; |
||||
T* pDevice = nullptr; |
||||
|
||||
bool mIsView = false; |
||||
bool mIsSlice = false; |
||||
|
||||
uint32_t mEndOffset = 0; |
||||
|
||||
void freeArrays() { |
||||
#ifndef DEVICE |
||||
if (not mIsView) { |
||||
if (pDevice != nullptr) CudaTools::free(pDevice); |
||||
if (pHost != nullptr) delete[] pHost; |
||||
} |
||||
#endif |
||||
}; |
||||
|
||||
HD void calcEnd() { |
||||
uint32_t offset = 0; |
||||
for (uint32_t i = 0; i < shape().axes(); ++i) { |
||||
offset += (shape().dim(i) - 1) * shape().stride(i); |
||||
} |
||||
mEndOffset = offset + 1; |
||||
}; |
||||
|
||||
public: |
||||
HD Array() = default; |
||||
|
||||
/**
|
||||
* Constructor for an Array that creates an allocates an array with |
||||
* the specified Shape. Construction in this format is disabled on the device. |
||||
* \brief Host only |
||||
* \param shape the shape of the array |
||||
* \param noDevice whether to initialize the array on the device |
||||
*/ |
||||
Array(const Shape& shape, const bool noDevice = false) : mShape(shape), mIsView(false) { |
||||
pHost = new T[shape.items()]; |
||||
calcEnd(); |
||||
if (noDevice) return; |
||||
pDevice = (T*)CudaTools::malloc(shape.items() * sizeof(T)); |
||||
}; |
||||
|
||||
/**
|
||||
* Constructor for an Array from an existing (preallocated) pointer. |
||||
* \param pointer the pointer to use |
||||
* \param shape the shape of the array |
||||
* \param noDevice whether to initialize the array on the device |
||||
*/ |
||||
HD Array(T* const pointer, const Shape& shape, const bool noDevice = false) |
||||
: mShape(shape), mIsView(true), mIsSlice(false) { |
||||
POINTER = pointer; |
||||
calcEnd(); |
||||
#ifndef DEVICE |
||||
if (noDevice) return; |
||||
pDevice = (T*)CudaTools::malloc(shape.items() * sizeof(T)); |
||||
#endif |
||||
}; |
||||
|
||||
/**
|
||||
* Constructor for making a Array view from another Array, |
||||
* given an offset and shape. |
||||
* \param arr the original Array |
||||
* \param shape the shape of the new array |
||||
* \param offset the index where to start the a view of the array |
||||
*/ |
||||
HD Array(const Array& arr, const Shape& shape, const uint32_t offset = 0) |
||||
: mShape(shape), pHost(arr.pHost), pDevice(arr.pDevice), mIsView(true), |
||||
mIsSlice(arr.mIsSlice) { |
||||
calcEnd(); |
||||
if (pHost != nullptr) pHost += offset; |
||||
if (pDevice != nullptr) pDevice += offset; |
||||
}; |
||||
|
||||
/**
|
||||
* The copy-constructor for a Array. If this is not a view, a deep copy |
||||
* of the data will be performed on both host and device. On the device, it is always |
||||
* treated like a view. |
||||
*/ |
||||
HD Array(const Array& arr) : mShape(arr.mShape), mIsView(arr.mIsView), mIsSlice(arr.mIsSlice) { |
||||
calcEnd(); |
||||
if (mIsView) { // If the other array was a view (and now this one), just assign.
|
||||
pHost = arr.pHost; |
||||
pDevice = arr.pDevice; |
||||
return; |
||||
} |
||||
|
||||
// Otherwise, we assume this is needs to own data.
|
||||
pHost = new T[mShape.items()]; |
||||
auto arr_it = arr.begin(); |
||||
for (auto it = begin(); it != end(); ++it) { |
||||
*it = *arr_it; |
||||
++arr_it; |
||||
} |
||||
|
||||
#ifndef DEVICE |
||||
if (arr.pDevice != nullptr) { |
||||
pDevice = (T*)CudaTools::malloc(mShape.items() * sizeof(T)); |
||||
} |
||||
#endif |
||||
}; |
||||
|
||||
/**
|
||||
* The move-constructor for a Array. |
||||
*/ |
||||
HD Array(Array&& arr) |
||||
: mShape(arr.mShape), pHost(arr.pHost), pDevice(arr.pDevice), mIsView(arr.mIsView), |
||||
mIsSlice(arr.mIsSlice) { |
||||
calcEnd(); |
||||
// Make other object empty.
|
||||
arr.pHost = nullptr; |
||||
arr.pDevice = nullptr; |
||||
arr.mIsView = true; |
||||
}; |
||||
|
||||
HD ~Array() { freeArrays(); }; |
||||
|
||||
/**
|
||||
* The copy-assignment operator for a Array. If this is not a view, |
||||
* then the currently owned data will be freed, and a deep copy of the data will |
||||
* be performed on both host and device. On the device, it is always treated like a view. |
||||
*/ |
||||
HD Array& operator=(const Array& arr) { |
||||
if (this == &arr) return *this; |
||||
|
||||
if (mIsView) { // If this array is a view, we assign data from the right-hand side.
|
||||
auto arr_it = arr.begin(); |
||||
for (auto it = begin(); it != end() and arr_it != arr.end(); ++it) { |
||||
*it = *arr_it; |
||||
++arr_it; |
||||
} |
||||
return *this; |
||||
} |
||||
|
||||
// Otherwise, it is implied to be object reassignment.
|
||||
mShape = arr.mShape; |
||||
mIsView = arr.mIsView; |
||||
mIsSlice = arr.mIsSlice; |
||||
calcEnd(); |
||||
|
||||
// Regardless if the right-hand side is a view, we create a new copy.
|
||||
// In case that the right-hand side is a view of this array, we
|
||||
// allocate memory to copy first. Keep in mind that the right-hand side
|
||||
// array will then become undefined.
|
||||
|
||||
// We can only do this on the host.
|
||||
#ifndef DEVICE |
||||
T* new_pDevice = nullptr; |
||||
if (pDevice != nullptr) { |
||||
new_pDevice = (T*)CudaTools::malloc(mShape.items() * sizeof(T)); |
||||
} |
||||
|
||||
T* new_pHost = new T[mShape.items()]; |
||||
memcpy(new_pHost, arr.pHost, mShape.items() * sizeof(T)); |
||||
|
||||
freeArrays(); |
||||
pHost = new_pHost; |
||||
pDevice = new_pDevice; |
||||
#else |
||||
pHost = arr.pHost; |
||||
pDevice = arr.pDevice; |
||||
#endif |
||||
return *this; |
||||
}; |
||||
|
||||
/**
|
||||
* The move-assignment operator for a Array. |
||||
*/ |
||||
HD Array& operator=(Array&& arr) { |
||||
if (this == &arr) return *this; |
||||
|
||||
if (mIsView) { // If this array is a view, we assign data from the right-hand side.
|
||||
auto arr_it = arr.begin(); |
||||
for (auto it = begin(); it != end() and arr_it != arr.end(); ++it) { |
||||
*it = *arr_it; |
||||
++arr_it; |
||||
} |
||||
return *this; |
||||
} |
||||
|
||||
CT_ERROR(arr.mIsView, |
||||
"Cannot move-assign view to a non-view (owner). This would lead to undefined " |
||||
"behavior."); |
||||
|
||||
// Otherwise, it is implied to be object reassignment.
|
||||
freeArrays(); |
||||
mShape = arr.mShape; |
||||
pHost = arr.pHost; |
||||
pDevice = arr.pDevice; |
||||
mIsView = arr.mIsView; |
||||
mIsSlice = arr.mIsSlice; |
||||
calcEnd(); |
||||
|
||||
// Make other array empty.
|
||||
arr.pHost = nullptr; |
||||
arr.pDevice = nullptr; |
||||
arr.mIsView = true; |
||||
return *this; |
||||
}; |
||||
|
||||
/**
|
||||
* Used for indexing the Array. |
||||
* \param index index of the first dimension |
||||
*/ |
||||
HD Array operator[](const uint32_t index) const { |
||||
CT_ERROR_IF(index, >=, shape().dim(0), "Index exceeds axis size"); |
||||
return Array(*this, shape().subshape(1), index * shape().stride(0)); |
||||
}; |
||||
|
||||
/**
|
||||
* Used for indexing the Array. |
||||
* \param indices a list of indices to index the Array |
||||
*/ |
||||
HD Array operator[](const std::initializer_list<uint32_t> indices) const { |
||||
CT_ERROR_IF(indices.size(), >, shape().axes(), |
||||
"Number of indices cannot exceed number of axes"); |
||||
auto it = indices.begin(); |
||||
uint offset = 0; |
||||
for (uint32_t i = 0; i < indices.size(); ++i) { |
||||
uint32_t index = *it; |
||||
CT_ERROR_IF(index, >=, shape().dim(i), "Index exceeds axis size"); |
||||
offset += index * shape().stride(i); |
||||
++it; |
||||
} |
||||
return Array(*this, shape().subshape(indices.size()), offset); |
||||
}; |
||||
|
||||
HD ArrayLoader<T> operator<<(const T value) { |
||||
auto it = begin(); |
||||
*it = value; |
||||
++it; |
||||
return ArrayLoader<T>(it, end()); |
||||
}; |
||||
|
||||
HD T operator=(const T& value) { return POINTER[0] = value; }; |
||||
HD operator T&() { return POINTER[0]; }; |
||||
HD operator const T&() const { return POINTER[0]; }; |
||||
|
||||
/**
|
||||
* Used to create slices of the Array. |
||||
* \param slices a list of slices to slice the Array |
||||
*/ |
||||
HD Array slice(const std::initializer_list<Slice> slices) const { |
||||
CT_ERROR_IF(slices.size(), >, shape().axes(), |
||||
"Number of slices cannot exceed number of axes"); |
||||
|
||||
uint offset = 0; |
||||
Shape new_shape = mShape; |
||||
auto it = slices.begin(); |
||||
for (uint32_t i = 0; i < slices.size(); ++i) { |
||||
uint32_t from_index = it->first; |
||||
uint32_t to_index = it->second; |
||||
CT_ERROR_IF(from_index, >, to_index, |
||||
"Slice start cannot be greater than than slice end"); |
||||
CT_ERROR_IF(from_index, >=, shape().dim(i), "Slice start exceeds axis size"); |
||||
CT_ERROR_IF(to_index - 1, >=, shape().dim(i), "Slice end exceeds axis size"); |
||||
|
||||
offset += from_index * shape().stride(i); |
||||
new_shape.mAxisDim[i] = to_index - from_index; |
||||
++it; |
||||
} |
||||
new_shape.mItems = 1; |
||||
for (uint32_t i = 0; i < shape().axes(); ++i) { |
||||
new_shape.mItems *= new_shape.dim(i); |
||||
} |
||||
|
||||
Array<T> arr(*this, new_shape, offset); |
||||
arr.mIsSlice = true; |
||||
return arr; |
||||
}; |
||||
|
||||
/**
|
||||
* Returns this Array with a different Shape. Its self assigning version is reshape. |
||||
* If this Array is a slice of another, then it will perform a deep copy, and return |
||||
* a new non-view array. |
||||
*/ |
||||
HD Array reshaped(const Shape& new_shape) const { |
||||
CT_ERROR_IF(shape().items(), !=, new_shape.items(), |
||||
"New shape cannot have a different number of terms"); |
||||
if (mIsSlice) { |
||||
Array<T> arr = this->copy(); |
||||
return arr.reshaped(new_shape); |
||||
} |
||||
Array<T> arr = view(); |
||||
arr.mShape = new_shape; |
||||
return arr; |
||||
}; |
||||
|
||||
HD void reshape(const Shape& new_shape) { |
||||
CT_ERROR_IF(shape().items(), !=, new_shape.items(), |
||||
"New shape cannot have a different number of terms"); |
||||
CT_ERROR(mIsSlice, "Cannot reshape slice, a new array must be made. (Try reshaped instead)") |
||||
mShape = new_shape; |
||||
}; |
||||
|
||||
/**
|
||||
* Gets a view that is has at least two dimensions. Useful for promoting |
||||
* single vectors to their 2D counterparts. |
||||
*/ |
||||
HD Array atLeast2D() const { |
||||
return (shape().axes() == 1) ? Array(*this, {shape().length(), 1}) : view(); |
||||
}; |
||||
|
||||
/**
|
||||
* Flattens the Array into one dimension. |
||||
*/ |
||||
HD Array flatten() const { return reshape({mShape.mItems}); }; |
||||
|
||||
/**
|
||||
* Returns the Eigen::Map of this Array. |
||||
*/ |
||||
typename EigenAdaptConst<T>::type eigenMap() const { |
||||
uint32_t total_dim = mShape.mAxes; |
||||
CT_ERROR(mIsSlice, "Mapping to an Eigen array cannot occur on slices") |
||||
CT_ERROR_IF(total_dim, !=, 2, |
||||
"Mapping to an Eigen array can only occur on two-dimensional arrays"); |
||||
return typename EigenAdaptConst<T>::type(POINTER, mShape.rows(), mShape.cols()); |
||||
}; |
||||
|
||||
/**
|
||||
* Gets the Shape of the Array. |
||||
*/ |
||||
HD Shape shape() const { return mShape; }; |
||||
|
||||
/**
|
||||
* Gets the pointer to this array, depending on host or device. |
||||
*/ |
||||
HD T* data() const { return POINTER; }; |
||||
|
||||
/**
|
||||
* Returns the device pointer regardless of host or device. |
||||
*/ |
||||
HD T* dataDevice() const { return pDevice; }; |
||||
|
||||
HD bool isView() const { return mIsView; }; /**< Gets whether this Array is a view. */ |
||||
HD bool isSlice() const { return mIsSlice; }; /**< Gets whether this Array is a slice. */ |
||||
|
||||
/**
|
||||
* Gets a view of this Array. |
||||
*/ |
||||
HD Array view() const { return Array(*this, mShape); } |
||||
|
||||
/**
|
||||
* Copies this Array and returns a new Array with the same memory. |
||||
*/ |
||||
HD Array copy() const { |
||||
Array<T> arr(mShape, (pDevice == nullptr)); |
||||
|
||||
auto arr_it = arr.begin(); |
||||
for (auto it = begin(); it != end(); ++it) { |
||||
*arr_it = *it; |
||||
++arr_it; |
||||
} |
||||
#ifndef DEVICE |
||||
if (pDevice != nullptr) { |
||||
CudaTools::deviceCopy(pDevice, arr.dataDevice(), mShape.items() * sizeof(T)).wait(); |
||||
} |
||||
#endif |
||||
return arr; |
||||
}; |
||||
|
||||
/**
|
||||
* Gets the iterator to the beginning of this Array. |
||||
*/ |
||||
HD ArrayIterator<T> begin() const { return ArrayIterator<T>(POINTER, mShape); }; |
||||
|
||||
/**
|
||||
* Gets the iterator to the end of this Array. |
||||
*/ |
||||
HD ArrayIterator<T> end() const { return ArrayIterator<T>(POINTER + mEndOffset, mShape); }; |
||||
|
||||
/**
|
||||
* Sets the values of the entire Array to a constant. This is restricted to numerical types. |
||||
*/ |
||||
HD void setConstant(const T value) const { |
||||
static_assert(IS_NUM(T), "Function only available on numeric types."); |
||||
for (auto it = begin(); it != end(); ++it) { |
||||
*it = value; |
||||
} |
||||
}; |
||||
|
||||
/**
|
||||
* Sets the Array values with uniform random values in a specified range. This is restricted to |
||||
* numerical types. |
||||
* \brief Host only |
||||
*/ |
||||
void setRandom(const T min, const T max) const { |
||||
static_assert(IS_NUM(T), "Function only available on numeric types."); |
||||
CT_ERROR_IF(max, <, min, "Upper bound of range cannot be larger than lower bound"); |
||||
std::random_device rd; |
||||
std::mt19937 mt(rd()); |
||||
if constexpr (IS_INT(T)) { |
||||
std::uniform_int_distribution<T> dist(min, max); |
||||
for (auto it = begin(); it != end(); ++it) { |
||||
*it = dist(mt); |
||||
} |
||||
} else if constexpr (IS_FLOAT(T)) { |
||||
std::uniform_real_distribution<T> dist(min, max); |
||||
for (auto it = begin(); it != end(); ++it) { |
||||
*it = dist(mt); |
||||
} |
||||
} |
||||
}; |
||||
|
||||
/**
|
||||
* Sets the Array values to start from a value and increment by a specified step. This is |
||||
* restricted to numerical types. |
||||
*/ |
||||
HD void setRange(T min, const T step = 1) const { |
||||
static_assert(IS_NUM(T), "Function only available on numeric types."); |
||||
for (auto it = begin(); it != end(); ++it) { |
||||
*it = min; |
||||
min += step; |
||||
} |
||||
} |
||||
/**
|
||||
* Sets the Array values to be evenly spaced numbers over a given interval. This is restricted |
||||
* to floating point types. |
||||
*/ |
||||
HD void setLinspace(const T min, const T max) const { |
||||
static_assert(IS_FLOAT(T), "Function only available on numeric floating types."); |
||||
CT_ERROR_IF(max, <, min, "Upper bound of range cannot be larger than lower bound"); |
||||
T i = 0; |
||||
T d = max - min; |
||||
T items = (T)(shape().items() - 1); |
||||
for (auto it = begin(); it != end(); ++it) { |
||||
*it = min + d * (i / items); |
||||
i += 1; |
||||
} |
||||
}; |
||||
|
||||
/**
|
||||
* Returns array of given shape with constant values. This is restricted to numerical types. |
||||
* \brief Host only |
||||
*/ |
||||
static Array constant(const Shape& shape, const T value) { |
||||
static_assert(IS_NUM(T), "Function only available on numeric types."); |
||||
Array<T> arr(shape); |
||||
arr.setConstant(value); |
||||
return arr; |
||||
}; |
||||
|
||||
/**
|
||||
* Returns array of given shape with random values in given interval. This is restricted to |
||||
* numerical types. |
||||
* \brief Host only |
||||
*/ |
||||
static Array random(const Shape& shape, const T min, const T max) { |
||||
static_assert(IS_NUM(T), "Function only available on numeric types."); |
||||
Array<T> arr(shape); |
||||
arr.setRandom(min, max); |
||||
return arr; |
||||
}; |
||||
|
||||
/**
|
||||
* Returns evenly spaced values within a given interval. This is restricted to numerical types. |
||||
* \brief Host only |
||||
*/ |
||||
static Array range(const T min, const T max, const T step = 1) { |
||||
static_assert(IS_NUM(T), "Function only available on numeric types."); |
||||
CT_ERROR_IF(max, <, min, "Upper bound of range cannot be larger than lower bound"); |
||||
Array<T> arr({(uint32_t)((max - min) / step)}); |
||||
arr.setRange(min, step); |
||||
return arr; |
||||
} |
||||
|
||||
/**
|
||||
* Returns evenly spaced values within a given interval. This is restricted to floating point |
||||
* types. |
||||
* \brief Host only |
||||
*/ |
||||
static Array linspace(const T min, const T max, const uint32_t size) { |
||||
static_assert(IS_FLOAT(T), "Function only available on numeric floating types."); |
||||
Array<T> arr({size}); |
||||
arr.setLinspace(min, max); |
||||
return arr; |
||||
} |
||||
|
||||
/**
|
||||
* Transposes the internal data and returns the corresponding new Array. |
||||
* Its self assigning version is transpose. This is restricted to numerical types. |
||||
* \brief Host only |
||||
*/ |
||||
Array transposed() const { |
||||
static_assert(IS_NUM(T), "Function only available on numeric types."); |
||||
CT_ERROR_IF(shape().axes(), !=, 2, "Tranpose can only occur on two-dimensional arrays"); |
||||
Array<T> new_arr({mShape.rows(), mShape.cols()}); |
||||
new_arr.eigenMap() = this->eigenMap().transpose().eval(); |
||||
return new_arr; |
||||
}; |
||||
|
||||
/**
|
||||
* Transposes the intenal data. Its self assigning version is transpose. |
||||
* This is restricted to numerical types. |
||||
* \brief Host only |
||||
*/ |
||||
void transpose() { |
||||
static_assert(IS_NUM(T), "Function only available on numeric types."); |
||||
CT_ERROR_IF(shape().axes(), !=, 2, "Tranpose can only occur on two-dimensional arrays"); |
||||
Array<T> new_arr(*this, {mShape.cols(), mShape.rows()}); |
||||
new_arr.eigenMap() = this->eigenMap().transpose().eval(); |
||||
mShape = Shape({mShape.cols(), mShape.rows()}); |
||||
}; |
||||
|
||||
void inverse() const { |
||||
static_assert(IS_FLOAT(T), "Function only available on floating numeric types."); |
||||
CT_ERROR_IF(shape().axes(), !=, 2, "Inverse can only occur on two-dimensional arrays"); |
||||
CT_ERROR_IF(shape().rows(), !=, shape().cols(), |
||||
"Inverse can only occur on square matrices"); |
||||
Array<T> inv(shape()); |
||||
inv.eigenMap() = this->eigenMap().inverse(); |
||||
}; |
||||
|
||||
/**
|
||||
* Pins the memory (page locks) for faster memory transfer in concurrent |
||||
* transfers. |
||||
* \brief Host only |
||||
*/ |
||||
void pinMemory() const { CudaTools::pin(pHost, mShape.items() * sizeof(T)); }; |
||||
|
||||
/**
|
||||
* Updates the host copy by copying the device data back to the host. |
||||
* \brief Host only |
||||
*/ |
||||
StreamID updateHost(const StreamID& stream = DEF_MEM_STREAM) const { |
||||
CT_ERROR(mIsView, "Cannot update host on a view"); |
||||
CudaTools::pull(pHost, pDevice, mShape.items() * sizeof(T), stream); |
||||
return stream; |
||||
}; |
||||
|
||||
/**
|
||||
* Updates the device copy by copying the host data to the device. |
||||
* \brief Host only |
||||
*/ |
||||
StreamID updateDevice(const StreamID& stream = DEF_MEM_STREAM) const { |
||||
CT_ERROR(mIsView, "Cannot update device on a view"); |
||||
CudaTools::push(pHost, pDevice, mShape.items() * sizeof(T), stream); |
||||
return stream; |
||||
}; |
||||
}; |
||||
|
||||
/**
 * Recursively pretty-prints an Array to a stream, one axis per recursion
 * level. Vectors print as "[a, b, c]", matrices as aligned rows, and higher
 * ranks as indented blocks separated by blank lines.
 * \param out the output stream
 * \param arr the (sub)array to print
 * \param axis the current recursion depth, used for indentation
 * \param width the padded column width per element (0 disables padding)
 */
template <typename T>
void printAxis(std::ostream& out, const Array<T>& arr, const uint32_t axis, size_t width) {
    std::string space = std::string(2 * axis, ' ');
    if (arr.shape().axes() == 1) {
        // Base case: print a vector as [a, b, c].
        out << "[";
        for (uint32_t i = 0; i < arr.shape().items(); ++i) {
            if constexpr (std::is_floating_point<T>::value) {
                out << std::scientific << std::setprecision(6);
            }
            if (width == 0) {
                out << ((i == 0) ? "" : " ");
            } else {
                // The first element pads one column less to account for "[".
                out << std::setw((i == 0) ? width - 1 : width);
            }
            out << (T)arr[i] << ((i == arr.shape().items() - 1) ? "]" : ",");
        }
    } else if (arr.shape().axes() == 2) {
        // Matrix: one row per line, rows after the first aligned under "[".
        for (uint32_t i = 0; i < arr.shape().dim(0); ++i) {
            out << space << ((i == 0) ? "[" : " ");
            printAxis(out, arr[i], axis + 1, width);
            out << ((i == arr.shape().dim(0) - 1) ? "]" : ",\n");
        }
    } else {
        // Higher rank: print each sub-block, separated by blank lines.
        out << space << "[\n";
        for (uint32_t i = 0; i < arr.shape().dim(0); ++i) {
            printAxis(out, arr[i], axis + 1, width);
            out << ((i == arr.shape().dim(0) - 1) ? "\n" : ",\n\n");
        }
        out << space << "]";
    }
}
||||
|
||||
template <typename T> std::ostream& operator<<(std::ostream& out, const Array<T>& arr) { |
||||
size_t width = 0; |
||||
if constexpr (IS_NUM(T)) { |
||||
T max_val = 0; |
||||
bool negative = false; |
||||
for (auto it = arr.begin(); it != arr.end(); ++it) { |
||||
if (*it < 0) negative = true; |
||||
max_val = (abs(*it) > max_val) ? abs(*it) : max_val; |
||||
} |
||||
width = std::to_string(max_val).size() + 1; |
||||
width += (negative) ? 1 : 0; |
||||
} else if constexpr (IS_FLOAT(T)) { |
||||
T max_val = 0; |
||||
bool negative = false; |
||||
for (auto it = arr.begin(); it != arr.end(); ++it) { |
||||
if (*it < 0) negative = true; |
||||
int exp = 0; |
||||
frexp(*it, &exp); |
||||
max_val = (exp > max_val) ? exp : max_val; |
||||
} |
||||
width = std::to_string(max_val).size() + 5; |
||||
width += (negative) ? 1 : 0; |
||||
} |
||||
|
||||
printAxis<T>(out, arr, 0, (arr.shape().axes() == 1) ? 0 : width); |
||||
return out; |
||||
} |
||||
|
||||
}; // namespace CudaTools
|
||||
|
||||
#endif // ARRAY_H
|
@ -0,0 +1,600 @@ |
||||
#ifndef BLAS_H |
||||
#define BLAS_H |
||||
|
||||
#include "Array.h" |
||||
#include "Core.h" |
||||
#include "Macros.h" |
||||
|
||||
namespace CudaTools { |
||||
|
||||
namespace BLAS { |
||||
|
||||
// Describes how a batched BLAS call walks its operands: the per-matrix
// element stride for each of A, B and C (0 means that operand is a single
// matrix broadcast across the batch) and the number of products in the batch.
struct BatchInfo {
    uint32_t strideA, strideB, strideC; // Elements between consecutive matrices.
    uint32_t size;                      // Number of matrix products in the batch.
};
||||
|
||||
template <typename T> struct Check { |
||||
static void isAtLeast2D(const Array<T>& arr, const std::string& name = "Array") { |
||||
CT_ERROR_IF(arr.shape().axes(), <, 2, (name + " needs to be at least 2D").c_str()); |
||||
}; |
||||
|
||||
static void isSquare(const Array<T>& arr, const std::string& name = "Array") { |
||||
isAtLeast2D(arr, name); |
||||
CT_ERROR_IF(arr.shape().rows(), !=, arr.shape().cols(), (name + " is not square").c_str()) |
||||
}; |
||||
|
||||
static void isValidMatmul(const Array<T>& A, const Array<T>& B, const Array<T>& C, |
||||
const std::string& nameA = "A", const std::string& nameB = "B", |
||||
const std::string nameC = "C") { |
||||
isAtLeast2D(A, nameA); |
||||
isAtLeast2D(B, nameB); |
||||
isAtLeast2D(C, nameB); |
||||
CT_ERROR_IF(A.shape().cols(), !=, B.shape().rows(), |
||||
(nameA + nameB + " is not a valid matrix multiplication").c_str()); |
||||
|
||||
Shape ABshape({A.shape().rows(), B.shape().cols()}); |
||||
Shape Cshape({C.shape().rows(), C.shape().cols()}); |
||||
|
||||
CT_ERROR_IF( |
||||
ABshape, !=, Cshape, |
||||
("The shape of " + nameA + nameB + " does not match the shape of " + nameC).c_str()); |
||||
}; |
||||
|
||||
static uint32_t getUpperItems(const Array<T>& arr) { |
||||
uint32_t upperItems = 1; |
||||
for (uint32_t iAxis = 0; iAxis < arr.shape().axes() - 2; ++iAxis) { |
||||
upperItems *= arr.shape().dim(iAxis); |
||||
} |
||||
return upperItems; |
||||
}; |
||||
|
||||
static void matchUpperShape(const Array<T>& A, const Array<T>& B, |
||||
const std::string& nameA = "A", const std::string& nameB = "B") { |
||||
CT_ERROR_IF(A.shape().axes(), !=, B.shape().axes(), |
||||
(nameA + " and " + nameB + " shapes do not match for broadcasting").c_str()); |
||||
for (uint32_t iAxis = 0; iAxis < A.shape().axes() - 2; ++iAxis) { |
||||
uint32_t Adim = A.shape().dim(iAxis); |
||||
uint32_t Bdim = B.shape().dim(iAxis); |
||||
CT_ERROR_IF( |
||||
Adim, !=, Bdim, |
||||
(nameA + " and " + nameB + " shapes do not match for broadcasting").c_str()); |
||||
} |
||||
}; |
||||
|
||||
static BatchInfo isBroadcastable(const Array<T>& A, const Array<T>& B, const Array<T>& C, |
||||
const std::string& nameA = "A", const std::string& nameB = "B", |
||||
const std::string nameC = "C") { |
||||
isValidMatmul(A, B, C, nameA, nameB, nameC); |
||||
uint32_t itemsA = getUpperItems(A); |
||||
uint32_t itemsB = getUpperItems(B); |
||||
uint32_t itemsC = getUpperItems(C); |
||||
|
||||
uint32_t Asize = A.shape().rows() * A.shape().cols(); |
||||
uint32_t Bsize = B.shape().rows() * B.shape().cols(); |
||||
uint32_t Csize = C.shape().rows() * C.shape().cols(); |
||||
|
||||
if (itemsA == itemsB) { |
||||
CT_ERROR_IF(itemsA, !=, itemsC, |
||||
("Incorrect dimensions to broadcast to output " + nameC).c_str()); |
||||
matchUpperShape(A, B, nameA, nameB); |
||||
matchUpperShape(A, C, nameA, nameC); |
||||
return BatchInfo{Asize, Bsize, Csize, itemsC}; |
||||
} else if (itemsA > itemsB) { |
||||
CT_ERROR_IF( |
||||
itemsB, !=, 1, |
||||
("Cannot broadcast operation to " + nameB + " with non-matching " + nameA).c_str()); |
||||
CT_ERROR_IF(itemsA, !=, itemsC, |
||||
("Incorrect dimensions to broadcast to output " + nameC).c_str()); |
||||
matchUpperShape(A, C, nameA, nameC); |
||||
return BatchInfo{Asize, 0, Csize, itemsC}; |
||||
} else { |
||||
CT_ERROR_IF( |
||||
itemsA, !=, 1, |
||||
("Cannot broadcast operation to " + nameA + " with non-matching " + nameB).c_str()); |
||||
CT_ERROR_IF(itemsA, !=, itemsC, |
||||
("Incorrect dimensions to broadcast to output " + nameC).c_str()); |
||||
matchUpperShape(B, C, nameB, nameC); |
||||
return BatchInfo{0, Bsize, Csize, itemsC}; |
||||
} |
||||
}; |
||||
}; |
||||
|
||||
/**
|
||||
* Represents a Batch of Arrays with the same shape. Mainly used for cuBLAS functions. |
||||
*/ |
||||
template <typename T> class Batch { |
||||
protected: |
||||
Array<T*> mBatch; |
||||
Shape mShape; |
||||
|
||||
uint32_t mCount = 0; |
||||
uint32_t mBatchSize; |
||||
|
||||
public: |
||||
Batch() = delete; |
||||
|
||||
/**
|
||||
* Constructs a batch from a given size. |
||||
*/ |
||||
Batch(const uint32_t size) : mBatchSize(size){}; |
||||
|
||||
/**
|
||||
* Constructs a batch from a non-view Array. |
||||
*/ |
||||
Batch(const Array<T>& arr) { |
||||
CT_ERROR(arr.isView(), "Array cannot be a view"); |
||||
mShape = Shape({arr.shape().rows(), arr.shape().cols()}); |
||||
mBatchSize = mCount = Check<T>::getUpperItems(arr); |
||||
|
||||
mBatch = Array<T*>({mBatchSize}); |
||||
|
||||
Array<T> batch = arr.reshaped({mBatchSize, mShape.rows(), mShape.cols()}); |
||||
for (uint32_t i = 0; i < mBatchSize; ++i) { |
||||
#ifdef CUDA |
||||
mBatch[i] = batch[i].dataDevice(); |
||||
#else |
||||
mBatch[i] = batch[i].data(); |
||||
#endif |
||||
} |
||||
|
||||
mBatch.updateDevice().wait(); |
||||
}; |
||||
|
||||
/**
|
||||
* Adds a matrix to the batch. Array must be a view. |
||||
*/ |
||||
void add(const Array<T>& arr) { |
||||
CT_ERROR(not arr.isView(), "Cannot add non-view Arrays"); |
||||
CT_ERROR_IF(mCount, ==, mBatchSize, "Batch is full, cannot add more arrays"); |
||||
#ifdef CUDA |
||||
mBatch[mCount] = arr.dataDevice(); |
||||
#else |
||||
mBatch[mCount] = arr.data(); |
||||
#endif |
||||
if (mCount == 0) { |
||||
mShape = arr.shape(); |
||||
mBatchSize = mCount = Check<T>::getUpperItems(arr); |
||||
} else { |
||||
CT_ERROR_IF(arr.shape(), !=, mShape, "Cannot add matrix of different shape to batch"); |
||||
} |
||||
++mCount; |
||||
|
||||
if (mCount == mBatchSize) { |
||||
mBatch.updateDevice().wait(); |
||||
} |
||||
}; |
||||
|
||||
/**
|
||||
* Indexing operator which returns a view of the Array in the Batch at the given index. |
||||
*/ |
||||
Array<T> operator[](const uint32_t index) const { |
||||
CT_ERROR_IF(index, >=, mBatchSize, "Index exceeds batch size"); |
||||
return Array<T>(mBatch[index], {mShape.rows(), mShape.cols()}); |
||||
}; |
||||
|
||||
/**
|
||||
* Returns the batch Array of pointers. |
||||
*/ |
||||
Array<T*> batch() const { return mBatch.view(); }; |
||||
Shape shape() const { return mShape; } /**< Gets the shape of the matrices in the batch. */ |
||||
uint32_t size() const { return mBatchSize; } /**< Gets the batch size.*/ |
||||
bool full() const { return mBatchSize == mCount; }; /**< Gets if the batch is full. */ |
||||
}; |
||||
|
||||
////////////////
|
||||
// cuBLAS API //
|
||||
////////////////
|
||||
|
||||
// Dispatches a cuBLAS-style call to the single-precision (f1) or double-precision (f2)
// entry point depending on T; any other type is a runtime error.
// NOTE(review): the fallback is a runtime CT_ERROR rather than a static_assert —
// presumably so the template always instantiates; confirm this is intentional.
template <typename T, typename F1, typename F2, typename... Args>
constexpr void invoke(F1 f1, F2 f2, Args&&... args) {
    if constexpr (std::is_same<T, float>::value) {
        CUBLAS_CHECK(f1(args...));
    } else if constexpr (std::is_same<T, double>::value) {
        CUBLAS_CHECK(f2(args...));
    } else {
        CT_ERROR(true, "BLAS functions are not callable with that type");
    }
}
||||
|
||||
/**
|
||||
* Computes the matrix-vector product: \f$ y = \alpha Ax + \beta y \f$. It will automatically |
||||
* broadcast the operation if applicable. |
||||
*/ |
||||
template <typename T> |
||||
StreamID GEMV(const T alpha, const Array<T>& A, const Array<T>& x, const T beta, const Array<T>& y, |
||||
const StreamID& stream = DEF_CUBLAS_STREAM) { |
||||
|
||||
BatchInfo bi = Check<T>::isBroadcastable(A, x, y, "A", "x", "y"); |
||||
CT_ERROR_IF(x.shape().cols(), !=, 1, "x must be a column vector"); |
||||
CT_ERROR_IF(y.shape().cols(), !=, 1, "x must be a column vector"); |
||||
|
||||
uint32_t rows = A.shape().rows(); |
||||
uint32_t cols = A.shape().cols(); |
||||
T a = alpha, b = beta; |
||||
#ifdef CUDA |
||||
CUBLAS_CHECK( |
||||
cublasSetStream(Manager::get()->cublasHandle(), Manager::get()->stream(stream.id))); |
||||
if (bi.size == 1) { |
||||
invoke<T>(cublasSgemv, cublasDgemv, Manager::get()->cublasHandle(), CUBLAS_OP_N, rows, cols, |
||||
&a, A.dataDevice(), rows, x.dataDevice(), 1, &b, y.dataDevice(), 1); |
||||
|
||||
} else { // Greater than 2, so broadcast.
|
||||
invoke<T>(cublasSgemvStridedBatched, cublasDgemvStridedBatched, |
||||
Manager::get()->cublasHandle(), CUBLAS_OP_N, rows, cols, &a, A.dataDevice(), rows, |
||||
bi.strideA, x.dataDevice(), 1, bi.strideB, &b, y.dataDevice(), 1, bi.strideC, |
||||
bi.size); |
||||
} |
||||
|
||||
#else |
||||
if (bi.size == 1) { |
||||
y.eigenMap() = a * (A.eigenMap() * x.eigenMap()) + b * y.eigenMap(); |
||||
} else { // Greater than 2, so broadcast.
|
||||
#pragma omp parallel for |
||||
for (uint32_t i = 0; i < bi.size; ++i) { |
||||
auto Ai = Array<T>(A, {rows, cols}, i * bi.strideA).eigenMap(); |
||||
auto xi = Array<T>(x, {cols, 1}, i * bi.strideB).eigenMap(); |
||||
auto yi = Array<T>(y, {rows, 1}, i * bi.strideC).eigenMap(); |
||||
yi = a * (Ai * xi) + b * yi; |
||||
} |
||||
} |
||||
#endif |
||||
return StreamID{stream}; |
||||
} |
||||
|
||||
/**
 * Computes the matrix-matrix product: \f$ C = \alpha AB + \beta C \f$. It will automatically
 * broadcast the operation if applicable.
 */
template <typename T>
StreamID GEMM(const T alpha, const Array<T>& A, const Array<T>& B, const T beta, const Array<T>& C,
              const StreamID& stream = DEF_CUBLAS_STREAM) {

    BatchInfo bi = Check<T>::isBroadcastable(A, B, C, "A", "B", "C");
    // A is m x k, B is k x n.
    uint32_t m = A.shape().rows();
    uint32_t k = A.shape().cols();
    uint32_t n = B.shape().cols();

    // Local copies so host pointers to the scalars can be taken.
    T a = alpha, b = beta;
#ifdef CUDA
    CUBLAS_CHECK(
        cublasSetStream(Manager::get()->cublasHandle(), Manager::get()->stream(stream.id)));
    if (bi.size == 1) {
        // Leading dimensions are the row counts (column-major storage).
        invoke<T>(cublasSgemm, cublasDgemm, Manager::get()->cublasHandle(), CUBLAS_OP_N,
                  CUBLAS_OP_N, m, n, k, &a, A.dataDevice(), m, B.dataDevice(), k, &b,
                  C.dataDevice(), m);

    } else { // bi.size > 1, so broadcast across the upper (batch) dimensions.
        invoke<T>(cublasSgemmStridedBatched, cublasDgemmStridedBatched,
                  Manager::get()->cublasHandle(), CUBLAS_OP_N, CUBLAS_OP_N, m, n, k, &a,
                  A.dataDevice(), m, bi.strideA, B.dataDevice(), k, bi.strideB, &b, C.dataDevice(),
                  m, bi.strideC, bi.size);
    }

#else
    if (bi.size == 1) {
        C.eigenMap() = a * (A.eigenMap() * B.eigenMap()) + b * C.eigenMap();
    } else { // bi.size > 1, so broadcast across the upper (batch) dimensions.
#pragma omp parallel for
        for (uint32_t i = 0; i < bi.size; ++i) {
            // Views into the i-th operand; a zero stride re-uses the same data.
            auto Ai = Array<T>(A, {m, k}, i * bi.strideA).eigenMap();
            auto Bi = Array<T>(B, {k, n}, i * bi.strideB).eigenMap();
            auto Ci = Array<T>(C, {m, n}, i * bi.strideC).eigenMap();
            Ci = a * (Ai * Bi) + b * Ci;
        }
    }
#endif
    return StreamID{stream};
}
||||
|
||||
/**
|
||||
* Computes the diagonal matrix multiplication: \f$ C = A\mathrm{diag}(X) \f$, or \f$ C = |
||||
* \mathrm{diag}(X)A \f$ if left = true. |
||||
*/ |
||||
template <typename T> |
||||
StreamID DGMM(const Array<T>& A, const Array<T>& X, const Array<T>& C, const bool left = false, |
||||
const StreamID& stream = DEF_CUBLAS_STREAM) { |
||||
CT_ERROR_IF(X.shape().cols(), !=, 1, "'x' must be a column vector."); |
||||
if (left) { |
||||
CT_ERROR_IF(A.shape().rows(), !=, X.shape().rows(), |
||||
"Rows of 'A' and length of 'x' need to match."); |
||||
} else { |
||||
CT_ERROR_IF(A.shape().cols(), !=, X.shape().rows(), |
||||
"Columns of 'A' and length of 'x' need to match."); |
||||
} |
||||
CT_ERROR_IF(A.shape().rows(), !=, C.shape().rows(), |
||||
"Rows of 'A' and rows() of 'C' need to match."); |
||||
CT_ERROR_IF(A.shape().cols(), !=, C.shape().cols(), |
||||
"Rows of 'A' and columns of 'C' need to match."); |
||||
|
||||
#ifdef CUDA |
||||
uint32_t m = C.shape().rows(); |
||||
uint32_t n = C.shape().cols(); |
||||
auto mode = (left) ? CUBLAS_SIDE_LEFT : CUBLAS_SIDE_RIGHT; |
||||
CUBLAS_CHECK( |
||||
cublasSetStream(Manager::get()->cublasHandle(), Manager::get()->stream(stream.id))); |
||||
invoke<T>(cublasSdgmm, cublasDdgmm, Manager::get()->cublasHandle(), m, n, A.dataDevice(), |
||||
A.shape().rows(), X.dataDevice(), 1, C.dataDevice(), m); |
||||
#else |
||||
if (left) { |
||||
C.eigenMap() = X.eigenMap().asDiagonal() * A.eigenMap(); |
||||
} else { |
||||
C.eigenMap() = A.eigenMap() * X.eigenMap().asDiagonal(); |
||||
} |
||||
#endif |
||||
return StreamID{stream}; |
||||
} |
||||
|
||||
//////////////////////////////
|
||||
// PLUArray Related Objects //
|
||||
//////////////////////////////
|
||||
|
||||
///////////////////////////
|
||||
// PartialPivLU Wrapper //
|
||||
///////////////////////////
|
||||
|
||||
// This class is just a workaround to use Eigen's internals directly.
template <typename T> class PartialPivLU;
namespace internal {
// Shared 1x1 placeholder storage used to default-construct the Eigen maps and
// decompositions below before make() rebinds them to real storage.
template <typename T> static Array<T> empty({1, 1});
template <typename T> static EigenMapMat<T> empty_map = empty<T>.eigenMap();
}; // namespace internal
|
||||
|
||||
template <typename T, ENABLE_IF(IS_FLOAT(T)) = true> class PLUArray;
// This is a wrapper class for Eigen's class so we have more controlled access to
// the underlying data.
template <typename T> class PartialPivLU : public Eigen::PartialPivLU<Eigen::Ref<EigenMat<T>>> {
  private:
    using Base = Eigen::PartialPivLU<Eigen::Ref<EigenMat<T>>>;
    template <typename U, ENABLE_IF(IS_FLOAT(U))> friend class PLUArray;

    // Maps over external storage supplied via make(); rebound with placement new
    // because Eigen maps cannot be reassigned.
    EigenMapMat<T> mMapLU;
    EigenMapMat<int32_t> mMapPivots;

  public:
    // Default-constructs against a shared 1x1 placeholder; make() must be called
    // before the decomposition is used on real data.
    PartialPivLU()
        : Base(internal::empty_map<T>), mMapLU(internal::empty_map<T>),
          mMapPivots(internal::empty_map<int32_t>){};

    // Rebinds this decomposition to the given LU and pivot storage without copying.
    void make(const Array<T>& lu, const Array<int32_t>& pivots) {

        new (&mMapLU) EigenMapMat<T>(lu.eigenMap());
        new (&mMapPivots) EigenMapMat<int32_t>(pivots.atLeast2D().eigenMap());

        // Re-seat Eigen's protected members onto the new storage.
        new (&this->m_lu) decltype(Base::m_lu)(mMapLU.derived());
        new (&this->m_p) decltype(Base::m_p)(mMapPivots.derived());

        // new (&this->m_rowsTranspositions) decltype(Base::m_rowsTranspositions)(
        //     mMapPivots.derived());

        // Mark the decomposition as initialized so Eigen's internal checks accept it.
        this->m_l1_norm = 0;
        this->m_det_p = 0;
        this->m_isInitialized = true;
    };
};
||||
|
||||
namespace internal {
// We only create one blank wrapper and copy-construct from it, to avoid
// re-running the placeholder initialization in PartialPivLU's default constructor.
template <typename T> static PartialPivLU<T> BlankPPLU = PartialPivLU<T>();
}; // namespace internal
|
||||
|
||||
/**
 * Class for storing the PLU decomposition of an Array. This is restricted to floating point
 * types.
 */
template <typename T, ENABLE_IF(IS_FLOAT(T))> class PLUArray {
  private:
    Array<T> mLU;           // Matrix storage; holds the factored data after computeLU().
    Array<int32_t> mPivots; // Row-pivot indices for the factorization.
    // Copy of the shared blank wrapper; rebound to mLU/mPivots via make().
    PartialPivLU<T> mPPLU = internal::BlankPPLU<T>;

  public:
    PLUArray() = delete;

    /**
     * Constructor for a PLUArray given the matrix dimension.
     */
    PLUArray(const uint32_t n) : mLU({n, n}), mPivots({n}) { mPPLU.make(mLU, mPivots); };

    /**
     * Constructor for a PLUArray given an existing array.
     */
    PLUArray(const Array<T>& arr)
        : mLU((arr.isView()) ? arr.view() : arr), mPivots({arr.shape().rows()}) {
        CT_ERROR_IF(mLU.shape().axes(), !=, 2, "Array must be a 2D matrix");
        CT_ERROR_IF(mLU.shape().rows(), !=, mLU.shape().cols(), "Matrix must be square");
        mPPLU.make(mLU, mPivots);
    };

    /**
     * Constructor for a PLUArray given an existing location in memory for both the matrix and
     * the pivots.
     */
    PLUArray(const Array<T>& arr, const Array<int32_t> pivots)
        : mLU(arr.view()), mPivots(pivots.view()) {
        CT_ERROR_IF(mLU.shape().axes(), !=, 2, "Array must be a 2D matrix");
        CT_ERROR_IF(mLU.shape().rows(), !=, mLU.shape().cols(), "Matrix must be square");
        mPPLU.make(mLU, mPivots);
    };

    uint32_t rank() { return mLU.shape().rows(); }; /**< Gets the rank of the LU matrix. */
    Array<T> LU() const { return mLU.view(); };     /**< Gets the LU matrix. */
    Array<int32_t> pivots() const { return mPivots.view(); }; /**< Gets the pivot indices. */

    /**
     * Computes the inplace LU factorization for this array on CPU.
     */
    void computeLU() {
        mPPLU.compute();
        // Copy Eigen's permutation back into our pivot storage.
        mPPLU.mMapPivots = mPPLU.permutationP().indices();
    };

    /**
     * Solves the system \f$ LUx = b \f$ and returns \f$x\f$.
     */
    Array<T> solve(const Array<T>& b) {
        Array<T> x(b.shape());
        x.eigenMap() = mPPLU.solve(b.eigenMap());
        return x;
    };
};
||||
|
||||
/**
|
||||
* This is a batch version of PLUArray, to enable usage of the cuBLAS API. This is restricted to |
||||
* floating point types. |
||||
*/ |
||||
template <typename T, std::enable_if_t<std::is_floating_point<T>::value, bool> = true> |
||||
class PLUBatch : public Batch<T> { |
||||
private: |
||||
Array<int32_t> mPivotsBatch; |
||||
Array<int32_t> mInfoLU; |
||||
int32_t mInfoSolve; |
||||
|
||||
bool mInitialized = false; |
||||
|
||||
public: |
||||
/**
|
||||
* Constructor of a PLUBatch from a given batch size. |
||||
*/ |
||||
PLUBatch(const uint32_t size) : Batch<T>(size), mInfoLU({size}){}; |
||||
|
||||
/**
|
||||
* Constructor of a PLUBatch from a multi-dimensional array, batched across upper dimensions. |
||||
*/ |
||||
PLUBatch(const Array<T>& arr) : Batch<T>(arr) { |
||||
Check<T>::isSquare(arr, "LU Array"); |
||||
|
||||
mPivotsBatch = Array<int32_t>({this->mBatchSize * this->mShape.rows()}); |
||||
mInfoLU = Array<int32_t>({this->mBatchSize}); |
||||
}; |
||||
|
||||
/**
|
||||
* Indexing operator which returns the PLUArray in the PLUBatch at the given index. |
||||
*/ |
||||
PLUArray<T> operator[](const uint32_t index) const { |
||||
CT_ERROR_IF(index, >=, this->mBatchSize, "Index exceeds batch size"); |
||||
Array<T> lu(this->mBatch[index], {this->mShape.rows(), this->mShape.cols()}); |
||||
Array<int32_t> pivots(mPivotsBatch.data() + index * this->mShape.rows(), |
||||
{this->mShape.rows()}); |
||||
return PLUArray<T>(lu, pivots); |
||||
}; |
||||
|
||||
/**
|
||||
* Computes the inplace PLU decomposition of batch of arrays. |
||||
*/ |
||||
StreamID computeLU(const StreamID& stream = DEF_CUBLAS_STREAM) { |
||||
#ifdef CUDA |
||||
uint32_t n = this->mShape.rows(); |
||||
CUBLAS_CHECK( |
||||
cublasSetStream(Manager::get()->cublasHandle(), Manager::get()->stream(stream.id))); |
||||
invoke<T>(cublasSgetrfBatched, cublasDgetrfBatched, Manager::get()->cublasHandle(), n, |
||||
this->mBatch.dataDevice(), n, mPivotsBatch.dataDevice(), mInfoLU.dataDevice(), |
||||
this->mBatchSize); |
||||
|
||||
#else |
||||
#pragma omp parallel for |
||||
for (uint32_t i = 0; i < this->mBatchSize; ++i) { |
||||
(*this)[i].computeLU(); |
||||
} |
||||
#endif |
||||
mInitialized = true; |
||||
return stream; |
||||
}; |
||||
|
||||
/**
|
||||
* Solves the batched system \f$LUx = b\f$ inplace. The solution \f$x\f$ is written back into |
||||
* \f$b\f$. |
||||
*/ |
||||
StreamID solve(const Batch<T>& b, const StreamID& stream = DEF_CUBLAS_STREAM) { |
||||
CT_ERROR(not mInitialized, |
||||
"Cannot solve system if PLUBatch has not yet computed its LU decomposition"); |
||||
CT_ERROR_IF(b.size(), !=, this->mBatchSize, |
||||
"Upper dimensions of b do not match batch size"); |
||||
CT_ERROR_IF(b.shape().rows(), !=, this->mShape.rows(), |
||||
"The length of each column of b must match the matrix rank"); |
||||
|
||||
#ifdef CUDA |
||||
uint32_t n = b.shape().rows(); |
||||
uint32_t nrhs = b.shape().cols(); |
||||
CUBLAS_CHECK( |
||||
cublasSetStream(Manager::get()->cublasHandle(), Manager::get()->stream(stream.id))); |
||||
invoke<T>(cublasSgetrsBatched, cublasDgetrsBatched, Manager::get()->cublasHandle(), |
||||
CUBLAS_OP_N, n, nrhs, this->mBatch.dataDevice(), n, mPivotsBatch.dataDevice(), |
||||
b.batch().dataDevice(), n, &mInfoSolve, this->mBatchSize); |
||||
|
||||
#else |
||||
#pragma omp parallel for |
||||
for (uint32_t i = 0; i < this->mBatchSize; ++i) { |
||||
b[i] = (*this)[i].solve(b[i]); |
||||
} |
||||
#endif |
||||
return stream; |
||||
}; |
||||
|
||||
/**
|
||||
* Gets the pivots data from the device to the host. Does nothing for CPU. |
||||
*/ |
||||
StreamID getPivots(const StreamID& stream = DEF_MEM_STREAM) const { |
||||
mPivotsBatch.updateHost(stream); |
||||
return stream; |
||||
}; |
||||
|
||||
/**
|
||||
* Gets the info array for the LU decomposition for the device to the host. Does not |
||||
* return useful information for CPU. |
||||
*/ |
||||
Array<int32_t> getLUInfo() const { |
||||
mInfoLU.updateHost().wait(); |
||||
return mInfoLU; |
||||
}; |
||||
|
||||
/**
|
||||
* Checks validity of the solve operation. Does not return useful information for CPU. |
||||
*/ |
||||
int32_t validSolve() const { return mInfoSolve == 0; } |
||||
}; |
||||
|
||||
// /**
|
||||
// * Gets the inverse of each A[i], using an already PLU factorized A[i].
|
||||
// * Only available if compiling with CUDA.
|
||||
// */
|
||||
// template <typename T>
|
||||
// void inverseBatch(const Array<T*>& batchA, const Array<T*>& batchC, const Array<int>&
|
||||
// pivots,
|
||||
// const Array<int>& info, const Shape shapeA, const Shape shapeC,
|
||||
// const uint stream = 0) {
|
||||
// #ifdef CUDA
|
||||
// CT_ERROR_IF(shapeA.rows(), !=, shapeA.cols(),
|
||||
// "'A' needs to be square, rows() and column need to match.");
|
||||
// CT_ERROR_IF(shapeA.rows(), !=, shapeC.cols(), "'A' needs to be the same shape as
|
||||
// 'C'."); CT_ERROR_IF(shapeA.rows(), !=, shapeC.rows(), "'A' needs to be the same shape
|
||||
// as 'C'.");
|
||||
|
||||
// CT_ERROR_IF(shapeA.rows(), !=, pivots.shape().rows(),
|
||||
// "Rows()/columns of 'A' and rows() of pivots need to match.");
|
||||
// CT_ERROR_IF(batchA.shape().rows(), !=, pivots.shape().cols(),
|
||||
// "Batch size and columns of pivots need to match.");
|
||||
// CT_ERROR_IF(info.shape().cols(), !=, 1, "Info needs to be a column vector.")
|
||||
// CT_ERROR_IF(batchA.shape().rows(), !=, info.shape().rows(),
|
||||
// "Batch size and length of info need to match.");
|
||||
// CT_ERROR_IF(batchA.shape().rows(), !=, batchC.shape().rows(),
|
||||
// "Batches 'A[i]' and 'C[i]' need to match.");
|
||||
|
||||
// std::string s = "cublas" + std::to_string(stream);
|
||||
// CUBLAS_CHECK(
|
||||
// cublasSetStream(Manager::get()->cublasHandle(),
|
||||
// Manager::get()->stream(s)));
|
||||
// invoke<T>(cublasSgetriBatched, cublasDgetriBatched,
|
||||
// Manager::get()->cublasHandle(),
|
||||
// shapeA.rows(), batchA.dataDevice(), shapeA.rows(), pivots.dataDevice(),
|
||||
// batchC.dataDevice(), shapeC.rows(), info.dataDevice(),
|
||||
// batchA.shape().rows());
|
||||
// #else
|
||||
// CT_ERROR_IF(true, ==, true, "inverseBatch is not callable without CUDA.");
|
||||
// #endif
|
||||
// }
|
||||
|
||||
}; // namespace BLAS
|
||||
}; // namespace CudaTools
|
||||
|
||||
#endif |
@ -0,0 +1,544 @@ |
||||
#ifndef CUDATOOLS_H |
||||
#define CUDATOOLS_H |
||||
|
||||
#include "Macros.h" |
||||
#include <iostream> |
||||
#include <string> |
||||
#include <unordered_map> |
||||
#include <vector> |
||||
|
||||
namespace CudaTools { |
||||
|
||||
/**
 * Simple wrapper for the name of a stream. Its purpose is to allow for
 * 'streams' to be passed on host code, and allowing for simple syntax
 * for waiting.
 */
struct StreamID {
  public:
    std::string id; // Name of the stream, as registered with the Manager.
    StreamID() : id(""){};
    /**
     * The constructor for a StreamID.
     */
    StreamID(const std::string& id_) : id(id_){};
    // Implicit conversion from string literals is intentional, so functions
    // taking a StreamID can be called as f("myStream").
    StreamID(const char* id_) : id(id_){};

    /**
     * Waits for the stream with this stream ID.
     */
    void wait() const;
};
||||
|
||||
static const StreamID DEF_MEM_STREAM = StreamID{"defaultMemory"}; |
||||
static const StreamID DEF_CUBLAS_STREAM = StreamID{"defaultCublas"}; |
||||
static const StreamID DEF_KERNEL_STREAM = StreamID{"defaultKernel"}; |
||||
|
||||
/**
|
||||
* Allocates memory on the device. |
||||
*/ |
||||
void* malloc(const size_t size); |
||||
|
||||
/**
|
||||
* Pins memory on the host. |
||||
*/ |
||||
void pin(void* const pHost, const size_t size); |
||||
|
||||
/**
|
||||
* Pushes memory from the device to the host. |
||||
*/ |
||||
StreamID push(void* const pHost, void* const pDevice, const size_t size, |
||||
const StreamID& stream = DEF_MEM_STREAM); |
||||
/**
|
||||
* Pulls memory from the device back to the host. |
||||
*/ |
||||
StreamID pull(void* const pHost, void* const pDevice, const size_t size, |
||||
const StreamID& stream = DEF_MEM_STREAM); |
||||
/**
|
||||
* Copies memory on the device to another location on the device. |
||||
*/ |
||||
StreamID deviceCopy(void* const pSrc, void* const pDest, const size_t size, |
||||
const StreamID& stream = DEF_MEM_STREAM); |
||||
|
||||
/**
|
||||
* Frees memory on the device. |
||||
*/ |
||||
void free(void* const pDevice); |
||||
|
||||
#ifdef CUDACC |
||||
cudaDeviceProp getDeviceProp(); |
||||
static cudaDeviceProp DeviceProperties = getDeviceProp(); |
||||
const char* cublasGetErrorString(cublasStatus_t status); |
||||
#endif |
||||
|
||||
/**
 * A class that manages various CUDA Runtime components, such as
 * streams, events, and handles. Access the global instance via Manager::get().
 */
class Manager {
  private:
    static Manager mManagerInstance; // Global singleton, defined in the implementation section.
    Manager(const std::vector<std::string>& names);
    ~Manager();
#ifdef CUDACC
    std::unordered_map<std::string, cudaStream_t> mStreams; // Streams addressed by name.
    cublasHandle_t mCublas;                                 // Shared cuBLAS handle.
#endif
  public:
    /**
     * Used to get the global CudaTools::Manager instance.
     */
    static Manager* get() { return &mManagerInstance; };

    void waitFor(const StreamID& stream) const; /**< Waits for the stream provided. */
    void sync() const;                       /**< Waits until all device code has finished. */
    void addStream(const std::string& name); /**< Creates a stream with the given name. */
#ifdef CUDACC
    cudaStream_t stream(const StreamID& stream) const;
    cublasHandle_t cublasHandle() const;
#endif
};
||||
|
||||
namespace Kernel { |
||||
|
||||
/**
 * A struct that contains the kernel launch parameters.
 */
struct Settings {
  public:
#ifdef CUDACC
    dim3 blockGrid;               // Grid dimensions (number of blocks).
    dim3 threadBlock;             // Thread block dimensions.
    size_t sharedMemoryBytes = 0; // Static shared memory per block.
#else
    size_t threads; // Thread count used by the CPU fallback.
#endif
    StreamID stream; // Stream the kernel is launched on.

    Settings() = default;

    void setGridDim(const size_t x);                 /**< Sets the Grid dimensions. */
    void setGridDim(const size_t x, const size_t y); /**< Sets the Grid dimensions. */
    void setGridDim(const size_t x, const size_t y,
                    const size_t z); /**< Sets the Grid dimensions. */
    void setBlockDim(const size_t x);                 /**< Sets the Thread Block dimensions. */
    void setBlockDim(const size_t x, const size_t y); /**< Sets the Thread Block dimensions. */
    void setBlockDim(const size_t x, const size_t y,
                     const size_t z); /**< Sets the Thread Block dimensions. */

    void setSharedMemSize(const size_t bytes); /**< Sets the static shared memory size. */
    void setStream(const StreamID& stream);    /**< Sets the stream. */
};
||||
|
||||
/**
|
||||
* Returns a kernel launch parameters based on the number of threads, and optionally |
||||
* a stream. Should only be used for 'embarassingly parallel' situations, or where |
||||
* each thread corresponds some sort of index. |
||||
*/ |
||||
Settings basic(const size_t threads, const StreamID& stream = DEF_KERNEL_STREAM); |
||||
|
||||
}; // namespace Kernel
|
||||
|
||||
template <typename T> class Array; |
||||
|
||||
/**
 * A class that holds information about an Array.
 */
class Shape {
  private:
    template <typename T> friend class Array;
    uint32_t mAxes;  // Number of axes.
    uint32_t mItems; // Total number of items across all axes.
    uint32_t mAxisDim[CUDATOOLS_ARRAY_MAX_AXES] = {0}; // Dimension of each axis.
    uint32_t mStride[CUDATOOLS_ARRAY_MAX_AXES] = {0};  // Item stride of each axis.

  public:
    HD Shape() : mAxes(0), mItems(1){};
    /**
     * The constructor for a Shape.
     * \param dims an initializer list of the dimensions.
     */
    HD Shape(const std::initializer_list<uint32_t> dims);

    HD uint32_t axes() const;  /**< Gets the number of axes. */
    HD uint32_t items() const; /**< Gets the total number of items. */

    HD uint32_t length() const; /**< For 1D shapes, gets the length. In general, gets the dimension
                                   of the last axis. */
    HD uint32_t rows() const;   /**< For 2D shapes, gets the number of rows. In general, gets the
                                   dimension of the second to last axis. */
    HD uint32_t cols() const;   /**< For 2D shapes, gets the number of columns. In general, gets the
                                   dimension of the last axis. */

    HD uint32_t
    dim(const uint32_t axis) const; /**< Gets the dimension size of the specified axis. */
    HD uint32_t stride(const uint32_t axis) const; /**< Gets the stride of the specified axis. */

    /**
     * Gets the shape at a specific axis of this shape.
     * \param axis the axis of where the new shape starts.
     */
    HD Shape subshape(const uint32_t axis) const;

    HD bool operator==(const Shape& s) const; /**< Equals operator. */
    HD bool operator!=(const Shape& s) const; /**< Not equals operator. */
};
||||
|
||||
std::ostream& operator<<(std::ostream& out, const Shape& s); |
||||
|
||||
}; // namespace CudaTools
|
||||
|
||||
#ifdef CUDATOOLS_IMPLEMENTATION |
||||
|
||||
namespace CudaTools { |
||||
|
||||
// Launches 'func' with the given launch Settings: a real <<<...>>> launch on the
// configured stream under CUDA, a plain synchronous call otherwise. Returns the
// stream the kernel was launched on so callers can wait on it.
template <typename T, typename... Args>
StreamID runKernel(T func, const Kernel::Settings& sett, Args... args) {
#ifdef CUDA
    func<<<sett.blockGrid, sett.threadBlock, sett.sharedMemoryBytes,
           Manager::get()->stream(sett.stream.id)>>>(args...);
#else
    func(args...);
#endif
    return sett.stream;
}
||||
|
||||
////////////////////
|
||||
// Memory Methods //
|
||||
////////////////////
|
||||
|
||||
// Blocks the calling thread until every operation queued on this stream is done.
void StreamID::wait() const {
    Manager::get()->waitFor(id);
}
||||
|
||||
// Allocates 'size' bytes on the device and returns the device pointer.
// Returns nullptr in CPU-only builds.
void* malloc(const size_t size) {
#ifdef CUDACC
    void* pDevice;
    CUDA_CHECK(cudaMalloc(&pDevice, size));
    return pDevice;
#else
    return nullptr;
#endif
}
||||
|
||||
// Frees device memory; safe to call with nullptr. No-op in CPU-only builds.
void free(void* const pDevice) {
#ifdef CUDACC
    if (pDevice != nullptr) CUDA_CHECK(cudaFree(pDevice));
#endif
}
||||
|
||||
// Asynchronously copies 'size' bytes from host to device on the given stream.
// No-op in CPU-only builds. Returns the stream for chaining/waiting.
StreamID push(void* const pHost, void* const pDevice, const size_t size, const StreamID& stream) {
#ifdef CUDACC
    CUDA_CHECK(cudaMemcpyAsync(pDevice, pHost, size, cudaMemcpyHostToDevice,
                               Manager::get()->stream(stream.id)));
#endif
    return stream;
}
||||
|
||||
// Asynchronously copies 'size' bytes from device back to host on the given stream.
// No-op in CPU-only builds. Returns the stream for chaining/waiting.
StreamID pull(void* const pHost, void* const pDevice, const size_t size, const StreamID& stream) {
#ifdef CUDACC
    CUDA_CHECK(cudaMemcpyAsync(pHost, pDevice, size, cudaMemcpyDeviceToHost,
                               Manager::get()->stream(stream.id)));
#endif
    return stream;
}
||||
|
||||
// Asynchronously copies 'size' bytes between two device locations on the given
// stream. No-op in CPU-only builds. Returns the stream for chaining/waiting.
StreamID deviceCopy(void* const pSrc, void* const pDest, const size_t size,
                    const StreamID& stream) {
#ifdef CUDACC
    CUDA_CHECK(cudaMemcpyAsync(pDest, pSrc, size, cudaMemcpyDeviceToDevice,
                               Manager::get()->stream(stream.id)));
#endif
    return stream;
}
||||
|
||||
// Page-locks (pins) existing host memory so async transfers can use it directly.
// No-op in CPU-only builds.
void pin(void* const pHost, const size_t size) {
#ifdef CUDACC
    CUDA_CHECK(cudaHostRegister(pHost, size, cudaHostRegisterDefault));
#endif
}
||||
|
||||
#ifdef CUDACC
// Queries the properties of device 0; used to initialize DeviceProperties at
// static-init time for the launch-parameter bounds checks.
cudaDeviceProp getDeviceProp() {
    // Fix: return codes were previously ignored; check them like every other
    // CUDA call in this file so a broken device setup fails loudly.
    CUDA_CHECK(cudaSetDevice(0));
    cudaDeviceProp deviceProp;
    CUDA_CHECK(cudaGetDeviceProperties(&deviceProp, 0));
    return deviceProp;
}
#endif
||||
|
||||
/////////////////////
|
||||
// Manager Methods //
|
||||
/////////////////////
|
||||
|
||||
// Creates the named streams and the shared cuBLAS handle.
// Does nothing in CPU-only builds.
Manager::Manager(const std::vector<std::string>& names) {
#ifdef CUDACC
    for (auto name : names) {
        addStream(name);
    }
    CUBLAS_CHECK(cublasCreate(&mCublas));
#endif
}
||||
|
||||
// Destroys every created stream and the cuBLAS handle.
// Does nothing in CPU-only builds.
Manager::~Manager() {
#ifdef CUDACC
    for (auto& it : mStreams) {
        CUDA_CHECK(cudaStreamDestroy(it.second));
    }
    CUBLAS_CHECK(cublasDestroy(mCublas));
#endif
}
||||
|
||||
// Blocks the host until all work queued on the named stream has completed.
// Errors if the name is unknown. No-op in CPU-only builds.
void Manager::waitFor(const StreamID& stream) const {
#ifdef CUDACC
    auto it = mStreams.find(stream.id);
    if (it != mStreams.end()) {
        CUDA_CHECK(cudaStreamSynchronize(it->second));
    } else {
        CT_ERROR(true, ("Invalid stream " + stream.id).c_str());
    }
#endif
}
||||
|
||||
// Blocks the host until all device work on every stream has finished.
// No-op in CPU-only builds.
void Manager::sync() const {
#ifdef CUDACC
    CUDA_CHECK(cudaDeviceSynchronize());
#endif
}
||||
|
||||
// Creates a new CUDA stream and registers it under the given name.
// NOTE(review): re-using an existing name overwrites the old cudaStream_t
// without destroying it — confirm duplicate names cannot occur.
void Manager::addStream(const std::string& name) {
#ifdef CUDACC
    cudaStream_t s;
    CUDA_CHECK(cudaStreamCreate(&s));
    mStreams[name] = s;
#endif
}
||||
|
||||
#ifdef CUDACC
// Looks up the raw cudaStream_t for a stream name; errors on unknown names.
// NOTE(review): if CT_ERROR does not abort or throw, the error path falls off
// the end of a non-void function — confirm CT_ERROR's behavior.
cudaStream_t Manager::stream(const StreamID& stream) const {
    auto it = mStreams.find(stream.id);
    if (it != mStreams.end()) {
        return it->second;
    } else {
        CT_ERROR(true, ("Invalid stream " + stream.id).c_str());
    }
}

cublasHandle_t Manager::cublasHandle() const { return mCublas; };

// The global Manager instance, created with the default streams.
Manager Manager::mManagerInstance = Manager({"defaultMemory", "defaultCublas", "defaultKernel"});
#else
Manager Manager::mManagerInstance = Manager({""});
#endif
||||
|
||||
////////////////////
|
||||
// Kernel Methods //
|
||||
////////////////////
|
||||
|
||||
namespace Kernel { |
||||
|
||||
// Sets a 1D grid, validating against the device limit. No-op in CPU-only builds.
void Settings::setGridDim(const size_t x) {
#ifdef CUDACC
    // Fix: this statement was missing its terminating semicolon (every other
    // CT_ERROR_IF call site in this file has one).
    CT_ERROR_IF(x, >, DeviceProperties.maxGridSize[0], "Total grid size too large");
    blockGrid.x = x;
    blockGrid.y = 1;
    blockGrid.z = 1;
#endif
}
||||
|
||||
// Sets a 2D grid, validating each dimension against device limits.
// NOTE(review): the total (x * y) is compared against maxGridSize[0] — confirm
// this is the intended bound. No-op in CPU-only builds.
void Settings::setGridDim(const size_t x, const size_t y) {
#ifdef CUDACC
    CT_ERROR_IF(x * y, >, DeviceProperties.maxGridSize[0], "Total grid size too large.");
    CT_ERROR_IF(x, >, DeviceProperties.maxGridSize[0], "Grid dimension 'x' too large.");
    CT_ERROR_IF(y, >, DeviceProperties.maxGridSize[1], "Grid dimension 'y' too large.");
    blockGrid.x = x;
    blockGrid.y = y;
    blockGrid.z = 1;
#endif
}

// Sets a 3D grid, validating each dimension against device limits.
// NOTE(review): the total (x * y * z) is compared against maxGridSize[0] —
// confirm this is the intended bound. No-op in CPU-only builds.
void Settings::setGridDim(const size_t x, const size_t y, const size_t z) {
#ifdef CUDACC
    CT_ERROR_IF(x * y * z, >, DeviceProperties.maxGridSize[0], "Total grid size too large.");
    CT_ERROR_IF(x, >, DeviceProperties.maxGridSize[0], "Grid dimension 'x' too large.");
    CT_ERROR_IF(y, >, DeviceProperties.maxGridSize[1], "Grid dimension 'y' too large.");
    CT_ERROR_IF(z, >, DeviceProperties.maxGridSize[2], "Grid dimension 'z' too large.");
    blockGrid.x = x;
    blockGrid.y = y;
    blockGrid.z = z;
#endif
}
||||
|
||||
// Sets a 1D thread block, validating against the device limit.
// No-op in CPU-only builds.
void Settings::setBlockDim(const size_t x) {
#ifdef CUDACC
    CT_ERROR_IF(x, >, DeviceProperties.maxThreadsDim[0], "Total block size too large.");
    threadBlock.x = x;
    threadBlock.y = 1;
    threadBlock.z = 1;
#endif
}

// Sets a 2D thread block, validating each dimension against device limits.
// NOTE(review): the total (x * y) is compared against maxThreadsDim[0];
// maxThreadsPerBlock may be the intended bound — confirm. No-op in CPU-only builds.
void Settings::setBlockDim(const size_t x, const size_t y) {
#ifdef CUDACC
    CT_ERROR_IF(x * y, >, DeviceProperties.maxThreadsDim[0], "Total block size too large.");
    CT_ERROR_IF(x, >, DeviceProperties.maxThreadsDim[0], "Block dimension 'x' too large.");
    CT_ERROR_IF(y, >, DeviceProperties.maxThreadsDim[1], "Block dimension 'y' too large.");
    threadBlock.x = x;
    threadBlock.y = y;
    threadBlock.z = 1;
#endif
}

// Sets a 3D thread block, validating each dimension against device limits.
// NOTE(review): the total (x * y * z) is compared against maxThreadsDim[0];
// maxThreadsPerBlock may be the intended bound — confirm. No-op in CPU-only builds.
void Settings::setBlockDim(const size_t x, const size_t y, const size_t z) {
#ifdef CUDACC
    CT_ERROR_IF(x * y * z, >, DeviceProperties.maxThreadsDim[0], "Total block size too large.");
    CT_ERROR_IF(x, >, DeviceProperties.maxThreadsDim[0], "Block dimension 'x' too large.");
    CT_ERROR_IF(y, >, DeviceProperties.maxThreadsDim[1], "Block dimension 'y' too large.");
    CT_ERROR_IF(z, >, DeviceProperties.maxThreadsDim[2], "Block dimension 'z' too large.");
    threadBlock.x = x;
    threadBlock.y = y;
    threadBlock.z = z;
#endif
}
||||
|
||||
// Sets the static shared memory size for the launch. No-op in CPU-only builds.
void Settings::setSharedMemSize(const size_t bytes) {
#ifdef CUDACC
    sharedMemoryBytes = bytes;
#endif
}
||||
|
||||
// Associates these launch settings with the given stream.
// \param stream_ the stream identifier to launch on.
// On CPU-only builds this is a no-op.
void Settings::setStream(const StreamID& stream_) {
#ifdef CUDACC
    stream.id = stream_.id;
#endif
}
||||
|
||||
// Builds launch settings for an embarrassingly-parallel kernel: splits the
// requested number of threads across as few blocks as the device allows,
// distributing them evenly.
// \param threads total number of threads the kernel should run with.
// \param stream the stream to associate with the launch.
// \return a Settings object ready to pass to KERNEL().
Settings basic(const size_t threads, const StreamID& stream) {
    Settings settings;
#ifdef CUDACC
    const auto perBlockLimit = DeviceProperties.maxThreadsPerBlock;
    // ceil(threads / perBlockLimit): minimum number of blocks required.
    const size_t numBlocks = (threads + perBlockLimit - 1) / perBlockLimit;
    // ceil(threads / numBlocks): threads per block, spread evenly over the blocks.
    const size_t perBlock = (threads + numBlocks - 1) / numBlocks;
    settings.setGridDim(numBlocks);
    settings.setBlockDim(perBlock);
    settings.setStream(stream);
#else
    settings.threads = threads;
#endif
    return settings;
}
||||
} // namespace Kernel
|
||||
|
||||
/////////////////////
|
||||
// Shape Functions //
|
||||
/////////////////////
|
||||
|
||||
// Constructs a Shape from a list of axis dimensions, e.g. Shape({2, 3, 4}).
// Strides are computed right-to-left as the running product of trailing
// dimensions; for 2+ axes the last two strides are then swapped so matrices
// are stored column-major.
HD Shape::Shape(const std::initializer_list<uint32_t> dims) : mAxes(dims.size()), mItems(1) {
    CT_ERROR_IF(dims.size(), >, CUDATOOLS_ARRAY_MAX_AXES, "Number of axes exceeds max axes");
    // NOTE(review): the next two assignments duplicate the member-initializer list.
    mAxes = dims.size();
    if (mAxes == 0) return;

    auto it = dims.end() - 1;
    mItems = 1;
    // Counts down from the last axis. Termination relies on unsigned wraparound:
    // decrementing iAxis past 0 wraps to a huge value, failing iAxis < mAxes.
    for (uint32_t iAxis = mAxes - 1; iAxis < mAxes; --iAxis) {
        uint32_t dim = *it;
        CT_ERROR_IF(dim, ==, 0, "Axis dimension cannot be 0");

        mAxisDim[iAxis] = dim;
        mStride[iAxis] = mItems; // stride = product of all dimensions after this axis
        mItems *= dim;
        --it;
    }

    if (mAxes == 1) return;
    // Swap last two, for column major storage.
    mStride[mAxes - 2] = 1;
    mStride[mAxes - 1] = mAxisDim[mAxes - 2];
}
||||
|
||||
// Returns the number of axes (dimensions) of this Shape.
HD uint32_t Shape::axes() const { return mAxes; };
// Returns the total number of items (product of all axis dimensions).
HD uint32_t Shape::items() const { return mItems; };
// Returns the dimension of the last axis.
// NOTE(review): for a 0-axis Shape, mAxes - 1 wraps around (unsigned);
// callers presumably guarantee at least one axis — TODO confirm.
HD uint32_t Shape::length() const { return mAxisDim[mAxes - 1]; }

// Returns the dimension of the second-to-last axis (matrix rows).
// NOTE(review): assumes at least 2 axes; mAxes - 2 wraps otherwise — TODO confirm.
HD uint32_t Shape::rows() const { return mAxisDim[mAxes - 2]; }

// Returns the dimension of the last axis (matrix columns).
HD uint32_t Shape::cols() const { return mAxisDim[mAxes - 1]; }

// Returns the dimension of the given axis. No bounds checking is performed.
HD uint32_t Shape::dim(const uint32_t axis) const { return mAxisDim[axis]; }
// Returns the item stride of the given axis. No bounds checking is performed.
HD uint32_t Shape::stride(const uint32_t axis) const { return mStride[axis]; }
||||
|
||||
// Two Shapes compare equal when they have the same number of axes and
// identical dimensions along every axis (strides are not compared).
HD bool Shape::operator==(const Shape& s) const {
    if (mAxes != s.mAxes) return false;
    uint32_t iAxis = 0;
    while (iAxis < mAxes) {
        if (mAxisDim[iAxis] != s.mAxisDim[iAxis]) return false;
        ++iAxis;
    }
    return true;
}

// Inequality is the negation of equality.
HD bool Shape::operator!=(const Shape& s) const { return not(*this == s); }
||||
|
||||
// Returns the Shape formed by dropping the leading `axis` axes, keeping the
// remaining axes' dimensions and their original strides.
// \param axis number of leading axes to drop; axis == mAxes yields Shape({1}).
HD Shape Shape::subshape(const uint32_t axis) const {
    CT_ERROR_IF(axis, >, mAxes, "Axis number exceeds number of axes.");
    if (axis == mAxes) return Shape({1});

    Shape new_shape({});
    new_shape.mAxes = mAxes - axis;
    new_shape.mItems = mItems;

    // Divide out the dropped leading dimensions to get the remaining item count.
    for (uint32_t iAxis = 0; iAxis < axis; iAxis++) {
        new_shape.mItems /= mAxisDim[iAxis];
    }
    // Copy the kept dimensions and strides, shifted to the front.
    for (uint32_t iAxis = axis; iAxis < mAxes; iAxis++) {
        new_shape.mAxisDim[iAxis - axis] = mAxisDim[iAxis];
        new_shape.mStride[iAxis - axis] = mStride[iAxis];
    }
    return new_shape;
}
||||
|
||||
std::ostream& operator<<(std::ostream& out, const Shape& s) { |
||||
out << "("; |
||||
if (s.axes() == 0) return out << ")"; |
||||
for (uint32_t iAxis = 0; iAxis < s.axes() - 1; ++iAxis) { |
||||
out << s.dim(iAxis) << ", "; |
||||
} |
||||
return out << s.dim(s.axes() - 1) << ")"; |
||||
} |
||||
|
||||
#ifdef CUDACC
// Translates a cublasStatus_t error code into its human-readable name,
// mirroring cudaGetErrorString for the cuBLAS API. Unrecognized codes
// map to "<unknown>".
const char* cublasGetErrorString(cublasStatus_t error) {
    switch (error) {
    case CUBLAS_STATUS_SUCCESS:
        return "CUBLAS_STATUS_SUCCESS";
    case CUBLAS_STATUS_NOT_INITIALIZED:
        return "CUBLAS_STATUS_NOT_INITIALIZED";
    case CUBLAS_STATUS_ALLOC_FAILED:
        return "CUBLAS_STATUS_ALLOC_FAILED";
    case CUBLAS_STATUS_INVALID_VALUE:
        return "CUBLAS_STATUS_INVALID_VALUE";
    case CUBLAS_STATUS_ARCH_MISMATCH:
        return "CUBLAS_STATUS_ARCH_MISMATCH";
    case CUBLAS_STATUS_MAPPING_ERROR:
        return "CUBLAS_STATUS_MAPPING_ERROR";
    case CUBLAS_STATUS_EXECUTION_FAILED:
        return "CUBLAS_STATUS_EXECUTION_FAILED";
    case CUBLAS_STATUS_INTERNAL_ERROR:
        return "CUBLAS_STATUS_INTERNAL_ERROR";
    // Previously missing: these documented statuses fell through to "<unknown>".
    case CUBLAS_STATUS_NOT_SUPPORTED:
        return "CUBLAS_STATUS_NOT_SUPPORTED";
    case CUBLAS_STATUS_LICENSE_ERROR:
        return "CUBLAS_STATUS_LICENSE_ERROR";
    }

    return "<unknown>";
}
#endif
||||
|
||||
}; // namespace CudaTools
|
||||
#endif // CUDATOOLS_IMPLEMENTATION
|
||||
|
||||
#endif // CUDATOOLS_H
|
@ -0,0 +1,297 @@ |
||||
#ifndef MACROS_H |
||||
#define MACROS_H |
||||
|
||||
#include <exception> |
||||
#include <sstream> |
||||
#include <stdarg.h> |
||||
|
||||
#if defined(CUDA) && defined(__CUDACC__) |
||||
#define CUDACC |
||||
#endif |
||||
|
||||
#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ > 0) |
||||
#define DEVICE |
||||
#endif |
||||
|
||||
#ifdef CUDATOOLS_DOXYGEN |
||||
/**
|
||||
* \def CUDACC |
||||
* This macro is defined when this code is being compiled by nvcc and the CUDA compilation |
||||
* flag is set. This should be used to enclose code where CUDA specific libraries and syntax are |
||||
* being used. |
||||
*/ |
||||
#define CUDACC |
||||
|
||||
/**
|
||||
* \def DEVICE |
||||
* This macro is defined when this code is being compiled for the device. The difference between |
||||
 * this and CUDACC is that this should exclusively be used to decide if code is being compiled 
 * to execute on the device. CUDACC only determines what compiler is being used. 
||||
*/ |
||||
#define DEVICE |
||||
|
||||
/**
|
||||
* \def HD |
||||
* Mark a function in front with this if it needs to be callable on both the |
||||
* CPU and CUDA device. |
||||
*/ |
||||
#define HD |
||||
|
||||
/**
|
||||
* \def SHARED |
||||
* Mark a variable as static shared memory. |
||||
*/ |
||||
#define SHARED |
||||
|
||||
/**
|
||||
* \def DECLARE_KERNEL(call, ...) |
||||
* Used to declare (in header) a CUDA kernel. |
||||
* \param call the name of the kernel |
||||
* \param ... the arguments of the kernel |
||||
*/ |
||||
#define DECLARE_KERNEL(call, ...) |
||||
|
||||
/**
|
||||
* \def DEFINE_KERNEL(call, ...) |
||||
* Used to define (in implementation) a CUDA kernel. |
||||
* \param call the name of the kernel |
||||
* \param ... the arguments of the kernel |
||||
*/ |
||||
#define DEFINE_KERNEL(call, ...) |
||||
|
||||
/**
|
||||
* \def KERNEL(call, settings, ...) |
||||
* Used to call a CUDA kernel. |
||||
* \param call the name of the kernel |
||||
* \param settings the associated CudaTools::Kernel::Settings to initialize the kernel with |
||||
* \param ... the arguments of the kernel |
||||
*/ |
||||
#define KERNEL(call, settings, ...) |
||||
|
||||
/**
|
||||
* \def BASIC_LOOP(N) |
||||
 * Can be used in conjunction with CudaTools::Kernel::Basic, which is mainly used for embarrassingly 
||||
* parallel situations. Exposes the loop/thread number as iThread. |
||||
* \param N number of iterations |
||||
*/ |
||||
#define BASIC_LOOP(N) |
||||
|
||||
/**
|
||||
* \def DEVICE_CLASS(name) |
||||
* Can be used inside a class declaration (header) which generates boilerplate code to allow this |
||||
* class to be used on the device. |
||||
* |
||||
* This macro creates a few functions:\n |
||||
* name* that(): returns the pointer to this instance on the device. |
||||
* |
||||
* void allocateDevice(): allocates the memory on the device for this class instance. |
||||
* |
||||
* CudaTools::StreamID updateHost(const CudaTools::StreamID& stream): updates the host instance |
||||
* of the class. |
||||
* |
||||
* CudaTools::StreamID updateDevice(const CudaTools::StreamID& stream): updates |
||||
* the device instance of the class. |
||||
* \param name the name of the class |
||||
*/ |
||||
#define DEVICE_CLASS(name) |
||||
|
||||
/**
|
||||
* \def CT_ERROR_IF(a, op, b, msg) |
||||
* Used for throwing runtime errors given a condition with an operator. |
||||
*/ |
||||
#define CT_ERROR_IF(a, op, b, msg) |
||||
|
||||
/**
|
||||
* \def CT_ERROR(a, msg) |
||||
* Used for throwing runtime errors given a bool. |
||||
*/ |
||||
#define CT_ERROR(a, msg) |
||||
|
||||
/**
|
||||
* \def CUDA_CHECK(call) |
||||
* Gets the error generated by a CUDA function call if there is one. |
||||
* \param call CUDA function to check if there are errors when running. |
||||
*/ |
||||
#define CUDA_CHECK(call) |
||||
|
||||
/**
|
||||
* \def CUBLAS_CHECK(call) |
||||
* Gets the error generated by a cuBLAS function call if there is one. |
||||
* \param call cuBLAS function to check if there are errors when running. |
||||
*/ |
||||
#define CUBLAS_CHECK(call) |
||||
|
||||
/**
|
||||
* \def CUDA_MEM(call) |
||||
* Gets the GPU memory used from function call if there is one. |
||||
* \param call function to measure memory usage. |
||||
* \param name an identifier to use as a variable and when printing. Must satisfy variable naming. |
||||
*/ |
||||
#define CUDA_MEM(call, name) |
||||
#endif |
||||
|
||||
///////////////////
|
||||
// KERNEL MACROS //
|
||||
///////////////////
|
||||
|
||||
#ifdef CUDACC |
||||
|
||||
#include <cublas_v2.h> |
||||
#include <cuda_runtime.h> |
||||
|
||||
#define HD __host__ __device__ |
||||
#define SHARED __shared__ |
||||
|
||||
#define DECLARE_KERNEL(call, ...) __global__ void call(__VA_ARGS__) |
||||
|
||||
#define DEFINE_KERNEL(call, ...) \ |
||||
template CudaTools::StreamID CudaTools::runKernel( \
|
||||
void (*)(__VA_ARGS__), const CudaTools::Kernel::Settings&, __VA_ARGS__); \
|
||||
__global__ void call(__VA_ARGS__) |
||||
|
||||
#else |
||||
#define HD |
||||
#define SHARED |
||||
|
||||
#define DECLARE_KERNEL(call, ...) void call(__VA_ARGS__) |
||||
|
||||
#define DEFINE_KERNEL(call, ...) \ |
||||
template CudaTools::StreamID CudaTools::runKernel( \
|
||||
void (*)(__VA_ARGS__), const CudaTools::Kernel::Settings&, __VA_ARGS__); \
|
||||
void call(__VA_ARGS__) |
||||
|
||||
#endif // CUDACC
|
||||
|
||||
#define KERNEL(call, settings, ...) CudaTools::runKernel(call, settings, __VA_ARGS__) |
||||
|
||||
///////////////////
|
||||
// DEVICE MACROS //
|
||||
///////////////////
|
||||
|
||||
#ifdef DEVICE |
||||
|
||||
#define BASIC_LOOP(N) \ |
||||
uint32_t iThread = blockIdx.x * blockDim.x + threadIdx.x; \
|
||||
if (iThread < N) |
||||
#else |
||||
#define BASIC_LOOP(N) _Pragma("omp parallel for") for (uint32_t iThread = 0; iThread < N; ++iThread) |
||||
|
||||
#endif |
||||
|
||||
//////////////////
|
||||
// CLASS MACROS //
|
||||
//////////////////
|
||||
|
||||
#define UPDATE_FUNC(name) \ |
||||
inline CudaTools::StreamID updateHost(const CudaTools::StreamID& stream = \
|
||||
CudaTools::DEF_MEM_STREAM) { \
|
||||
return CudaTools::pull(this, that(), sizeof(name)); \
|
||||
}; \
|
||||
inline CudaTools::StreamID updateDevice(const CudaTools::StreamID& stream = \
|
||||
CudaTools::DEF_MEM_STREAM) { \
|
||||
return CudaTools::push(this, that(), sizeof(name)); \
|
||||
} |
||||
|
||||
#ifdef CUDA |
||||
|
||||
#define DEVICE_CLASS(name) \ |
||||
private: \
|
||||
name* __deviceInstance__ = nullptr; \
|
||||
\
|
||||
public: \
|
||||
inline name* that() { return __deviceInstance__; } \
|
||||
inline void allocateDevice() { __deviceInstance__ = (name*)CudaTools::malloc(sizeof(name)); }; \
|
||||
UPDATE_FUNC(name) |
||||
|
||||
#else |
||||
#define DEVICE_CLASS(name) \ |
||||
public: \
|
||||
inline name* that() { return this; }; \
|
||||
inline void allocateDevice(){}; \
|
||||
UPDATE_FUNC(name) |
||||
|
||||
#endif |
||||
|
||||
#ifndef CUDATOOLS_ARRAY_MAX_AXES |
||||
/**
|
||||
* \def CUDATOOLS_ARRAY_MAX_AXES |
||||
 * The maximum number of axes/dimensions a CudaTools::Array can have. The default is 
 * set to 4, but can be manually set to fit the program's needs. 
||||
*/ |
||||
#define CUDATOOLS_ARRAY_MAX_AXES 4 |
||||
#endif |
||||
|
||||
////////////////////
|
||||
// Error Checking //
|
||||
////////////////////
|
||||
|
||||
#ifndef NO_DIMENSION_CHECK |
||||
#ifdef DEVICE |
||||
#define CT_ERROR_IF(a, op, b, msg) \ |
||||
if (a op b) { \
|
||||
printf("[ERROR] %s:%d\n | %s: (" #a ") " #op " (" #b ").\n", __FILE__, __LINE__, msg); \
|
||||
} |
||||
|
||||
#define CT_ERROR(a, msg) \ |
||||
if (a) { \
|
||||
printf("[ERROR] %s:%d\n | %s: " #a ".\n", __FILE__, __LINE__, msg); \
|
||||
} |
||||
#else |
||||
|
||||
#define CT_ERROR_IF(a, op, b, msg) \ |
||||
if (a op b) { \
|
||||
std::ostringstream os_a; \
|
||||
std::ostringstream os_b; \
|
||||
os_a << a; \
|
||||
os_b << b; \
|
||||
printf("[ERROR] %s:%d\n | %s: (" #a ")%s " #op " (" #b ")%s.\n", __FILE__, __LINE__, msg, \
|
||||
os_a.str().c_str(), os_b.str().c_str()); \
|
||||
throw std::exception(); \
|
||||
} |
||||
|
||||
#define CT_ERROR(a, msg) \ |
||||
if (a) { \
|
||||
printf("[ERROR] %s:%d\n | %s: " #a ".\n", __FILE__, __LINE__, msg); \
|
||||
throw std::exception(); \
|
||||
} |
||||
#endif |
||||
|
||||
#endif // NO_DIMENSION_CHECK
|
||||
|
||||
#if defined(CUDACC) && !defined(NO_CUDA_CHECK) |
||||
|
||||
#define CUDA_CHECK(call) \ |
||||
do { \
|
||||
cudaError_t err = (call); \
|
||||
if (err != cudaSuccess) { \
|
||||
printf("[CUDA] %s:%d\n | %s\n", __FILE__, __LINE__, cudaGetErrorString(err)); \
|
||||
throw std::exception(); \
|
||||
} \
|
||||
} while (0) |
||||
|
||||
#define CUBLAS_CHECK(call) \ |
||||
do { \
|
||||
cublasStatus_t err = (call); \
|
||||
if (err != CUBLAS_STATUS_SUCCESS) { \
|
||||
printf("[cuBLAS] %s:%d\n | %s\n", __FILE__, __LINE__, \
|
||||
CudaTools::cublasGetErrorString(err)); \
|
||||
throw std::exception(); \
|
||||
} \
|
||||
} while (0) |
||||
|
||||
#define CUDA_MEM(call, name) \ |
||||
size_t free_bef_##name, free_aft_##name; \
|
||||
cudaMemGetInfo(&free_bef_##name, NULL); \
|
||||
call; \
|
||||
CudaTools::Manager::get()->sync(); \
|
||||
cudaMemGetInfo(&free_aft_##name, NULL); \
|
||||
printf("[%s] GPU Memory Usage: %iMiB\n", #name, \
|
||||
(free_bef_##name - free_aft_##name) / (1024 * 1024)); |
||||
|
||||
#else |
||||
#define CUDA_CHECK(call) (call) |
||||
#define CUBLAS_CHECK(call) (call) |
||||
#define CUDA_MEM(call, name) (call) |
||||
#endif |
||||
|
||||
#endif // MACROS_H
|
@ -0,0 +1,95 @@ |
||||
CC := g++-10
|
||||
NVCC := nvcc
|
||||
CFLAGS := -Wall -std=c++17 -fopenmp -MMD
|
||||
NVCC_FLAGS := -MMD -w -Xcompiler
|
||||
|
||||
INCLUDE :=
|
||||
LIBS_DIR :=
|
||||
LIBS_DIR_GPU := /usr/local/cuda/lib64
|
||||
LIBS :=
|
||||
LIBS_GPU := cuda cudart cublas
|
||||
|
||||
TARGET = tests
|
||||
SRC_DIR = .
|
||||
BUILD_DIR = build
|
||||
|
||||
# Should not need to modify below.
|
||||
|
||||
CPU_BUILD_DIR = $(BUILD_DIR)/cpu
|
||||
GPU_BUILD_DIR = $(BUILD_DIR)/gpu
|
||||
|
||||
SRC = $(wildcard $(SRC_DIR)/*/*.cpp) $(wildcard $(SRC_DIR)/*.cpp)
|
||||
|
||||
# Get source files and object files.
|
||||
GCC_SRC = $(filter-out %.cu.cpp ,$(SRC))
|
||||
NVCC_SRC = $(filter %.cu.cpp, $(SRC))
|
||||
GCC_OBJ = $(GCC_SRC:$(SRC_DIR)/%.cpp=%.o)
|
||||
NVCC_OBJ = $(NVCC_SRC:$(SRC_DIR)/%.cpp=%.o)
|
||||
|
||||
# If compiling for CPU, all go to GCC. Otherwise, they are split.
|
||||
CPU_OBJ = $(addprefix $(CPU_BUILD_DIR)/,$(GCC_OBJ)) $(addprefix $(CPU_BUILD_DIR)/,$(NVCC_OBJ))
|
||||
GPU_GCC_OBJ = $(addprefix $(GPU_BUILD_DIR)/,$(GCC_OBJ))
|
||||
GPU_NVCC_OBJ = $(addprefix $(GPU_BUILD_DIR)/,$(NVCC_OBJ))
|
||||
|
||||
# $(info $$GCC_SRC is [${GCC_SRC}])
|
||||
# $(info $$NVCC_SRC is [${NVCC_SRC}])
|
||||
# $(info $$GCC_OBJ is [${GCC_OBJ}])
|
||||
# $(info $$NVCC_OBJ is [${NVCC_OBJ}])
|
||||
|
||||
# $(info $$CPU_OBJ is [${CPU_OBJ}])
|
||||
# $(info $$GPU_GCC_OBJ is [${GPU_GCC_OBJ}])
|
||||
# $(info $$GPU_NVCC_OBJ is [${GPU_NVCC_OBJ}])
|
||||
|
||||
HEADER = $(wildcard $(SRC_DIR)/*/*.h) $(wildcard $(SRC_DIR)/*.h)
|
||||
CPU_DEPS = $(wildcard $(CPU_BUILD_DIR)/*.d)
|
||||
GPU_DEPS = $(wildcard $(GPU_BUILD_DIR)/*.d)
|
||||
|
||||
INC := $(INCLUDE:%=-I%)
|
||||
LIB := $(LIBS_DIR:%=-L%)
|
||||
LIB_GPU := $(LIBS_DIR_GPU:%=-L%)
|
||||
LD := $(LIBS:%=-l%)
|
||||
LD_GPU := $(LIBS_GPU:%=-l%)
|
||||
|
||||
# Reminder:
|
||||
# $< = first prerequisite
|
||||
# $@ = the target which matched the rule
|
||||
# $^ = all prerequisites
|
||||
|
||||
.PHONY: all clean |
||||
|
||||
all : cpu gpu |
||||
|
||||
cpu: $(TARGET)CPU |
||||
gpu: $(TARGET)GPU |
||||
|
||||
$(TARGET)CPU: $(CPU_OBJ) |
||||
$(CC) $(CFLAGS) $^ -o $@ $(INC) $(LIB) $(LDFLAGS)
|
||||
|
||||
$(CPU_BUILD_DIR)/%.o $(CPU_BUILD_DIR)/%.cu.o: $(SRC_DIR)/%.cpp | $(CPU_BUILD_DIR) |
||||
$(CC) $(CFLAGS) -c -o $@ $< $(INC)
|
||||
|
||||
# For GPU, we need to build the NVCC objects, the NVCC linked object, and the
|
||||
# regular ones. Then, we link them all together.
|
||||
$(TARGET)GPU: $(GPU_BUILD_DIR)/link.o $(GPU_GCC_OBJ) | $(GPU_BUILD_DIR) |
||||
$(CC) -g -DCUDA $(CFLAGS) $(GPU_NVCC_OBJ) $^ -o $@ $(INC) $(LIB) $(LIB_GPU) $(LD) $(LD_GPU)
|
||||
|
||||
$(GPU_BUILD_DIR)/link.o: $(GPU_NVCC_OBJ) | $(GPU_BUILD_DIR) |
||||
$(NVCC) --device-link $^ -o $@
|
||||
|
||||
$(GPU_BUILD_DIR)/%.cu.o: $(SRC_DIR)/%.cu.cpp | $(GPU_BUILD_DIR) |
||||
$(NVCC) $(NVCC_FLAGS) -DCUDA -x cu --device-c -o $@ $< $(INC)
|
||||
|
||||
$(GPU_BUILD_DIR)/%.o: $(SRC_DIR)/%.cpp | $(GPU_BUILD_DIR) |
||||
$(CC) $(CFLAGS) -g -DCUDA -c -o $@ $< $(INC)
|
||||
|
||||
-include $(CPU_DEPS) |
||||
-include $(GPU_DEPS) |
||||
|
||||
$(CPU_BUILD_DIR): |
||||
mkdir -p $@
|
||||
|
||||
$(GPU_BUILD_DIR): |
||||
mkdir -p $@
|
||||
|
||||
clean: |
||||
rm -Rf $(BUILD_DIR) $(TARGET)CPU $(TARGET)GPU
|
@ -0,0 +1,40 @@ |
||||
========= |
||||
CudaTools |
||||
========= |
||||
This is the documentation for CudaTools, a header-only library and framework |
||||
for the development of CPU-CUDA compatible applications. Using CudaTools enables |
||||
the creation of a single unified code that has both CPU and CUDA compilation targets with minimal need to |
||||
introduce ``#ifdef`` statements when code is essentially identical between the targets. |
||||
|
||||
For information on the library itself and its usage, view `documentation <https://acem.ece.illinois.edu/code/CudaTools>`__. The small code snippets and samples |
||||
seen in the documentation are in the folder ``samples``. |
||||
|
||||
Dependencies |
||||
============ |
||||
- Eigen |
||||
|
||||
In the future, we will make this dependency optional, but still provide support |
||||
for it. As of now, it is necessary. |
||||
|
||||
Building the Documentation |
||||
========================== |
||||
The documentation is built with `Doxygen <https://doxygen.nl/>`__ and `Sphinx <https://www.sphinx-doc.org/en>`__. |
||||
So, first make sure you have Doxygen installed on your system, and make sure it is added |
||||
to your system path. Then, you will have to create a Python virtual environment |
||||
in the repository folder |
||||
|
||||
.. code-block:: bash |
||||
|
||||
$ python3 -m venv .venv |
||||
|
||||
After installing the required Python packages |
||||
|
||||
.. code-block:: bash |
||||
|
||||
$ pip install -r requirements |
||||
|
||||
you can now run the script |
||||
|
||||
.. code-block:: bash |
||||
|
||||
$ ./build_docs |
@ -0,0 +1,2 @@ |
||||
doxygen docs/Doxyfile |
||||
sphinx-build -b html docs/source docs/build/html |
File diff suppressed because it is too large
Load Diff
@ -0,0 +1,20 @@ |
||||
# Minimal makefile for Sphinx documentation
|
||||
#
|
||||
|
||||
# You can set these variables from the command line, and also
|
||||
# from the environment for the first two.
|
||||
SPHINXOPTS ?=
|
||||
SPHINXBUILD ?= sphinx-build
|
||||
SOURCEDIR = source
|
||||
BUILDDIR = build
|
||||
|
||||
# Put it first so that "make" without argument is like "make help".
|
||||
help: |
||||
@$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
|
||||
|
||||
.PHONY: help Makefile |
||||
|
||||
# Catch-all target: route all unknown targets to Sphinx using the new
|
||||
# "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS).
|
||||
%: Makefile |
||||
@$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
|
@ -0,0 +1,35 @@ |
||||
@ECHO OFF |
||||
|
||||
pushd %~dp0 |
||||
|
||||
REM Command file for Sphinx documentation |
||||
|
||||
if "%SPHINXBUILD%" == "" ( |
||||
set SPHINXBUILD=sphinx-build |
||||
) |
||||
set SOURCEDIR=source |
||||
set BUILDDIR=build |
||||
|
||||
if "%1" == "" goto help |
||||
|
||||
%SPHINXBUILD% >NUL 2>NUL |
||||
if errorlevel 9009 ( |
||||
echo. |
||||
echo.The 'sphinx-build' command was not found. Make sure you have Sphinx |
||||
echo.installed, then set the SPHINXBUILD environment variable to point |
||||
echo.to the full path of the 'sphinx-build' executable. Alternatively you |
||||
echo.may add the Sphinx directory to PATH. |
||||
echo. |
||||
echo.If you don't have Sphinx installed, grab it from |
||||
echo.http://sphinx-doc.org/ |
||||
exit /b 1 |
||||
) |
||||
|
||||
%SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% |
||||
goto end |
||||
|
||||
:help |
||||
%SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% |
||||
|
||||
:end |
||||
popd |
@ -0,0 +1,39 @@ |
||||
sections = { |
||||
"mesh_prep": 1, |
||||
"matrix_assembly": 2, |
||||
"bc_calc": 3, |
||||
"timestep": 4, |
||||
} |
||||
|
||||
|
||||
window.MathJax = { |
||||
loader: {load: ['[tex]/tagformat', '[tex]/ams']}, |
||||
tex: { |
||||
packages: {'[+]': ['tagformat', 'ams']}, |
||||
macros: { |
||||
dd: "{\\, \\mathrm{d}}", |
||||
E: "{\\mathbf{E}}", |
||||
H: "{\\mathbf{H}}", |
||||
J: "{\\mathbf{J}}", |
||||
D: "{\\mathbf{D}}", |
||||
B: "{\\mathbf{B}}", |
||||
M: "{\\mathbf{M}}", |
||||
tbE: "{\\tilde{\\E}}", |
||||
tbH: "{\\tilde{\\H}}", |
||||
tE: "{\\tilde{E}}", |
||||
tH: "{\\tilde{H}}", |
||||
tphi: "{\\tilde{\\phi}}", |
||||
curl: ["{\\nabla \\times {#1}}", 1], |
||||
div: ["{\\nabla \\cdot {#1}}", 1], |
||||
tens: ["{\\bar{\\bar{{#1}}}}", 1], |
||||
}, |
||||
tags: 'ams', |
||||
tagformat: { |
||||
number: (n) => sections[window.location.pathname.split("/").pop().split(".")[0]] + '.' + n, |
||||
}, |
||||
ams: { |
||||
multilineWidth: '100%', |
||||
multilineIndent: '50em' |
||||
} |
||||
}, |
||||
} |
@ -0,0 +1,26 @@ |
||||
======= |
||||
Array.h |
||||
======= |
||||
|
||||
The ``Array.h`` header file contains the Array class, and its related classes. For this |
||||
file only, assume that every function is callable on both host and device unless 
||||
explicitly mentioned otherwise. |
||||
|
||||
CudaTools::Shape |
||||
---------------- |
||||
.. doxygenclass:: CudaTools::Shape |
||||
:members: |
||||
:allow-dot-graphs: |
||||
|
||||
CudaTools::ArrayIterator<T> |
||||
--------------------------- |
||||
.. doxygenclass:: CudaTools::ArrayIterator |
||||
:members: |
||||
:allow-dot-graphs: |
||||
|
||||
CudaTools::Array<T> |
||||
------------------- |
||||
.. doxygenclass:: CudaTools::Array |
||||
:members: |
||||
:private-members: |
||||
:allow-dot-graphs: |
@ -0,0 +1,45 @@ |
||||
====== |
||||
BLAS.h |
||||
====== |
||||
|
||||
The ``BLAS.h`` header file contains some BLAS functions, and some related |
||||
classes for those functions. |
||||
|
||||
BLAS Functions |
||||
============== |
||||
Currently, these are the supported BLAS functions. They are inherited mainly |
||||
from the cuBLAS API, and condensed into unified functions. The plan is to 
||||
add them as necessary. |
||||
|
||||
CudaTools::BLAS::GEMV<T> |
||||
------------------------ |
||||
.. doxygenfunction:: CudaTools::BLAS::GEMV |
||||
|
||||
CudaTools::BLAS::GEMM<T> |
||||
------------------------ |
||||
.. doxygenfunction:: CudaTools::BLAS::GEMM |
||||
|
||||
CudaTools::BLAS::DGMM<T> |
||||
------------------------ |
||||
.. doxygenfunction:: CudaTools::BLAS::DGMM |
||||
|
||||
BLAS Classes |
||||
============ |
||||
|
||||
These classes also inherit functions from the cuBLAS API, but are packaged |
||||
into classes that are more intuitive and hide external details. |
||||
|
||||
CudaTools::BLAS::Batch<T> |
||||
------------------------- |
||||
.. doxygenclass:: CudaTools::BLAS::Batch |
||||
:members: |
||||
|
||||
CudaTools::BLAS::PLUArray<T> |
||||
---------------------------- |
||||
.. doxygenclass:: CudaTools::BLAS::PLUArray |
||||
:members: |
||||
|
||||
CudaTools::BLAS::PLUBatch<T> |
||||
---------------------------- |
||||
.. doxygenclass:: CudaTools::BLAS::PLUBatch |
||||
:members: |
@ -0,0 +1,53 @@ |
||||
# Configuration file for the Sphinx documentation builder. |
||||
|
||||
# -- Project information |
||||
|
||||
project = 'DGEMS' |
||||
copyright = '2022' |
||||
author = 'Kenneth Jao, Qi Jian Lim' |
||||
|
||||
release = '0.1' |
||||
version = '0.1.0' |
||||
|
||||
# -- General configuration |
||||
|
||||
html_static_path = ["_static"] |
||||
html_js_files = ["js/mathjax-config.js"] |
||||
|
||||
extensions = [ |
||||
'sphinx.ext.duration', |
||||
'sphinx.ext.doctest', |
||||
'sphinx.ext.autodoc', |
||||
'sphinx.ext.autosummary', |
||||
'sphinx.ext.autosectionlabel', |
||||
'sphinx.ext.intersphinx', |
||||
'sphinx.ext.mathjax', |
||||
'sphinx.ext.graphviz', |
||||
'sphinxcontrib.bibtex', |
||||
'breathe', |
||||
] |
||||
|
||||
breathe_projects = {"DGEMS": "../build/xml"} |
||||
breathe_default_project = "DGEMS" |
||||
|
||||
bibtex_bibfiles = ['refs.bib'] |
||||
|
||||
mathjax_path = "https://cdn.jsdelivr.net/npm/mathjax@3/es5/tex-mml-chtml.js" |
||||
|
||||
intersphinx_mapping = { |
||||
'python': ('https://docs.python.org/3/', None), |
||||
'sphinx': ('https://www.sphinx-doc.org/en/master/', None), |
||||
} |
||||
intersphinx_disabled_domains = ['std'] |
||||
|
||||
templates_path = ['_templates'] |
||||
|
||||
# -- Options for HTML output |
||||
|
||||
html_theme = 'sphinx_rtd_theme' |
||||
html_theme_options = { |
||||
'collapse_navigation': False, |
||||
} |
||||
|
||||
# -- Options for EPUB output |
||||
epub_show_urls = 'footnote' |
@ -0,0 +1,67 @@ |
||||
====== |
||||
Core.h |
||||
====== |
||||
|
||||
The ``Core.h`` header file defines several compiler flags and macros along with |
||||
a few core classes. |
||||
|
||||
Flags |
||||
===== |
||||
|
||||
Device Indicators |
||||
----------------- |
||||
.. doxygendefine:: CUDACC |
||||
.. doxygendefine:: DEVICE |
||||
|
||||
Host-Device Automation |
||||
---------------------- |
||||
.. doxygendefine:: HD |
||||
.. doxygendefine:: SHARED |
||||
|
||||
Compilation Options |
||||
------------------- |
||||
.. doxygendefine:: CUDATOOLS_ARRAY_MAX_AXES |
||||
|
||||
Macros |
||||
====== |
||||
|
||||
Kernel |
||||
------ |
||||
.. doxygendefine:: DECLARE_KERNEL |
||||
.. doxygendefine:: DEFINE_KERNEL |
||||
.. doxygendefine:: KERNEL |
||||
|
||||
Device Helpers |
||||
-------------- |
||||
|
||||
.. doxygendefine:: BASIC_LOOP |
||||
|
||||
Device Class |
||||
------------ |
||||
|
||||
.. doxygendefine:: DEVICE_CLASS |
||||
|
||||
|
||||
Classes and Structs |
||||
=================== |
||||
|
||||
CudaTools::StreamID |
||||
------------------- |
||||
|
||||
.. doxygenstruct:: CudaTools::StreamID |
||||
|
||||
CudaTools::Manager |
||||
------------------ |
||||
|
||||
.. doxygenclass:: CudaTools::Manager |
||||
:members: |
||||
|
||||
CudaTools::Kernel::Settings |
||||
--------------------------- |
||||
|
||||
.. doxygenstruct:: CudaTools::Kernel::Settings |
||||
:members: |
||||
|
||||
CudaTools::Kernel::Basic |
||||
------------------------ |
||||
.. doxygenfunction:: CudaTools::Kernel::basic |
@ -0,0 +1,25 @@ |
||||
========= |
||||
CudaTools |
||||
========= |
||||
This is the documentation for CudaTools, a header-only library and framework |
||||
for the development of CPU-CUDA compatible applications. Using CudaTools enables |
||||
the creation of a single unified code that has both CPU and CUDA compilation targets with minimal need to |
||||
introduce ``#ifdef`` statements when code is essentially identical between the targets. |
||||
|
||||
To get started, please head over to the :doc:`usage` section. For more detail on the |
||||
machinery underneath, please refer to the other sections. 
||||
|
||||
.. note:: |
||||
|
||||
If you would like to contribute, please visit the `git page <https://git.acem.ece.illinois.edu/kjao/CudaTools>`__. |
||||
|
||||
Contents |
||||
======== |
||||
|
||||
.. toctree:: |
||||
:maxdepth: 2 |
||||
|
||||
usage |
||||
core |
||||
array |
||||
blas |
@ -0,0 +1,128 @@ |
||||
================== |
||||
Usage and Examples |
||||
================== |
||||
|
||||
|
||||
This library is broken up into three main parts, as well as a certain |
||||
compilation and linking framework: |
||||
|
||||
#. :ref:`Core Examples` |
||||
#. :ref:`Array Examples` |
||||
#. :ref:`BLAS Examples` |
||||
#. :ref:`Compilation and Linking` |
||||
|
||||
The ``Core.h`` header contains the necessary macros, flags and objects for interfacing with |
||||
basic kernel launching and the CUDA Runtime API. The ``Array.h`` header contains the ``CudaTools::Array`` |
||||
class which provides a device compatible Array-like class with easy memory management. Lastly, |
||||
the ``BLAS.h`` header provides BLAS functions through the cuBLAS library for the GPU, 
||||
and Eigen for the CPU. Lastly, a templated Makefile is provided which can be used |
||||
for your own project, after following a few rules. |
||||
|
||||
The usage of this libary will be illustrated through examples, and further details |
||||
can be found in the other sections. The examples are given in the `samples <https://git.acem.ece.illinois.edu/kjao/CudaTools/src/branch/main/samples>`__ folder. |
||||
Throughout this documentation, there are a few common terms that may appear. First, we refer to the CPU as the host, and the GPU as the device. So, a host function refers 
||||
to a function runnable on the CPU, and a device function refers to a function that is runnable |
||||
on a device. A kernel is a specific function that the host can call to be run on the device. |
||||
|
||||
Core Examples |
||||
============= |
||||
This file mainly introduces compiler macros and a few classes that are used to improve the |
||||
syntax between host and device code. To define and call a kernel, there are a few |
||||
macros provided. For example, |
||||
|
||||
.. code-block:: cpp |
||||
|
||||
DEFINE_KERNEL(add, int x, int y) { |
||||
printf("Kernel: %i\n", x + y); |
||||
} |
||||
|
||||
int main() { |
||||
KERNEL(add, CudaTools::Kernel::basic(1), 1, 1); // Prints 2. |
||||
return 0; |
||||
} |
||||
|
||||
The ``DEFINE_KERNEL(name, ...)`` macro takes in the function name and its arguments. |
||||
The second argument in the ``KERNEL()`` macro are the launch parameters for the 
kernel. The launch parameters have several items, but for 'embarrassingly parallel' 
||||
cases, we can simply generate the settings with the number of threads. More detail with |
||||
creating launch parameters can be found :ref:`here <CudaTools::Kernel::Settings>`. In the above example, |
||||
there is only one thread. The rest of the arguments are just the kernel arguments. For more detail, |
||||
see :ref:`here <Macros>`. |
||||
|
||||
.. warning:: |
||||
These kernel definitions must be in a file that will be compiled by ``nvcc``. Also, |
||||
for header files, there is an additional macro ``DECLARE_KERNEL(name, ...)`` to declare it |
||||
and make it available to other files. |
||||
|
||||
Since many applications use classes, a macro is provided to 'convert' a class into 
||||
being device-compatible. Following the previous example similarly, |
||||
|
||||
.. code-block:: cpp |
||||
|
||||
class intPair { |
||||
DEVICE_CLASS(intPair) |
||||
public: |
||||
int x, y; |
||||
|
||||
intPair(const int x_, const int y_) : x(x_), y(y_) { |
||||
allocateDevice(); // Allocates memory for this intPair on the device. |
||||
updateDevice().wait(); // Copies the memory on the host to the device and waits until finished. |
||||
}; |
||||
|
||||
HD void swap() { |
||||
int swap = x; |
||||
x = y; |
||||
y = swap; |
||||
}; |
||||
}; |
||||
|
||||
DEFINE_KERNEL(swap, intPair* const pair) { pair->swap(); } |
||||
|
||||
int main() { |
||||
intPair pair(1, 2); |
||||
printf("Before: %u, %u\n", pair.x, pair.y); // Prints 1, 2. |
||||
|
||||
KERNEL(swap, CudaTools::Kernel::basic(1), pair.that()).wait(); |
||||
pair.updateHost().wait(); // Copies the memory from the device back to the host and waits until finished. |
||||
|
||||
printf("After: %u, %u\n", pair.x, pair.y); // Prints 2, 1. |
||||
return 0; |
||||
} |
||||
|
||||
In this example, we create a class called ``intPair``, which is then made available on the device through |
||||
the ``DEVICE_CLASS(name)`` macro. Specifically, that macro introduces a few functions, like |
||||
``allocateDevice()``, ``updateDevice()``, ``updateHost()``, and ``that()``. That last function |
||||
returns a pointer to the copy on the device. For more details, see :ref:`here <Device Class>`. If we were to pass in the host pointer of the ``intPair`` to the kernel, there would be an illegal memory access.
||||
|
||||
The kernel argument list **must** consist of pointers to objects, or non-reference objects.
||||
Otherwise, compilation will fail. In general this is safer, as it forces the programmer to |
||||
acknowledge that the device copy is being passed. For the latter case of a non-reference object, |
||||
you should only do this if there is no issue in creating a copy of the original object. In the above |
||||
example, we could have done this, but for more complicated classes it may result in unwanted behavior. |
||||
|
||||
Lastly, since the point of classes is usually to have some member functions, to have them |
||||
available on the device, you must mark them with the compiler macro ``HD`` in front. |
||||
|
||||
We also introduce the ``wait()`` function, which waits for the command to complete before |
||||
continuing. Most calls that involve the device are asynchronous, so without proper blocking, |
||||
operations dependent on a previous command are not guaranteed to run correctly. If the code is |
||||
compiled for CPU, then everything will run synchronously, as per usual. |
||||
|
||||
.. note:: |
||||
Almost all functions that are asynchronous provide an optional 'stream' argument, |
||||
where you can give the name of the stream you wish to use. Different streams run |
||||
asynchronously, but operations on the same stream are FIFO. To define a stream to use
||||
later, you must call ``CudaTools::Manager::get()->addStream("myStream")`` at some point |
||||
before you use it. For more details, see :ref:`here <CudaTools::Manager>`. |
||||
|
||||
|
||||
Array Examples |
||||
============== |
||||
|
||||
|
||||
BLAS Examples |
||||
============= |
||||
|
||||
|
||||
Compilation and Linking |
||||
======================= |
@ -0,0 +1,4 @@ |
||||
Sphinx>=5.1.1 |
||||
sphinx-rtd-theme>=1.0.0 |
||||
sphinxcontrib-bibtex>=2.5.0 |
||||
breathe>=4.34.0 |
@ -0,0 +1,95 @@ |
||||
# Build configuration for the 'coreKernel' example. Produces two binaries from
# the same sources: coreKernelCPU (everything through g++) and coreKernelGPU
# (*.cu.cpp device sources through nvcc, device-linked, then linked with the
# g++ objects and the CUDA libraries).
CC := g++-10
NVCC := nvcc
CFLAGS := -Wall -std=c++17 -fopenmp -MMD
NVCC_FLAGS := -MMD -w -Xcompiler

INCLUDE := ../../
LIBS_DIR :=
LIBS_DIR_GPU := /usr/local/cuda/lib64
LIBS :=
LIBS_GPU := cuda cudart cublas

TARGET = coreKernel
SRC_DIR = .
BUILD_DIR = build

# Should not need to modify below.

CPU_BUILD_DIR = $(BUILD_DIR)/cpu
GPU_BUILD_DIR = $(BUILD_DIR)/gpu

SRC = $(wildcard $(SRC_DIR)/*/*.cpp) $(wildcard $(SRC_DIR)/*.cpp)

# Get source files and object files.
# Device code lives in *.cu.cpp files; all other .cpp files go through g++.
GCC_SRC = $(filter-out %.cu.cpp ,$(SRC))
NVCC_SRC = $(filter %.cu.cpp, $(SRC))
GCC_OBJ = $(GCC_SRC:$(SRC_DIR)/%.cpp=%.o)
NVCC_OBJ = $(NVCC_SRC:$(SRC_DIR)/%.cpp=%.o)

# If compiling for CPU, all go to GCC. Otherwise, they are split.
CPU_OBJ = $(addprefix $(CPU_BUILD_DIR)/,$(GCC_OBJ)) $(addprefix $(CPU_BUILD_DIR)/,$(NVCC_OBJ))
GPU_GCC_OBJ = $(addprefix $(GPU_BUILD_DIR)/,$(GCC_OBJ))
GPU_NVCC_OBJ = $(addprefix $(GPU_BUILD_DIR)/,$(NVCC_OBJ))

# $(info $$GCC_SRC is [${GCC_SRC}])
# $(info $$NVCC_SRC is [${NVCC_SRC}])
# $(info $$GCC_OBJ is [${GCC_OBJ}])
# $(info $$NVCC_OBJ is [${NVCC_OBJ}])

# $(info $$CPU_OBJ is [${CPU_OBJ}])
# $(info $$GPU_GCC_OBJ is [${GPU_GCC_OBJ}])
# $(info $$GPU_NVCC_OBJ is [${GPU_NVCC_OBJ}])

HEADER = $(wildcard $(SRC_DIR)/*/*.h) $(wildcard $(SRC_DIR)/*.h)
CPU_DEPS = $(wildcard $(CPU_BUILD_DIR)/*.d)
GPU_DEPS = $(wildcard $(GPU_BUILD_DIR)/*.d)

INC := $(INCLUDE:%=-I%)
LIB := $(LIBS_DIR:%=-L%)
LIB_GPU := $(LIBS_DIR_GPU:%=-L%)
LD := $(LIBS:%=-l%)
LD_GPU := $(LIBS_GPU:%=-l%)

# Reminder:
# $< = first prerequisite
# $@ = the target which matched the rule
# $^ = all prerequisites

.PHONY: all clean

all : cpu gpu

cpu: $(TARGET)CPU
gpu: $(TARGET)GPU

$(TARGET)CPU: $(CPU_OBJ)
	$(CC) $(CFLAGS) $^ -o $@ $(INC) $(LIB) $(LDFLAGS)

$(CPU_BUILD_DIR)/%.o $(CPU_BUILD_DIR)/%.cu.o: $(SRC_DIR)/%.cpp | $(CPU_BUILD_DIR)
	$(CC) $(CFLAGS) -c -o $@ $< $(INC)

# For GPU, we need to build the NVCC objects, the NVCC linked object, and the
# regular ones. Then, we link them all together.
$(TARGET)GPU: $(GPU_BUILD_DIR)/link.o $(GPU_GCC_OBJ) | $(GPU_BUILD_DIR)
	$(CC) -g -DCUDA $(CFLAGS) $(GPU_NVCC_OBJ) $^ -o $@ $(INC) $(LIB) $(LIB_GPU) $(LD) $(LD_GPU)

$(GPU_BUILD_DIR)/link.o: $(GPU_NVCC_OBJ) | $(GPU_BUILD_DIR)
	$(NVCC) --device-link $^ -o $@

$(GPU_BUILD_DIR)/%.cu.o: $(SRC_DIR)/%.cu.cpp | $(GPU_BUILD_DIR)
	$(NVCC) $(NVCC_FLAGS) -DCUDA -x cu --device-c -o $@ $< $(INC)

$(GPU_BUILD_DIR)/%.o: $(SRC_DIR)/%.cpp | $(GPU_BUILD_DIR)
	$(CC) $(CFLAGS) -g -DCUDA -c -o $@ $< $(INC)

-include $(CPU_DEPS)
-include $(GPU_DEPS)

$(CPU_BUILD_DIR):
	mkdir -p $@

$(GPU_BUILD_DIR):
	mkdir -p $@

clean:
	rm -Rf $(BUILD_DIR) $(TARGET)CPU $(TARGET)GPU
|
@ -0,0 +1,12 @@ |
||||
#define CUDATOOLS_IMPLEMENTATION |
||||
#include <Core.h> |
||||
|
||||
// Kernel: adds two integers on the device and prints the sum.
DEFINE_KERNEL(add, int x, int y) {
    printf("Kernel: %i\n", x + y);
}
||||
|
||||
int main() {
    // Launch the 'add' kernel with a single thread and arguments (1, 1).
    KERNEL(add, CudaTools::Kernel::basic(1), 1, 1); // Prints 2.
    return 0;
}
||||
|
@ -0,0 +1,95 @@ |
||||
# Build configuration for the 'coreClass' example. Produces two binaries from
# the same sources: coreClassCPU (everything through g++) and coreClassGPU
# (*.cu.cpp device sources through nvcc, device-linked, then linked with the
# g++ objects and the CUDA libraries).
CC := g++-10
NVCC := nvcc
CFLAGS := -Wall -std=c++17 -fopenmp -MMD
NVCC_FLAGS := -MMD -w -Xcompiler

INCLUDE := ../../
LIBS_DIR :=
LIBS_DIR_GPU := /usr/local/cuda/lib64
LIBS :=
LIBS_GPU := cuda cudart cublas

TARGET = coreClass
SRC_DIR = .
BUILD_DIR = build

# Should not need to modify below.

CPU_BUILD_DIR = $(BUILD_DIR)/cpu
GPU_BUILD_DIR = $(BUILD_DIR)/gpu

SRC = $(wildcard $(SRC_DIR)/*/*.cpp) $(wildcard $(SRC_DIR)/*.cpp)

# Get source files and object files.
# Device code lives in *.cu.cpp files; all other .cpp files go through g++.
GCC_SRC = $(filter-out %.cu.cpp ,$(SRC))
NVCC_SRC = $(filter %.cu.cpp, $(SRC))
GCC_OBJ = $(GCC_SRC:$(SRC_DIR)/%.cpp=%.o)
NVCC_OBJ = $(NVCC_SRC:$(SRC_DIR)/%.cpp=%.o)

# If compiling for CPU, all go to GCC. Otherwise, they are split.
CPU_OBJ = $(addprefix $(CPU_BUILD_DIR)/,$(GCC_OBJ)) $(addprefix $(CPU_BUILD_DIR)/,$(NVCC_OBJ))
GPU_GCC_OBJ = $(addprefix $(GPU_BUILD_DIR)/,$(GCC_OBJ))
GPU_NVCC_OBJ = $(addprefix $(GPU_BUILD_DIR)/,$(NVCC_OBJ))

# $(info $$GCC_SRC is [${GCC_SRC}])
# $(info $$NVCC_SRC is [${NVCC_SRC}])
# $(info $$GCC_OBJ is [${GCC_OBJ}])
# $(info $$NVCC_OBJ is [${NVCC_OBJ}])

# $(info $$CPU_OBJ is [${CPU_OBJ}])
# $(info $$GPU_GCC_OBJ is [${GPU_GCC_OBJ}])
# $(info $$GPU_NVCC_OBJ is [${GPU_NVCC_OBJ}])

HEADER = $(wildcard $(SRC_DIR)/*/*.h) $(wildcard $(SRC_DIR)/*.h)
CPU_DEPS = $(wildcard $(CPU_BUILD_DIR)/*.d)
GPU_DEPS = $(wildcard $(GPU_BUILD_DIR)/*.d)

INC := $(INCLUDE:%=-I%)
LIB := $(LIBS_DIR:%=-L%)
LIB_GPU := $(LIBS_DIR_GPU:%=-L%)
LD := $(LIBS:%=-l%)
LD_GPU := $(LIBS_GPU:%=-l%)

# Reminder:
# $< = first prerequisite
# $@ = the target which matched the rule
# $^ = all prerequisites

.PHONY: all clean

all : cpu gpu

cpu: $(TARGET)CPU
gpu: $(TARGET)GPU

$(TARGET)CPU: $(CPU_OBJ)
	$(CC) $(CFLAGS) $^ -o $@ $(INC) $(LIB) $(LDFLAGS)

$(CPU_BUILD_DIR)/%.o $(CPU_BUILD_DIR)/%.cu.o: $(SRC_DIR)/%.cpp | $(CPU_BUILD_DIR)
	$(CC) $(CFLAGS) -c -o $@ $< $(INC)

# For GPU, we need to build the NVCC objects, the NVCC linked object, and the
# regular ones. Then, we link them all together.
$(TARGET)GPU: $(GPU_BUILD_DIR)/link.o $(GPU_GCC_OBJ) | $(GPU_BUILD_DIR)
	$(CC) -g -DCUDA $(CFLAGS) $(GPU_NVCC_OBJ) $^ -o $@ $(INC) $(LIB) $(LIB_GPU) $(LD) $(LD_GPU)

$(GPU_BUILD_DIR)/link.o: $(GPU_NVCC_OBJ) | $(GPU_BUILD_DIR)
	$(NVCC) --device-link $^ -o $@

$(GPU_BUILD_DIR)/%.cu.o: $(SRC_DIR)/%.cu.cpp | $(GPU_BUILD_DIR)
	$(NVCC) $(NVCC_FLAGS) -DCUDA -x cu --device-c -o $@ $< $(INC)

$(GPU_BUILD_DIR)/%.o: $(SRC_DIR)/%.cpp | $(GPU_BUILD_DIR)
	$(CC) $(CFLAGS) -g -DCUDA -c -o $@ $< $(INC)

-include $(CPU_DEPS)
-include $(GPU_DEPS)

$(CPU_BUILD_DIR):
	mkdir -p $@

$(GPU_BUILD_DIR):
	mkdir -p $@

clean:
	rm -Rf $(BUILD_DIR) $(TARGET)CPU $(TARGET)GPU
|
@ -0,0 +1,34 @@ |
||||
#define CUDATOOLS_IMPLEMENTATION |
||||
#include <Core.h> |
||||
|
||||
// Example device-compatible class: an integer pair whose device-side copy is
// allocated and synchronized at construction time via the DEVICE_CLASS macro.
class intPair {
    DEVICE_CLASS(intPair)
  public:
    int x, y;

    intPair(const int x_, const int y_) : x(x_), y(y_) {
        allocateDevice(); // Allocates memory for this intPair on the device.
        updateDevice().wait(); // Copies the memory on the host to the device and waits until finished.
    };

    // Exchanges x and y; HD marks it callable from both host and device code.
    HD void swap() {
        int swap = x;
        x = y;
        y = swap;
    };
};
||||
|
||||
// Kernel: swaps the members of the device-side copy of an intPair.
DEFINE_KERNEL(swap, intPair* const pair) { pair->swap(); }
||||
|
||||
int main() {
    intPair pair(1, 2);
    printf("Before: %u, %u\n", pair.x, pair.y); // Prints 1, 2.

    // Pass the device-side pointer (that()) to the kernel; wait() blocks
    // until the launch completes before reading results back.
    KERNEL(swap, CudaTools::Kernel::basic(1), pair.that()).wait();
    pair.updateHost().wait(); // Copies the memory from the device back to the host and waits until finished.

    printf("After: %u, %u\n", pair.x, pair.y); // Prints 2, 1.
    return 0;
}
||||
|
||||
|
@ -0,0 +1,494 @@ |
||||
#define CUDATOOLS_IMPLEMENTATION |
||||
#define CUDATOOLS_ARRAY_MAX_AXES 8 |
||||
#include "Array.h" |
||||
#include "BLAS.h" |
||||
#include "Core.h" |
||||
|
||||
#include <Eigen/Core> |
||||
#include <chrono> |
||||
#include <complex> |
||||
|
||||
namespace CT = CudaTools; |
||||
|
||||
/////////////
|
||||
// Helpers //
|
||||
/////////////
|
||||
|
||||
// Starts a steady_clock timer tagged with 'name'.
#define TIME_START(name) auto begin_##name = std::chrono::steady_clock::now()

// Stops the timer tagged 'name' and prints the elapsed time, falling back to
// microseconds when the duration rounds down to 0 ms.
#define TIME_END(name)                                                                             \
    auto end_##name = std::chrono::steady_clock::now();                                            \
    auto time_ms_##name =                                                                          \
        std::chrono::duration_cast<std::chrono::milliseconds>(end_##name - begin_##name).count();  \
    auto time_mus_##name =                                                                         \
        std::chrono::duration_cast<std::chrono::microseconds>(end_##name - begin_##name).count();  \
    if (time_ms_##name == 0) {                                                                     \
        printf("[%s] Time Elapsed: %ld[µs]\n", #name, time_mus_##name);                            \
    } else {                                                                                       \
        printf("[%s] Time Elapsed: %ld[ms]\n", #name, time_ms_##name);                             \
    }

// Times a single call/statement under the tag 'name'.
#define TIME(call, name)                                                                           \
    TIME_START(name);                                                                              \
    call;                                                                                          \
    TIME_END(name);

// Records one pass/fail check: increments the enclosing 'failed' counter on
// failure and prints a colored PASS/FAIL line with the test name and message.
// NOTE: relies on a variable named 'failed' being in scope at the call site.
#define TEST(predicate, name, msg)                                                                 \
    failed += (predicate) ? 0 : 1;                                                                 \
    printf("[%s] ", (predicate) ? "\033[1;32mPASS\033[0m" : "\033[1;31mFAIL\033[0m");              \
    printf("%s | %s.\n", name, msg);
||||
|
||||
// Maps a C++ type to a printable name; specialized via REGISTER_PARSE_TYPE.
template <typename T> struct Type;

// Declares and defines Type<X>::name as the literal spelling of X.
#define REGISTER_PARSE_TYPE(X)                                                                     \
    template <> struct Type<X> { static const std::string name; };                                 \
    const std::string Type<X>::name = #X

REGISTER_PARSE_TYPE(uint8_t);
REGISTER_PARSE_TYPE(int16_t);
REGISTER_PARSE_TYPE(int32_t);
REGISTER_PARSE_TYPE(float);
REGISTER_PARSE_TYPE(double);
||||
|
||||
// Wraps 'str' in a '#' banner: a full-width line of '#', the text framed by
// "## " / " ##", and a matching bottom line.
std::string box(std::string str) {
    const std::string border(str.size() + 6, '#');
    std::string banner = border;
    banner += "\n## ";
    banner += str;
    banner += " ##\n";
    banner += border;
    return banner;
}
||||
|
||||
// Wraps 'str' in a '-' banner framed by "|| " / " ||".
// NOTE(review): the border width is str.size() - 5, which presumably
// compensates for invisible ANSI color codes in the caller's string (the
// summary in main() carries 11 escape-sequence characters, so size() - 5
// equals visible width + 6) — confirm against the call site. Strings shorter
// than 5 characters would underflow the size_t length.
std::string box2(std::string str) {
    const std::string border(str.size() - 5, '-');
    std::string banner = border;
    banner += "\n|| ";
    banner += str;
    banner += " ||\n";
    banner += border;
    return banner;
}
||||
|
||||
// Wraps 'str' in a small fixed-width bracket banner: "------[ str ]------".
std::string boxSmall(std::string str) {
    static const char dashes[] = "------";
    std::string banner(dashes);
    banner += "[ ";
    banner += str;
    banner += " ]";
    banner += dashes;
    return banner;
}
||||
|
||||
// Returns a 40-character '=' horizontal rule surrounded by blank lines.
std::string separator() {
    return "\n" + std::string(40, '=') + "\n";
}
||||
|
||||
// Returns the registered name of T wrapped in bright-cyan ANSI color codes.
template <typename T> std::string type() { return "\033[1;96m" + Type<T>::name + "\033[0m"; }
||||
|
||||
// Builds a random 2D shape whose row and column counts are each drawn
// uniformly from [1, 15].
CT::Shape makeRandom2DShape() {
    std::random_device seeder;
    std::mt19937 engine(seeder());
    std::uniform_int_distribution<uint32_t> axisLength(1, 15);
    // Braced-init-list guarantees left-to-right evaluation: rows, then cols.
    return CT::Shape({axisLength(engine), axisLength(engine)});
}
||||
|
||||
///////////
|
||||
// Tests //
|
||||
///////////
|
||||
|
||||
// Minimal device-mirrored class used to exercise the DEVICE_CLASS machinery:
// construction allocates a device copy and pushes the host state to it.
class TestClass {
    DEVICE_CLASS(TestClass);

  public:
    int x; // Written on the device by the 'classTest' kernel.
    TestClass(const int x) : x(x) {
        allocateDevice();      // Reserve device-side storage for this object.
        updateDevice().wait(); // Copy host state to the device; block until done.
    };
};
||||
|
||||
// Kernel: doubles every element of 'arr' in place, one element per thread
// (BASIC_LOOP iterates over the array's flattened length).
DEFINE_KERNEL(times, const CT::Array<int> arr) {
    BASIC_LOOP(arr.shape().length()) { arr[iThread] *= 2; }
}
||||
|
||||
// Kernel: writes a sentinel value into the device copy of a TestClass.
DEFINE_KERNEL(classTest, TestClass* const test) { test->x = 100; }
||||
|
||||
// Tests for the kernel-launch (KERNEL) and device-class (DEVICE_CLASS) macros.
// Each test returns the number of failed checks (via the TEST macro).
struct MacroTests {
    // Launches the 'times' kernel over a 10-element array of ones and checks
    // that every element was doubled after copying back to the host.
    static uint32_t Kernel() {
        uint32_t failed = 0;
        CT::Array<int> A = CT::Array<int>::constant({10}, 1);
        A.updateDevice().wait();
        KERNEL(times, CT::Kernel::basic(A.shape().items()), A.view()).wait();
        A.updateHost().wait();

        uint32_t errors = 0;
        for (auto it = A.begin(); it != A.end(); ++it) {
            if (*it != 2) ++errors;
        }

        std::ostringstream msg;
        msg << "Errors: " << errors;
        TEST(errors == 0, "Kernel", msg.str().c_str());
        return failed;
    };

    // Runs the 'classTest' kernel on a device-mirrored TestClass and checks
    // that the device-side write is visible on the host after updateHost().
    static uint32_t Class() {
        uint32_t failed = 0;
        TestClass test(1);
        KERNEL(classTest, CT::Kernel::basic(1), test.that()).wait();
        test.updateHost().wait();

        TEST(test.x == 100, "Class", "Errors: 0");
        return failed;
    }
};
||||
|
||||
// Array indexing and slicing tests for element type T. Each test returns the
// number of failed checks (accumulated by the TEST macro into 'failed').
template <typename T> struct ArrayTests {
    // Checks that chained operator[] indexing and brace-list indexing agree,
    // both on the full array and on sub-arrays obtained by partial indexing.
    static uint32_t Indexing() {
        uint32_t failed = 0;
        // 240 = 5*3*1*4*2*1*1*2; 8 axes matches CUDATOOLS_ARRAY_MAX_AXES.
        CT::Array<T> A = CT::Array<T>::range(0, 240);
        A.reshape({5, 3, 1, 4, 2, 1, 1, 2});

        // Element access: A[i][j]... vs A[{i, j, ...}] over the non-trivial axes.
        uint32_t errors = 0;
        for (uint32_t i = 0; i < 5; ++i) {
            for (uint32_t j = 0; j < 3; ++j) {
                for (uint32_t k = 0; k < 4; ++k) {
                    for (uint32_t l = 0; l < 2; ++l) {
                        for (uint32_t m = 0; m < 2; ++m) {
                            if ((T)A[i][j][0][k][l][0][0][m] != (T)A[{i, j, 0, k, l, 0, 0, m}]) {
                                ++errors;
                            }
                        }
                    }
                }
            }
        }

        std::ostringstream msg;
        msg << "Errors: " << errors;
        TEST(errors == 0, "Element", msg.str().c_str());

        // Partial indexing by two axes: grouped vs chained forms must agree.
        errors = 0;
        CT::Array<T> ApartGroup_1 = A[{2, 2}];
        CT::Array<T> ApartIndiv_1 = A[2][2];
        for (uint32_t k = 0; k < 4; ++k) {
            for (uint32_t l = 0; l < 2; ++l) {
                for (uint32_t m = 0; m < 2; ++m) {
                    if ((T)ApartIndiv_1[0][k][l][0][0][m] != (T)ApartGroup_1[{0, k, l, 0, 0, m}]) {
                        ++errors;
                    }
                }
            }
        }

        msg.str("");
        msg << "Errors: " << errors;
        TEST(errors == 0, "Axis (1/2)", msg.str().c_str());

        // Partial indexing by four axes.
        errors = 0;
        CT::Array<T> ApartGroup_2 = A[{3, 2, 0, 3}];
        CT::Array<T> ApartIndiv_2 = A[3][2][0][3];

        for (uint32_t l = 0; l < 2; ++l) {
            for (uint32_t m = 0; m < 2; ++m) {
                if ((T)ApartIndiv_2[l][0][0][m] != (T)ApartGroup_2[{l, 0, 0, m}]) {
                    ++errors;
                }
            }
        }

        msg.str("");
        msg << "Errors: " << errors;
        TEST(errors == 0, "Axis (2/2)", msg.str().c_str());
        return failed;
    };

    // Checks that writes through slice views land at the expected positions
    // of the underlying array.
    static uint32_t Slicing() {
        uint32_t failed = 0;
        CT::Array<T> A = CT::Array<T>::constant({4, 5, 5}, 0);

        // Fill a 3x3x3 interior block with 1, 2, 3, ... via its iterator.
        CT::Array<T> Aslice = A.slice({{0, 3}, {1, 4}, {1, 4}});
        T num = (T)1;
        for (auto it = Aslice.begin(); it != Aslice.end(); ++it) {
            *it = num;
            ++num;
        }

        // Fill the first column of A[3] with -1, -2, ... via a second slice.
        CT::Array<T> Aslice2 = A[3].slice({{0, 5}, {0, 1}});
        num = (T)-1;
        for (auto it = Aslice2.begin(); it != Aslice2.end(); ++it) {
            *it = num;
            --num;
        }

        // The block values must appear in row-major order at offset (i, j+1, k+1).
        uint32_t errors = 0;
        for (int i = 0; i < 3; ++i) {
            for (int j = 0; j < 3; ++j) {
                for (int k = 0; k < 3; ++k) {
                    if ((T)A[i][1 + j][1 + k] != (T)(9 * i + 3 * j + k + 1)) {
                        ++errors;
                    }
                }
            }
        }
        std::ostringstream msg;
        msg << "Errors: " << errors;
        TEST(errors == 0, "Block", msg.str().c_str());

        // The column written through Aslice2 must read back as -(i + 1).
        errors = 0;
        for (int i = 0; i < 5; ++i) {
            if ((T)A[3][i][0] != (T)(-(i + 1))) {
                ++errors;
            }
        }

        msg.str("");
        msg << "Errors: " << errors;
        TEST(errors == 0, "Column", msg.str().c_str());
        return failed;
    }
};
||||
|
||||
template <typename T> struct BLASTests { |
||||
static double thres; |
||||
static uint32_t GEMV(int attempts) { |
||||
uint32_t failed = 0; |
||||
for (int i = 0; i < attempts; i++) { |
||||
CT::Shape Ashape = makeRandom2DShape(); |
||||
CT::Shape xshape = CT::Shape({Ashape.cols(), 1}); |
||||
CT::Shape yshape = CT::Shape({Ashape.rows(), 1}); |
||||
|
||||
CT::Array<T> A(Ashape); |
||||
CT::Array<T> x(xshape); |
||||
CT::Array<T> y(yshape); |
||||
|
||||
A.setRandom(-100, 100); |
||||
x.setRandom(-100, 100); |
||||
|
||||
A.updateDevice(); |
||||
x.updateDevice().wait(); |
||||
|
||||
CT::BLAS::GEMV<T>(1.0, A, x, 0.0, y).wait(); |
||||
y.updateHost().wait(); |
||||
|
||||
CT::Array<T> yTest(yshape, true); |
||||
yTest.eigenMap() = A.eigenMap() * x.eigenMap(); |
||||
|
||||
double norm = (y.eigenMap() - y.eigenMap()).norm(); |
||||
|
||||
std::ostringstream name; |
||||
name << "GEMV (" << i + 1 << "/" << attempts << ")"; |
||||
std::ostringstream msg; |
||||
msg << "Matrix Shape: " << Ashape << ", " |
||||
<< "Residual: " << norm; |
||||
TEST(norm < thres, name.str().c_str(), msg.str().c_str()); |
||||
} |
||||
return failed; |
||||
}; |
||||
|
||||
static uint32_t GEMVBroadcast() { |
||||
uint32_t failed = 0; |
||||
CT::Shape Ashape = makeRandom2DShape(); |
||||
CT::Shape xshape = CT::Shape({Ashape.cols(), 1}); |
||||
CT::Shape yshape = CT::Shape({Ashape.rows(), 1}); |
||||
|
||||
CT::Array<T> A({2, 3, Ashape.rows(), Ashape.cols()}); |
||||
CT::Array<T> x({2, 3, xshape.rows(), xshape.cols()}); |
||||
CT::Array<T> y({2, 3, yshape.rows(), yshape.cols()}); |
||||
|
||||
A.setRandom(-100, 100); |
||||
x.setRandom(-100, 100); |
||||
|
||||
A.updateDevice(); |
||||
x.updateDevice().wait(); |
||||
|
||||
CT::BLAS::GEMV<T>(1.0, A, x, 0.0, y).wait(); |
||||
y.updateHost().wait(); |
||||
|
||||
double norm = 0; |
||||
CT::Array<T> yTest(yshape, true); |
||||
for (int i = 0; i < 2; ++i) { |
||||
for (int j = 0; j < 3; ++j) { |
||||
yTest.eigenMap() = A[i][j].eigenMap() * x[i][j].eigenMap(); |
||||
norm += (yTest.eigenMap() - y[i][j].eigenMap()).norm(); |
||||
} |
||||
} |
||||
|
||||
std::ostringstream msg; |
||||
msg << "Matrix Shape: " << Ashape << ", " |
||||
<< "Residual: " << norm; |
||||
TEST(norm < thres, "GEMV Broadcast", msg.str().c_str()); |
||||
return failed; |
||||
}; |
||||
|
||||
static uint32_t GEMM(int attempts) { |
||||
uint32_t failed = 0; |
||||
for (int i = 0; i < attempts; i++) { |
||||
CT::Shape Ashape = makeRandom2DShape(); |
||||
CT::Shape Bshape = makeRandom2DShape(); |
||||
Bshape = CT::Shape({Ashape.cols(), Bshape.cols()}); |
||||
|
||||
CT::Shape Cshape = CT::Shape({Ashape.rows(), Bshape.cols()}); |
||||
|
||||
CT::Array<T> A(Ashape); |
||||
CT::Array<T> B(Bshape); |
||||
CT::Array<T> C(Cshape); |
||||
|
||||
A.setRandom(-100, 100); |
||||
B.setRandom(-100, 100); |
||||
C.setRandom(-100, 100); |
||||
|
||||
A.updateDevice(); |
||||
B.updateDevice(); |
||||
C.updateDevice().wait(); |
||||
|
||||
CT::BLAS::GEMM<T>(1.0, A, B, 0.0, C).wait(); |
||||
C.updateHost().wait(); |
||||
|
||||
CT::Array<T> CTest(Cshape, true); |
||||
CTest.eigenMap() = A.eigenMap() * B.eigenMap(); |
||||
|
||||
double norm = (CTest.eigenMap() - C.eigenMap()).norm(); |
||||
|
||||
std::ostringstream name; |
||||
name << "GEMM (" << i + 1 << "/" << attempts << ")"; |
||||
std::ostringstream msg; |
||||
msg << "Matrix Shapes: " << Ashape << Bshape << ", " |
||||
<< "Residual: " << norm; |
||||
TEST(norm < thres, name.str().c_str(), msg.str().c_str()); |
||||
} |
||||
return failed; |
||||
}; |
||||
|
||||
static uint32_t GEMMBroadcast() { |
||||
uint32_t failed = 0; |
||||
CT::Shape Ashape = makeRandom2DShape(); |
||||
CT::Shape Bshape = makeRandom2DShape(); |
||||
Bshape = CT::Shape({Ashape.cols(), Bshape.cols()}); |
||||
|
||||
CT::Shape Cshape = CT::Shape({Ashape.rows(), Bshape.cols()}); |
||||
|
||||
CT::Array<T> A({2, 3, Ashape.rows(), Ashape.cols()}); |
||||
CT::Array<T> B({2, 3, Bshape.rows(), Bshape.cols()}); |
||||
CT::Array<T> C({2, 3, Cshape.rows(), Cshape.cols()}); |
||||
|
||||
A.setRandom(-100, 100); |
||||
B.setRandom(-100, 100); |
||||
|
||||
A.updateDevice(); |
||||
B.updateDevice(); |
||||
C.updateDevice().wait(); |
||||
|
||||
CT::BLAS::GEMM<T>(1.0, A, B, 0.0, C).wait(); |
||||
C.updateHost().wait(); |
||||
|
||||
double norm = 0; |
||||
CT::Array<T> CTest(Cshape, true); |
||||
for (int i = 0; i < 2; ++i) { |
||||
for (int j = 0; j < 3; ++j) { |
||||
CTest.eigenMap() = A[i][j].eigenMap() * B[i][j].eigenMap(); |
||||
norm += (CTest.eigenMap() - C[i][j].eigenMap()).norm(); |
||||
} |
||||
} |
||||
|
||||
std::ostringstream msg; |
||||
msg << "Matrix Shapes: " << Ashape << Bshape << ", " |
||||
<< "Residual: " << norm; |
||||
TEST(norm < thres, "GEMM Broadcast", msg.str().c_str()); |
||||
return failed; |
||||
}; |
||||
|
||||
static uint32_t PLU() { |
||||
uint32_t failed = 0; |
||||
CT::Shape Ashape = makeRandom2DShape(); |
||||
CT::Shape xshape = makeRandom2DShape(); |
||||
Ashape = CT::Shape({Ashape.rows(), Ashape.rows()}); |
||||
xshape = CT::Shape({Ashape.rows(), xshape.cols()}); |
||||
|
||||
CT::Array<T> A({2, 3, Ashape.rows(), Ashape.rows()}); |
||||
CT::Array<T> x({2, 3, xshape.rows(), xshape.cols()}); |
||||
CT::Array<T> b({2, 3, xshape.rows(), xshape.cols()}); |
||||
CT::Array<T> Ax({2, 3, xshape.rows(), xshape.cols()}); |
||||
|
||||
A.setRandom(-100, 100); |
||||
b.setRandom(-100, 100); |
||||
|
||||
CT::Array<T> LU(A.copy()); |
||||
x = b; |
||||
|
||||
A.updateDevice(); |
||||
LU.updateDevice(); |
||||
x.updateDevice().wait(); |
||||
|
||||
CT::BLAS::PLUBatch<T> luBatch(LU); |
||||
CT::BLAS::Batch<T> xBatch(x); |
||||
luBatch.computeLU().wait(); |
||||
luBatch.solve(xBatch).wait(); |
||||
|
||||
// Compute Ax and compare difference.
|
||||
CT::BLAS::GEMM<T>(1.0, A, x, 0.0, Ax).wait(); |
||||
Ax.updateHost(); |
||||
|
||||
double norm = 0; |
||||
for (int i = 0; i < 2; ++i) { |
||||
for (int j = 0; j < 3; ++j) { |
||||
norm += (Ax[i][j].eigenMap() - b[i][j].eigenMap()).norm(); |
||||
} |
||||
} |
||||
|
||||
std::ostringstream msg; |
||||
msg << "Matrix Shape: " << Ashape << xshape << ", " |
||||
<< "Residual: " << norm; |
||||
TEST(norm < thres, "PLU/Solve", msg.str().c_str()); |
||||
return failed; |
||||
} |
||||
}; |
||||
|
||||
// Residual thresholds per precision: single precision tolerates a much
// larger accumulated residual than double precision.
template <> double BLASTests<float>::thres = 10e-1;
template <> double BLASTests<double>::thres = 10e-8;
||||
|
||||
// Runs the macro-level tests (kernel launch and device class) and returns
// the number of failed checks.
uint32_t doMacroTests() {
    uint32_t failures = MacroTests::Kernel();
    failures += MacroTests::Class();
    std::cout << "\n";
    return failures;
}
||||
|
||||
// Runs the array indexing and slicing test suites for element type T,
// printing a labeled banner before each suite; returns the failure count.
template <typename T> uint32_t doArrayTests() {
    uint32_t failures = 0;

    std::cout << boxSmall("Index Tests : " + type<T>()) << "\n";
    failures += ArrayTests<T>::Indexing();

    std::cout << "\n" << boxSmall("Slice Tests : " + type<T>()) << "\n";
    failures += ArrayTests<T>::Slicing();

    std::cout << "\n";
    return failures;
}
||||
|
||||
template <typename T> uint32_t doBLASTests() { |
||||
uint32_t failed = 0; |
||||
std::cout << boxSmall("GEMV Tests : " + type<T>()) << "\n"; |
||||
failed += BLASTests<T>::GEMV(5); |
||||
failed += BLASTests<T>::GEMVBroadcast(); |
||||
|
||||
std::cout << "\n" << boxSmall("GEMM Tests : " + type<T>()) << "\n"; |
||||
failed += BLASTests<T>::GEMM(5); |
||||
failed += BLASTests<T>::GEMMBroadcast(); |
||||
|
||||
std::cout << "\n" << boxSmall("PLU Tests : " + type<T>()) << "\n"; |
||||
failed += BLASTests<T>::PLU(); |
||||
std::cout << "\n"; |
||||
return failed; |
||||
} |
||||
|
||||
int main() { |
||||
uint32_t failed = 0; |
||||
std::cout << box("Macro Tests") << "\n"; |
||||
failed += doMacroTests(); |
||||
|
||||
std::cout << box("Array Tests") << "\n"; |
||||
// Test different sizes.
|
||||
failed += doArrayTests<uint8_t>(); |
||||
failed += doArrayTests<int16_t>(); |
||||
failed += doArrayTests<int32_t>(); |
||||
failed += doArrayTests<double>(); |
||||
|
||||
std::cout << box("BLAS Tests") << "\n"; |
||||
failed += doBLASTests<float>(); |
||||
failed += doBLASTests<double>(); |
||||
|
||||
constexpr uint32_t tests = 2 + 4 * 5 + 13 * 2; |
||||
std::ostringstream msg; |
||||
msg << ((failed == 0) ? "\033[1;32mPASS \033[0m(" : "\033[1;31mFAIL \033[0m(") |
||||
<< (tests - failed) << "/" << tests << ")"; |
||||
std::cout << box2(msg.str()) << "\n"; |
||||
|
||||
return 0; |
||||
} |
Loading…
Reference in new issue