Added initial cuSparse groundwork, and fp16 (__half) GEMM function.

Branch: main
Author: Kenneth Jao, 2 years ago
parent 31916ed752
commit 0add15db92
Changed files (with number of lines changed):
  1. Array.h (40)
  2. BLAS.h (86)
  3. Core.h (39)
  4. Macros.h (22)
  5. Makefile (2)
  6. Makefile.template (2)
  7. Sparse.h (10)
  8. Types.h (49)
  9. docs/source/usage.rst (9)
  10. samples/5_SimpleGraph/main.cu.cpp (2)
  11. tests.cu.cpp (3)

Array.h
@@ -1,9 +1,9 @@
 #ifndef CUDATOOLS_ARRAY_H
 #define CUDATOOLS_ARRAY_H
-#include "Complex.h"
 #include "Core.h"
 #include "Macros.h"
+#include "Types.h"
 #include <Eigen/Dense>
 #include <cmath>
 #include <complex>
@@ -18,10 +18,9 @@
 #define POINTER pHost
 #endif
-namespace CudaTools {
-/** Type alises and lots of metaprogramming definitions, primarily dealing with
- * the different numeric types and overrides. */
+using namespace CudaTools::Types;
+namespace CudaTools {
 template <typename T>
 using EigenMat = Eigen::Matrix<T, Eigen::Dynamic, Eigen::Dynamic, Eigen::ColMajor>;
@@ -32,23 +31,6 @@ template <typename T> struct EigenAdaptConst_S { typedef EigenMapMat<T> type; };
 template <typename T> struct EigenAdaptConst_S<const T> { typedef ConstEigenMapMat<T> type; };
 template <typename T> using EigenAdaptConst = typename EigenAdaptConst_S<T>::type;
-template <typename T> struct ComplexUnderlying_S { typedef T type; };
-template <> struct ComplexUnderlying_S<complex64> { typedef float type; };
-template <> struct ComplexUnderlying_S<complex128> { typedef double type; };
-template <typename T> using ComplexUnderlying = typename ComplexUnderlying_S<T>::type;
-template <typename T> struct ComplexConversion_S { typedef T type; };
-template <> struct ComplexConversion_S<complex64> { typedef std::complex<float> type; };
-template <> struct ComplexConversion_S<complex128> { typedef std::complex<double> type; };
-template <typename T> using ComplexConversion = typename ComplexConversion_S<T>::type;
-template <typename T> inline constexpr bool is_int = std::is_integral<T>::value;
-template <typename T> inline constexpr bool is_float = std::is_floating_point<T>::value;
-template <typename T>
-inline constexpr bool is_complex =
-    std::is_same<T, complex64>::value or std::is_same<T, complex128>::value;
-template <typename T> inline constexpr bool is_num = is_int<T> or is_float<T> or is_complex<T>;
 template <typename T> class Array;
 using Slice = std::pair<uint32_t, uint32_t>;
@@ -576,7 +558,7 @@ template <typename T> class Array {
      * Sets the values of the entire Array to a constant. This is restricted to numerical types.
      */
     HD void setConstant(const T value) const {
-        static_assert(is_num<T>, "Function only available on numeric types.");
+        static_assert(is_host_num<T>, "Function only available on host-compatible numeric types.");
         for (auto it = begin(); it != end(); ++it) {
             *it = value;
         }
@@ -588,7 +570,7 @@ template <typename T> class Array {
      * \brief Host only
      */
     void setRandom(const T min, const T max) const {
-        static_assert(is_num<T>, "Function only available on numeric types.");
+        static_assert(is_host_num<T>, "Function only available on host-compatible numeric types.");
         if constexpr (is_complex<T>) {
             CT_ERROR_IF(max.real(), <, min.real(),
                         "Upper bound of range cannot be larger than lower bound");
@@ -623,7 +605,7 @@ template <typename T> class Array {
      * restricted to numerical types.
      */
     HD void setRange(T min, const T step = 1) const {
-        static_assert(is_num<T>, "Function only available on numeric types.");
+        static_assert(is_host_num<T>, "Function only available on host-compatible numeric types.");
         for (auto it = begin(); it != end(); ++it) {
             *it = min;
             min += step;
@@ -650,7 +632,7 @@ template <typename T> class Array {
      * \brief Host only
      */
     static Array constant(const Shape& shape, const T value) {
-        static_assert(is_num<T>, "Function only available on numeric types.");
+        static_assert(is_host_num<T>, "Function only available on host-compatible numeric types.");
         Array<T> arr(shape);
         arr.setConstant(value);
         return arr;
@@ -662,7 +644,7 @@ template <typename T> class Array {
      * \brief Host only
      */
     static Array random(const Shape& shape, const T min, const T max) {
-        static_assert(is_num<T>, "Function only available on numeric types.");
+        static_assert(is_host_num<T>, "Function only available on host-compatible numeric types.");
         Array<T> arr(shape);
         arr.setRandom(min, max);
         return arr;
@@ -673,7 +655,7 @@ template <typename T> class Array {
      * \brief Host only
      */
     static Array range(const T min, const T max, const T step = 1) {
-        static_assert(is_num<T>, "Function only available on numeric types.");
+        static_assert(is_host_num<T>, "Function only available on host-compatible numeric types.");
         CT_ERROR_IF(max, <, min, "Upper bound of range cannot be larger than lower bound");
         Array<T> arr({(uint32_t)((max - min) / step)});
         arr.setRange(min, step);
@@ -698,7 +680,7 @@ template <typename T> class Array {
      * \brief Host only
      */
     Array transposed() const {
-        static_assert(is_num<T>, "Function only available on numeric types.");
+        static_assert(is_host_num<T>, "Function only available on host-compatible numeric types.");
         CT_ERROR_IF(shape().axes(), !=, 2, "Tranpose can only occur on two-dimensional arrays");
         Array<T> new_arr({mShape.rows(), mShape.cols()});
         new_arr.eigenMap() = this->eigenMap().transpose().eval();
@@ -711,7 +693,7 @@ template <typename T> class Array {
      * \brief Host only
      */
     void transpose() {
-        static_assert(is_num<T>, "Function only available on numeric types.");
+        static_assert(is_host_num<T>, "Function only available on host-compatible numeric types.");
         CT_ERROR_IF(shape().axes(), !=, 2, "Tranpose can only occur on two-dimensional arrays");
         Array<T> new_arr(*this, {mShape.cols(), mShape.rows()});
         new_arr.eigenMap() = this->eigenMap().transpose().eval();
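Note: a usage sketch (not part of the commit) of the factory helpers touched above, for a host build where float satisfies the new is_host_num gate; the signatures are exactly those shown in the hunks:

    #include "Array.h"

    int main() {
        // 4x4 array of ones; the is_host_num<float> static_assert passes.
        auto ones = CudaTools::Array<float>::constant({4, 4}, 1.0f);
        // 5 entries: 0, 2, 4, 6, 8, since the size is (max - min) / step.
        auto ramp = CudaTools::Array<float>::range(0.0f, 10.0f, 2.0f);
        // Uniform values in [-1, 1]; complex bounds are checked per component.
        auto noise = CudaTools::Array<float>::random({4, 4}, -1.0f, 1.0f);
        return 0;
    }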

BLAS.h
@@ -2,16 +2,13 @@
 #define CUDATOOLS_BLAS_H
 #include "Array.h"
-#include "Complex.h"
 #include "Core.h"
 #include "Macros.h"
+#include "Types.h"
-#ifdef CUDACC
-#include <cuComplex.h>
-#endif
+using namespace CudaTools::Types;
 namespace CudaTools {
 namespace BLAS {
 struct BatchInfo {
@@ -19,17 +16,20 @@ struct BatchInfo {
     uint32_t size;
 };
-template <typename T> struct Check {
+struct Check {
+    template <typename T>
     static void isAtLeast2D(const Array<T>& arr, const std::string& name = "Array") {
         CT_ERROR_IF(arr.shape().axes(), <, 2, (name + " needs to be at least 2D").c_str());
     };
+    template <typename T>
     static void isSquare(const Array<T>& arr, const std::string& name = "Array") {
         isAtLeast2D(arr, name);
         CT_ERROR_IF(arr.shape().rows(), !=, arr.shape().cols(), (name + " is not square").c_str())
     };
-    static void isValidMatmul(const Array<T>& A, const Array<T>& B, const Array<T>& C,
+    template <typename T, typename U, typename V>
+    static void isValidMatmul(const Array<T>& A, const Array<U>& B, const Array<V>& C,
                               const std::string& nameA = "A", const std::string& nameB = "B",
                               const std::string nameC = "C") {
         isAtLeast2D(A, nameA);
@@ -46,7 +46,7 @@ template <typename T> struct Check {
             ("The shape of " + nameA + nameB + " does not match the shape of " + nameC).c_str());
     };
-    static uint32_t getUpperItems(const Array<T>& arr) {
+    template <typename T> static uint32_t getUpperItems(const Array<T>& arr) {
         uint32_t upperItems = 1;
         for (uint32_t iAxis = 0; iAxis < arr.shape().axes() - 2; ++iAxis) {
             upperItems *= arr.shape().dim(iAxis);
@@ -54,7 +54,8 @@ template <typename T> struct Check {
         return upperItems;
     };
-    static void matchUpperShape(const Array<T>& A, const Array<T>& B,
+    template <typename T, typename U>
+    static void matchUpperShape(const Array<T>& A, const Array<U>& B,
                                 const std::string& nameA = "A", const std::string& nameB = "B") {
         CT_ERROR_IF(A.shape().axes(), !=, B.shape().axes(),
                     (nameA + " and " + nameB + " shapes do not match for broadcasting").c_str());
@@ -67,7 +68,8 @@ template <typename T> struct Check {
         }
     };
-    static BatchInfo isBroadcastable(const Array<T>& A, const Array<T>& B, const Array<T>& C,
+    template <typename T, typename U, typename V>
+    static BatchInfo isBroadcastable(const Array<T>& A, const Array<U>& B, const Array<V>& C,
                                      const std::string& nameA = "A", const std::string& nameB = "B",
                                      const std::string nameC = "C") {
         isValidMatmul(A, B, C, nameA, nameB, nameC);
@@ -130,7 +132,7 @@ template <typename T> class Batch {
     Batch(const Array<T>& arr) {
         CT_ERROR(arr.isView(), "Array cannot be a view");
         mShape = Shape({arr.shape().rows(), arr.shape().cols()});
-        mBatchSize = mCount = Check<T>::getUpperItems(arr);
+        mBatchSize = mCount = Check::getUpperItems(arr);
         mBatch = Array<T*>({mBatchSize});
@@ -159,7 +161,7 @@
 #endif
         if (mCount == 0) {
             mShape = arr.shape();
-            mBatchSize = mCount = Check<T>::getUpperItems(arr);
+            mBatchSize = mCount = Check::getUpperItems(arr);
         } else {
             CT_ERROR_IF(arr.shape(), !=, mShape, "Cannot add matrix of different shape to batch");
         }
@@ -195,15 +197,30 @@ template <typename T> struct CudaComplexConversion_S { typedef T type; };
 #ifdef CUDACC
 template <> struct CudaComplexConversion_S<complex64> { typedef cuComplex type; };
 template <> struct CudaComplexConversion_S<complex128> { typedef cuDoubleComplex type; };
-#else
 #endif
 template <typename T> using CudaComplexConversion = typename CudaComplexConversion_S<T>::type;
+template <typename T> struct CublasTypeLetter_S { char letter; };
+template <> struct CublasTypeLetter_S<real32> { char letter = 'S'; };
+template <> struct CublasTypeLetter_S<real64> { char letter = 'D'; };
+template <> struct CublasTypeLetter_S<complex64> { char letter = 'C'; };
+template <> struct CublasTypeLetter_S<complex128> { char letter = 'Z'; };
+#ifdef CUDACC
+template <> struct CublasTypeLetter_S<real16> { char letter = 'H'; };
+#endif
+template <typename T> char CublasTypeLetter = CublasTypeLetter_S<T>::letter;
 // Shorthands to reduce clutter.
 #define CAST(var) reinterpret_cast<CudaComplexConversion<T>*>(var)
 #define DCAST(var) reinterpret_cast<CudaComplexConversion<T>**>(var)
+#define cublas(T, func) cublas##CublasTypeLetter<T>##func
 template <typename T, typename F1, typename F2, typename F3, typename F4, typename... Args>
 constexpr void invoke(F1 f1, F2 f2, F3 f3, F4 f4, Args&&... args) {
     if constexpr (std::is_same<T, real32>::value) {
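Note: token pasting with ## happens in the preprocessor, before CublasTypeLetter<T> is ever substituted, so the cublas(T, func) macro above expands to the literal tokens cublasCublasTypeLetter<T>func rather than, say, cublasSgemm; and as committed, letter is a non-static member, so CublasTypeLetter_S<T>::letter cannot be read without an instance. A hypothetical static constexpr variant (not part of the commit, names illustrative) that makes the letter usable at compile time, e.g. in diagnostics, while actual symbol selection stays with invoke/invoke5 below:

    template <typename T> struct LetterOf;
    template <> struct LetterOf<float>  { static constexpr char value = 'S'; };
    template <> struct LetterOf<double> { static constexpr char value = 'D'; };

    template <typename T> inline constexpr char letterOf = LetterOf<T>::value;

    static_assert(letterOf<float> == 'S', "readable without an instance");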
@@ -215,7 +232,26 @@ constexpr void invoke(F1 f1, F2 f2, F3 f3, F4 f4, Args&&... args) {
     } else if constexpr (std::is_same<T, complex128>::value) {
         CUBLAS_CHECK(f4(args...));
     } else {
-        CT_ERROR(true, "BLAS functions are not callable with that type");
+        CT_ERROR(true, "This BLAS function is not callable with that type");
+    }
+}
+// If someone can think of a better solution, please tell me.
+template <typename T, typename F1, typename F2, typename F3, typename F4, typename F5,
+          typename... Args>
+constexpr void invoke5(F1 f1, F2 f2, F3 f3, F4 f4, F5 f5, Args&&... args) {
+    if constexpr (std::is_same<T, real32>::value) {
+        CUBLAS_CHECK(f1(args...));
+    } else if constexpr (std::is_same<T, real64>::value) {
+        CUBLAS_CHECK(f2(args...));
+    } else if constexpr (std::is_same<T, complex64>::value) {
+        CUBLAS_CHECK(f3(args...));
+    } else if constexpr (std::is_same<T, complex128>::value) {
+        CUBLAS_CHECK(f4(args...));
+    } else if constexpr (std::is_same<T, real16>::value) {
+        CUBLAS_CHECK(f5(args...));
+    } else {
+        CT_ERROR(true, "This BLAS function is not callable with that type");
     }
 }
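Note: the five-way invoke5 exists because cuBLAS exposes a half-precision GEMM (cublasHgemm) but no half variant of most other routines such as GEMV, so the four-way invoke cannot simply grow a fifth slot everywhere. A self-contained sketch of the dispatch pattern (not part of the commit; plain functions stand in for the cuBLAS entry points, and a static_assert stands in for the runtime CT_ERROR):

    #include <cstdio>
    #include <type_traits>

    void onFloat(float x) { std::printf("float path: %f\n", x); }
    void onDouble(double x) { std::printf("double path: %f\n", x); }

    // Same shape as invoke<T>: one callable per supported type, forwarded args.
    template <typename T, typename F1, typename F2, typename... Args>
    constexpr void dispatch(F1 f1, F2 f2, Args&&... args) {
        if constexpr (std::is_same<T, float>::value) {
            f1(args...);   // only this branch is instantiated for T = float
        } else if constexpr (std::is_same<T, double>::value) {
            f2(args...);
        } else {
            static_assert(sizeof(T) == 0, "not callable with that type");
        }
    }

    int main() {
        dispatch<float>(onFloat, onDouble, 1.0f);
        dispatch<double>(onFloat, onDouble, 2.0);
        return 0;
    }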
@@ -227,7 +263,7 @@ template <typename T>
 StreamID GEMV(const T alpha, const Array<T>& A, const Array<T>& x, const T beta, const Array<T>& y,
               const StreamID& stream = DEF_CUBLAS_STREAM) {
-    BatchInfo bi = Check<T>::isBroadcastable(A, x, y, "A", "x", "y");
+    BatchInfo bi = Check::isBroadcastable(A, x, y, "A", "x", "y");
     CT_ERROR_IF(x.shape().cols(), !=, 1, "x must be a column vector");
     CT_ERROR_IF(y.shape().cols(), !=, 1, "x must be a column vector");
@@ -241,7 +277,6 @@ StreamID GEMV(const T alpha, const Array<T>& A, const Array<T>& x, const T beta,
             Manager::get()->cublasHandle(), CUBLAS_OP_N, rows, cols, CAST(&a),
             CAST(A.dataDevice()), rows, CAST(x.dataDevice()), 1, CAST(&b),
             CAST(y.dataDevice()), 1);
-
     } else { // Greater than 2, so broadcast.
         invoke<T>(cublasSgemvStridedBatched, cublasDgemvStridedBatched, cublasCgemvStridedBatched,
                   cublasZgemvStridedBatched, Manager::get()->cublasHandle(), CUBLAS_OP_N, rows,
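Note: a host-side usage sketch (not part of the commit) for the GEMV wrapper above; the column-vector checks are the ones shown in the previous hunk:

    // y = 2*A*x + 0*y, with A 8x4, x 4x1, y 8x1.
    auto A = CudaTools::Array<real32>::random({8, 4}, -1.0f, 1.0f);
    auto x = CudaTools::Array<real32>::random({4, 1}, -1.0f, 1.0f);
    auto y = CudaTools::Array<real32>::constant({8, 1}, 0.0f);
    CudaTools::BLAS::GEMV(2.0f, A, x, 0.0f, y);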
@@ -269,11 +304,11 @@ StreamID GEMV(const T alpha, const Array<T>& A, const Array<T>& x, const T beta,
  * Computes the matrix-matrix product: \f$ C = \alpha AB + \beta C \f$. It will automatically
  * broadcast the operation if applicable.
  */
-template <typename T>
-StreamID GEMM(const T alpha, const Array<T>& A, const Array<T>& B, const T beta, const Array<T>& C,
+template <typename T, typename U, typename V>
+StreamID GEMM(const T alpha, const Array<U>& A, const Array<U>& B, const T beta, const Array<V>& C,
               const StreamID& stream = DEF_CUBLAS_STREAM) {
-    BatchInfo bi = Check<T>::isBroadcastable(A, B, C, "A", "B", "C");
+    BatchInfo bi = Check::isBroadcastable(A, B, C, "A", "B", "C");
     // A is m x k, B is k x n.
     uint32_t m = A.shape().rows();
     uint32_t k = A.shape().cols();
@@ -282,18 +317,19 @@ StreamID GEMM(const T alpha, const Array<T>& A, const Array<T>& B, const T beta,
     T a = alpha, b = beta;
 #ifdef CUDA
     CUBLAS_CHECK(cublasSetStream(Manager::get()->cublasHandle(), Manager::get()->stream(stream)));
     if (bi.size == 1) {
-        invoke<T>(cublasSgemm, cublasDgemm, cublasCgemm, cublasZgemm,
+        invoke5<T>(cublasSgemm, cublasDgemm, cublasCgemm, cublasZgemm, cublasHgemm,
                   Manager::get()->cublasHandle(), CUBLAS_OP_N, CUBLAS_OP_N, m, n, k, CAST(&a),
                   CAST(A.dataDevice()), m, CAST(B.dataDevice()), k, CAST(&b), CAST(C.dataDevice()),
                   m);
     } else { // Greater than 2, so broadcast.
-        invoke<T>(cublasSgemmStridedBatched, cublasDgemmStridedBatched, cublasCgemmStridedBatched,
-                  cublasZgemmStridedBatched, Manager::get()->cublasHandle(), CUBLAS_OP_N,
-                  CUBLAS_OP_N, m, n, k, CAST(&a), CAST(A.dataDevice()), m, bi.strideA,
-                  CAST(B.dataDevice()), k, bi.strideB, CAST(&b), CAST(C.dataDevice()), m,
-                  bi.strideC, bi.size);
+        invoke5<T>(cublasSgemmStridedBatched, cublasDgemmStridedBatched, cublasCgemmStridedBatched,
+                   cublasZgemmStridedBatched, cublasHgemmStridedBatched,
+                   Manager::get()->cublasHandle(), CUBLAS_OP_N, CUBLAS_OP_N, m, n, k, CAST(&a),
+                   CAST(A.dataDevice()), m, bi.strideA, CAST(B.dataDevice()), k, bi.strideB,
+                   CAST(&b), CAST(C.dataDevice()), m, bi.strideC, bi.size);
     }
 #else
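Note: a sketch (not part of the commit) of the new fp16 path. The host-side fill helpers are deliberately unavailable for real16 (is_host_num<__half> is false, per the Array.h hunks above), so the data is assumed to be produced on the device, e.g. by a kernel:

    #include "Array.h"
    #include "BLAS.h"

    using namespace CudaTools::Types;

    void halfGemmExample() {
        // Shapes: A is 32x64, B is 64x16, so C must be 32x16.
        CudaTools::Array<real16> A({32, 64});
        CudaTools::Array<real16> B({64, 16});
        CudaTools::Array<real16> C({32, 16});

        // C = 1*A*B + 0*C; invoke5 routes real16 to cublasHgemm (or the
        // strided-batched variant when extra leading axes are present).
        CudaTools::BLAS::GEMM(real16(1.0f), A, B, real16(0.0f), C);
    }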
@@ -487,7 +523,7 @@ class PLUBatch : public Batch<T> {
      * Constructor of a PLUBatch from a multi-dimensional array, batched across upper dimensions.
      */
     PLUBatch(const Array<T>& arr) : Batch<T>(arr) {
-        Check<T>::isSquare(arr, "LU Array");
+        Check::isSquare(arr, "LU Array");
         mPivotsBatch = Array<int32_t>({this->mBatchSize * this->mShape.rows()});
         mInfoLU = Array<int32_t>({this->mBatchSize});

Core.h
@@ -81,6 +81,7 @@ class Manager {
 #ifdef CUDACC
     std::unordered_map<std::string, cudaStream_t> mStreams;
     cublasHandle_t mCublas;
+    cusparseHandle_t mCusparse;
 #endif
   public:
     /**
@@ -94,6 +95,7 @@ class Manager {
 #ifdef CUDACC
     cudaStream_t stream(const StreamID& stream) const;
     cublasHandle_t cublasHandle() const;
+    cusparseHandle_t cusparseHandle() const;
 #endif
 };
@@ -391,6 +393,7 @@ Manager::Manager(const std::vector<std::string>& names) {
         addStream(name);
     }
     CUBLAS_CHECK(cublasCreate(&mCublas));
+    CUSPARSE_CHECK(cusparseCreate(&mCusparse));
 #endif
 }
@@ -400,6 +403,7 @@ Manager::~Manager() {
         CUDA_CHECK(cudaStreamDestroy(it.second));
     }
     CUBLAS_CHECK(cublasDestroy(mCublas));
+    CUSPARSE_CHECK(cusparseDestroy(mCusparse));
 #endif
 }
@@ -439,8 +443,10 @@ cudaStream_t Manager::stream(const StreamID& stream) const {
 }
 cublasHandle_t Manager::cublasHandle() const { return mCublas; };
+cusparseHandle_t Manager::cusparseHandle() const { return mCusparse; };
-Manager Manager::mManagerInstance = Manager({"defaultMemory", "defaultCublas", "defaultKernel"});
+Manager Manager::mManagerInstance =
+    Manager({"defaultMemory", "defaultCublas", "defaultCusparse", "defaultKernel"});
 #else
 Manager Manager::mManagerInstance = Manager({""});
 #endif
@@ -674,37 +680,6 @@ void GraphManager::joinBranch(const StreamID& orig_stream, const StreamID& branc
     orig_stream.wait(*event);
 }
-#ifdef CUDACC
-const char* cublasGetErrorString(cublasStatus_t error) {
-    switch (error) {
-    case CUBLAS_STATUS_SUCCESS:
-        return "CUBLAS_STATUS_SUCCESS";
-    case CUBLAS_STATUS_NOT_INITIALIZED:
-        return "CUBLAS_STATUS_NOT_INITIALIZED";
-    case CUBLAS_STATUS_ALLOC_FAILED:
-        return "CUBLAS_STATUS_ALLOC_FAILED";
-    case CUBLAS_STATUS_INVALID_VALUE:
-        return "CUBLAS_STATUS_INVALID_VALUE";
-    case CUBLAS_STATUS_ARCH_MISMATCH:
-        return "CUBLAS_STATUS_ARCH_MISMATCH";
-    case CUBLAS_STATUS_MAPPING_ERROR:
-        return "CUBLAS_STATUS_MAPPING_ERROR";
-    case CUBLAS_STATUS_EXECUTION_FAILED:
-        return "CUBLAS_STATUS_EXECUTION_FAILED";
-    case CUBLAS_STATUS_INTERNAL_ERROR:
-        return "CUBLAS_STATUS_INTERNAL_ERROR";
-    }
-    return "<unknown>";
-}
-#endif
 }; // namespace CudaTools
 #endif // CUDATOOLS_IMPLEMENTATION
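Note: nothing in this commit calls the new cuSPARSE handle yet; a sketch (not part of the commit) of how sparse code could bind it to one of the named streams, mirroring the cublasSetStream pattern in BLAS.h (cusparseSetStream is the standard cuSPARSE call):

    #ifdef CUDACC
    #include "Core.h"

    // Bind the shared cuSPARSE handle to a stream before launching sparse work.
    void bindCusparseStream(const CudaTools::StreamID& stream) {
        CUSPARSE_CHECK(cusparseSetStream(CudaTools::Manager::get()->cusparseHandle(),
                                         CudaTools::Manager::get()->stream(stream)));
    }
    #endif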

Macros.h
@@ -9,9 +9,6 @@
 #define CUDACC
 #endif
-using real32 = float;  /**< Type alias for 32-bit floating point datatype. */
-using real64 = double; /**< Type alias for 64-bit floating point datatype. */
 #if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ > 0)
 #define DEVICE
 #endif
@@ -124,14 +121,19 @@
 #ifdef CUDACC
 #include <cublas_v2.h>
+#include <cuda_bf16.h>
+#include <cuda_fp16.h>
 #include <cuda_runtime.h>
+#include <cusparse.h>
+#define DEVICE_FUNC __device__
 #define HD __host__ __device__
 #define SHARED __shared__
 #define KERNEL(call, ...) __global__ void call(__VA_ARGS__)
 #else
+#define DEVICE_FUNC
 #define HD
 #define SHARED
@@ -139,8 +141,6 @@
 #endif // CUDACC
-//#define KERNEL(call, settings, ...) CudaTools::runKernel(call, settings, __VA_ARGS__)
 ///////////////////
 // DEVICE MACROS //
 ///////////////////
@@ -252,8 +252,16 @@
     do { \
         cublasStatus_t err = (call); \
         if (err != CUBLAS_STATUS_SUCCESS) { \
-            printf("[cuBLAS] %s:%d\n | %s\n", __FILE__, __LINE__, \
-                   CudaTools::cublasGetErrorString(err)); \
+            printf("[cuBLAS] %s:%d\n | %s\n", __FILE__, __LINE__, cublasGetStatusName(err)); \
+            throw std::exception(); \
+        } \
+    } while (0)
+
+#define CUSPARSE_CHECK(call) \
+    do { \
+        cusparseStatus_t err = (call); \
+        if (err != CUSPARSE_STATUS_SUCCESS) { \
+            printf("[cuSPARSE] %s:%d\n | %s\n", __FILE__, __LINE__, cusparseGetErrorName(err)); \
             throw std::exception(); \
         } \
     } while (0)
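Note: both macros print the failing file/line with the library's own status string (the vendor helpers cublasGetStatusName and cusparseGetErrorName are what make the hand-rolled cublasGetErrorString in Core.h deletable) and then throw, so callers can recover. A hedged caller-side sketch (not part of the commit):

    #ifdef CUDACC
    #include "Macros.h"
    #include <cstdlib>
    #include <cusparse.h>
    #include <exception>

    void initSparseOrDie() {
        cusparseHandle_t handle;
        try {
            CUSPARSE_CHECK(cusparseCreate(&handle));
        } catch (const std::exception&) {
            // The macro already printed "[cuSPARSE] file:line | <status name>".
            std::exit(EXIT_FAILURE);
        }
        CUSPARSE_CHECK(cusparseDestroy(handle));
    }
    #endif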

Makefile
@@ -7,7 +7,7 @@ INCLUDE :=
 LIBS_DIR :=
 LIBS_DIR_GPU := /usr/local/cuda/lib64
 LIBS :=
-LIBS_GPU := cuda cudart cublas
+LIBS_GPU := cuda cudart cublas cusparse
 TARGET = tests
 SRC_DIR = .

Makefile.template
@@ -7,7 +7,7 @@ INCLUDE := <<Put extra include directories here, separated by a space>>
 LIBS_DIR := <<Put library directories here, separated by a space>>
 LIBS_DIR_GPU := /usr/local/cuda/lib64 <<Put extra include GPU library directories here, separated by a space>>
 LIBS := <<Put the names of the libraries here, separated by a space>>
-LIBS_GPU := cuda cudart cublas <<Put extra GPU libraries here, separated by a space>>
+LIBS_GPU := cuda cudart cublas cusparse <<Put extra GPU libraries here, separated by a space>>
 TARGET = <<Put the name of your target here>>
 SRC_DIR = .

Sparse.h (new file)
@@ -0,0 +1,10 @@
+#ifndef CUDATOOLS_SPARSE_H
+#define CUDATOOLS_SPARSE_H
+#include "Array.h"
+#include "Core.h"
+#include "Macros.h"
+#include "Types.h"
+#endif
+#endif

Types.h
@@ -11,6 +11,25 @@
 namespace CudaTools {
+namespace Types {
+
+using real32 = float;  /**< Type alias for 32-bit floating point datatype. */
+using real64 = double; /**< Type alias for 64-bit floating point datatype. */
+
+#ifdef CUDACC
+using real16 = __half;
+using realb16 = __nv_bfloat16;
+#else
+using real16 = float;  /**< Type alias for 16-bit floating point datatype, when using GPU. Otherwise,
+                          defaults to float. */
+using realb16 = float; /**< Type alias for the 16-bit bfloat datatype, when using GPU. Otherwise,
+                          defaults to float. */
+#endif // CUDACC
+
 template <typename T> class complex {
   private:
     T r = 0;
@@ -107,11 +126,10 @@ template complex<real64> operator*<real64>(const real64, const complex<real64>);
 template complex<real32> operator/<real32>(const real32, const complex<real32>);
 template complex<real64> operator/<real64>(const real64, const complex<real64>);
-}; // namespace CudaTools
-#ifdef CUDA
-using complex64 = CudaTools::complex<real32>;
-using complex128 = CudaTools::complex<real64>;
+#ifdef CUDACC
+using complex64 = complex<real32>;
+using complex128 = complex<real64>;
 #else
 using complex64 = std::complex<real32>;  /**< Type alias for 64-bit complex floating point datatype.
                                           * This adapts depending on the CUDA compilation flag, and
@@ -122,4 +140,27 @@ using complex128 =
                                           * CudaTools::complex<real64>. */
 #endif
+/** Type alises and lots of metaprogramming definitions, primarily dealing with
+ * the different numeric types and overrides. */
+template <typename T> struct ComplexUnderlying_S { typedef T type; };
+template <> struct ComplexUnderlying_S<complex64> { typedef float type; };
+template <> struct ComplexUnderlying_S<complex128> { typedef double type; };
+template <typename T> using ComplexUnderlying = typename ComplexUnderlying_S<T>::type;
+template <typename T> struct ComplexConversion_S { typedef T type; };
+template <> struct ComplexConversion_S<complex64> { typedef std::complex<float> type; };
+template <> struct ComplexConversion_S<complex128> { typedef std::complex<double> type; };
+template <typename T> using ComplexConversion = typename ComplexConversion_S<T>::type;
+template <typename T> inline constexpr bool is_int = std::is_integral<T>::value;
+template <typename T> inline constexpr bool is_float = std::is_floating_point<T>::value;
+template <typename T>
+inline constexpr bool is_complex =
+    std::is_same<T, complex64>::value or std::is_same<T, complex128>::value;
+template <typename T> inline constexpr bool is_host_num = is_int<T> or is_float<T> or is_complex<T>;
+
+}; // namespace Types
+}; // namespace CudaTools
 #endif
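Note: a few compile-time checks (not part of the commit) illustrating the new layout, assuming a host-only (non-CUDACC) translation unit where the 16-bit aliases fall back to float:

    #include "Types.h"
    #include <complex>
    #include <cstdint>
    #include <type_traits>

    using namespace CudaTools::Types;

    // Host build: the half-precision aliases degrade to float.
    static_assert(std::is_same<real16, float>::value, "host fallback for __half");
    static_assert(std::is_same<realb16, float>::value, "host fallback for __nv_bfloat16");

    // is_host_num is the old is_num: integral, floating, or complex64/128.
    static_assert(is_host_num<int32_t> && is_host_num<real64> && is_host_num<complex128>, "");

    // The complex helpers map the library complex types onto std::complex.
    static_assert(std::is_same<ComplexConversion<complex64>, std::complex<float>>::value, "");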

docs/source/usage.rst
@@ -42,17 +42,17 @@ macros provided. For example,
     return 0;
 }
-The ``DEFINE_KERNEL(name, ...)`` macro takes in the function name and its arguments.
+The ``KERNEL(name, ...)`` macro takes in the function name and its arguments.
 The second argument in the ``KERNEL()`` macro is are the launch parameters for
 kernel. The launch parameters have several items, but for 'embarassingly parallel'
-cases, we can simply generate the settings with the number of threads. More detail with
+cases, we can simply generate the settings with the number of threads using ``CudaTools::Kernel::basic``. More detail with
 creating launch parameters can be found :ref:`here <CudaTools::Kernel::Settings>`. In the above example,
 there is only one thread. The rest of the arguments are just the kernel arguments. For more detail,
 see :ref:`here <Macro Functions>`.
 .. warning::
     These kernel definitions must be in a file that will be compiled by ``nvcc``. Also,
-    for header files, there is an additional macro ``DECLARE_KERNEL(name, ...)`` to declare it
+    for header files, there is an additional macro ``KERNEL(name, ...)`` to declare it
     and make it available to other files.
 Since many applications used classes, a macro is provided to 'convert' a class into
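Note: with the Macros.h change in this commit, ``KERNEL(name, ...)`` expands to ``__global__ void name(...)`` under ``nvcc``. A minimal sketch (not part of the commit) of the documented one-thread pattern; plain pointers are used so as not to guess the Array view type's exact spelling:

    #include "Macros.h"
    #include <cstdint>

    // Expands to: __global__ void times2(uint32_t* data, uint32_t n)
    KERNEL(times2, uint32_t* data, uint32_t n) {
        for (uint32_t i = 0; i < n; ++i) {
            data[i] *= 2;  // a single thread walks the whole array
        }
    }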
@@ -192,7 +192,8 @@ situations and with the ``CudaTools::Kernel::basic()`` launch parameters. If com
 mark the loop with ``#pragma parallel for`` and attempt to use OpenMP for parallelism.
 .. warning::
-    Notice that a view must be passed to the kernel, and not the original object. This
+    Notice that a view must be passed to the kernel, and not the original object, otherwise a copy
+    would be made.
 The Array also supports other helpful functions, such as multi-dimensional indexing, slicing, and
 a few other functions.

samples/5_SimpleGraph/main.cu.cpp
@@ -90,6 +90,7 @@ int main() {
     CudaTools::Array<uint32_t> A = CudaTools::Array<uint32_t>::constant({100}, 50);
     CudaTools::Array<uint32_t> B = CudaTools::Array<uint32_t>::constant({100}, 0);
+    // Executes process without graph.
     TIME(doFunc(A.view(), B.view()), ExecuteNoGraph);
     std::cout << A.slice({{0, 10}}) << "\n";
@@ -97,6 +98,7 @@ int main() {
     A.setConstant(50);
     B.setConstant(0);
+    // Executes process with graph.
     CudaTools::GraphManager gm;
     CudaTools::Graph graph("graphStream", myGraph, &gm, A.view(), B.view());
     TIME(graph.execute().wait(), ExecuteGraph);

tests.cu.cpp
@@ -2,13 +2,14 @@
 #define CUDATOOLS_ARRAY_MAX_AXES 8
 #include "Array.h"
 #include "BLAS.h"
-#include "Complex.h"
 #include "Core.h"
+#include "Types.h"
 #include <Eigen/Core>
 #include <chrono>
 #include <complex>
+using namespace CudaTools::Types;
 namespace CT = CudaTools;
 /////////////
