Added initial cuSparse groundwork, and fp16 (__half) GEMM function.

main
Kenneth Jao 2 years ago
parent 31916ed752
commit 0add15db92
  1. 40
      Array.h
  2. 92
      BLAS.h
  3. 39
      Core.h
  4. 22
      Macros.h
  5. 2
      Makefile
  6. 2
      Makefile.template
  7. 10
      Sparse.h
  8. 49
      Types.h
  9. 11
      docs/source/usage.rst
  10. 2
      samples/5_SimpleGraph/main.cu.cpp
  11. 3
      tests.cu.cpp

@ -1,9 +1,9 @@
#ifndef CUDATOOLS_ARRAY_H
#define CUDATOOLS_ARRAY_H
#include "Complex.h"
#include "Core.h"
#include "Macros.h"
#include "Types.h"
#include <Eigen/Dense>
#include <cmath>
#include <complex>
@ -18,10 +18,9 @@
#define POINTER pHost
#endif
namespace CudaTools {
using namespace CudaTools::Types;
/** Type alises and lots of metaprogramming definitions, primarily dealing with
* the different numeric types and overrides. */
namespace CudaTools {
template <typename T>
using EigenMat = Eigen::Matrix<T, Eigen::Dynamic, Eigen::Dynamic, Eigen::ColMajor>;
@ -32,23 +31,6 @@ template <typename T> struct EigenAdaptConst_S { typedef EigenMapMat<T> type; };
template <typename T> struct EigenAdaptConst_S<const T> { typedef ConstEigenMapMat<T> type; };
template <typename T> using EigenAdaptConst = typename EigenAdaptConst_S<T>::type;
template <typename T> struct ComplexUnderlying_S { typedef T type; };
template <> struct ComplexUnderlying_S<complex64> { typedef float type; };
template <> struct ComplexUnderlying_S<complex128> { typedef double type; };
template <typename T> using ComplexUnderlying = typename ComplexUnderlying_S<T>::type;
template <typename T> struct ComplexConversion_S { typedef T type; };
template <> struct ComplexConversion_S<complex64> { typedef std::complex<float> type; };
template <> struct ComplexConversion_S<complex128> { typedef std::complex<double> type; };
template <typename T> using ComplexConversion = typename ComplexConversion_S<T>::type;
template <typename T> inline constexpr bool is_int = std::is_integral<T>::value;
template <typename T> inline constexpr bool is_float = std::is_floating_point<T>::value;
template <typename T>
inline constexpr bool is_complex =
std::is_same<T, complex64>::value or std::is_same<T, complex128>::value;
template <typename T> inline constexpr bool is_num = is_int<T> or is_float<T> or is_complex<T>;
template <typename T> class Array;
using Slice = std::pair<uint32_t, uint32_t>;
@ -576,7 +558,7 @@ template <typename T> class Array {
* Sets the values of the entire Array to a constant. This is restricted to numerical types.
*/
HD void setConstant(const T value) const {
static_assert(is_num<T>, "Function only available on numeric types.");
static_assert(is_host_num<T>, "Function only available on host-compatible numeric types.");
for (auto it = begin(); it != end(); ++it) {
*it = value;
}
@ -588,7 +570,7 @@ template <typename T> class Array {
* \brief Host only
*/
void setRandom(const T min, const T max) const {
static_assert(is_num<T>, "Function only available on numeric types.");
static_assert(is_host_num<T>, "Function only available on host-compatible numeric types.");
if constexpr (is_complex<T>) {
CT_ERROR_IF(max.real(), <, min.real(),
"Upper bound of range cannot be larger than lower bound");
@ -623,7 +605,7 @@ template <typename T> class Array {
* restricted to numerical types.
*/
HD void setRange(T min, const T step = 1) const {
static_assert(is_num<T>, "Function only available on numeric types.");
static_assert(is_host_num<T>, "Function only available on host-compatible numeric types.");
for (auto it = begin(); it != end(); ++it) {
*it = min;
min += step;
@ -650,7 +632,7 @@ template <typename T> class Array {
* \brief Host only
*/
static Array constant(const Shape& shape, const T value) {
static_assert(is_num<T>, "Function only available on numeric types.");
static_assert(is_host_num<T>, "Function only available on host-compatible numeric types.");
Array<T> arr(shape);
arr.setConstant(value);
return arr;
@ -662,7 +644,7 @@ template <typename T> class Array {
* \brief Host only
*/
static Array random(const Shape& shape, const T min, const T max) {
static_assert(is_num<T>, "Function only available on numeric types.");
static_assert(is_host_num<T>, "Function only available on host-compatible numeric types.");
Array<T> arr(shape);
arr.setRandom(min, max);
return arr;
@ -673,7 +655,7 @@ template <typename T> class Array {
* \brief Host only
*/
static Array range(const T min, const T max, const T step = 1) {
static_assert(is_num<T>, "Function only available on numeric types.");
static_assert(is_host_num<T>, "Function only available on host-compatible numeric types.");
CT_ERROR_IF(max, <, min, "Upper bound of range cannot be larger than lower bound");
Array<T> arr({(uint32_t)((max - min) / step)});
arr.setRange(min, step);
@ -698,7 +680,7 @@ template <typename T> class Array {
* \brief Host only
*/
Array transposed() const {
static_assert(is_num<T>, "Function only available on numeric types.");
static_assert(is_host_num<T>, "Function only available on host-compatible numeric types.");
CT_ERROR_IF(shape().axes(), !=, 2, "Tranpose can only occur on two-dimensional arrays");
Array<T> new_arr({mShape.rows(), mShape.cols()});
new_arr.eigenMap() = this->eigenMap().transpose().eval();
@ -711,7 +693,7 @@ template <typename T> class Array {
* \brief Host only
*/
void transpose() {
static_assert(is_num<T>, "Function only available on numeric types.");
static_assert(is_host_num<T>, "Function only available on host-compatible numeric types.");
CT_ERROR_IF(shape().axes(), !=, 2, "Tranpose can only occur on two-dimensional arrays");
Array<T> new_arr(*this, {mShape.cols(), mShape.rows()});
new_arr.eigenMap() = this->eigenMap().transpose().eval();

@ -2,16 +2,13 @@
#define CUDATOOLS_BLAS_H
#include "Array.h"
#include "Complex.h"
#include "Core.h"
#include "Macros.h"
#include "Types.h"
#ifdef CUDACC
#include <cuComplex.h>
#endif
using namespace CudaTools::Types;
namespace CudaTools {
namespace BLAS {
struct BatchInfo {
@ -19,17 +16,20 @@ struct BatchInfo {
uint32_t size;
};
template <typename T> struct Check {
struct Check {
template <typename T>
static void isAtLeast2D(const Array<T>& arr, const std::string& name = "Array") {
CT_ERROR_IF(arr.shape().axes(), <, 2, (name + " needs to be at least 2D").c_str());
};
template <typename T>
static void isSquare(const Array<T>& arr, const std::string& name = "Array") {
isAtLeast2D(arr, name);
CT_ERROR_IF(arr.shape().rows(), !=, arr.shape().cols(), (name + " is not square").c_str())
};
static void isValidMatmul(const Array<T>& A, const Array<T>& B, const Array<T>& C,
template <typename T, typename U, typename V>
static void isValidMatmul(const Array<T>& A, const Array<U>& B, const Array<V>& C,
const std::string& nameA = "A", const std::string& nameB = "B",
const std::string nameC = "C") {
isAtLeast2D(A, nameA);
@ -46,7 +46,7 @@ template <typename T> struct Check {
("The shape of " + nameA + nameB + " does not match the shape of " + nameC).c_str());
};
static uint32_t getUpperItems(const Array<T>& arr) {
template <typename T> static uint32_t getUpperItems(const Array<T>& arr) {
uint32_t upperItems = 1;
for (uint32_t iAxis = 0; iAxis < arr.shape().axes() - 2; ++iAxis) {
upperItems *= arr.shape().dim(iAxis);
@ -54,7 +54,8 @@ template <typename T> struct Check {
return upperItems;
};
static void matchUpperShape(const Array<T>& A, const Array<T>& B,
template <typename T, typename U>
static void matchUpperShape(const Array<T>& A, const Array<U>& B,
const std::string& nameA = "A", const std::string& nameB = "B") {
CT_ERROR_IF(A.shape().axes(), !=, B.shape().axes(),
(nameA + " and " + nameB + " shapes do not match for broadcasting").c_str());
@ -67,7 +68,8 @@ template <typename T> struct Check {
}
};
static BatchInfo isBroadcastable(const Array<T>& A, const Array<T>& B, const Array<T>& C,
template <typename T, typename U, typename V>
static BatchInfo isBroadcastable(const Array<T>& A, const Array<U>& B, const Array<V>& C,
const std::string& nameA = "A", const std::string& nameB = "B",
const std::string nameC = "C") {
isValidMatmul(A, B, C, nameA, nameB, nameC);
@ -130,7 +132,7 @@ template <typename T> class Batch {
Batch(const Array<T>& arr) {
CT_ERROR(arr.isView(), "Array cannot be a view");
mShape = Shape({arr.shape().rows(), arr.shape().cols()});
mBatchSize = mCount = Check<T>::getUpperItems(arr);
mBatchSize = mCount = Check::getUpperItems(arr);
mBatch = Array<T*>({mBatchSize});
@ -159,7 +161,7 @@ template <typename T> class Batch {
#endif
if (mCount == 0) {
mShape = arr.shape();
mBatchSize = mCount = Check<T>::getUpperItems(arr);
mBatchSize = mCount = Check::getUpperItems(arr);
} else {
CT_ERROR_IF(arr.shape(), !=, mShape, "Cannot add matrix of different shape to batch");
}
@ -195,15 +197,30 @@ template <typename T> struct CudaComplexConversion_S { typedef T type; };
#ifdef CUDACC
template <> struct CudaComplexConversion_S<complex64> { typedef cuComplex type; };
template <> struct CudaComplexConversion_S<complex128> { typedef cuDoubleComplex type; };
#else
#endif
template <typename T> using CudaComplexConversion = typename CudaComplexConversion_S<T>::type;
template <typename T> struct CublasTypeLetter_S { char letter; };
template <> struct CublasTypeLetter_S<real32> { char letter = 'S'; };
template <> struct CublasTypeLetter_S<real64> { char letter = 'D'; };
template <> struct CublasTypeLetter_S<complex64> { char letter = 'C'; };
template <> struct CublasTypeLetter_S<complex128> { char letter = 'Z'; };
#ifdef CUDACC
template <> struct CublasTypeLetter_S<real16> { char letter = 'H'; };
#endif
template <typename T> char CublasTypeLetter = CublasTypeLetter_S<T>::letter;
// Shorthands to reduce clutter.
#define CAST(var) reinterpret_cast<CudaComplexConversion<T>*>(var)
#define DCAST(var) reinterpret_cast<CudaComplexConversion<T>**>(var)
#define cublas(T, func) cublas##CublasTypeLetter<T>##func
template <typename T, typename F1, typename F2, typename F3, typename F4, typename... Args>
constexpr void invoke(F1 f1, F2 f2, F3 f3, F4 f4, Args&&... args) {
if constexpr (std::is_same<T, real32>::value) {
@ -215,7 +232,26 @@ constexpr void invoke(F1 f1, F2 f2, F3 f3, F4 f4, Args&&... args) {
} else if constexpr (std::is_same<T, complex128>::value) {
CUBLAS_CHECK(f4(args...));
} else {
CT_ERROR(true, "BLAS functions are not callable with that type");
CT_ERROR(true, "This BLAS function is not callable with that type");
}
}
// If someone can think of a better solution, please tell me.
template <typename T, typename F1, typename F2, typename F3, typename F4, typename F5,
typename... Args>
constexpr void invoke5(F1 f1, F2 f2, F3 f3, F4 f4, F5 f5, Args&&... args) {
if constexpr (std::is_same<T, real32>::value) {
CUBLAS_CHECK(f1(args...));
} else if constexpr (std::is_same<T, real64>::value) {
CUBLAS_CHECK(f2(args...));
} else if constexpr (std::is_same<T, complex64>::value) {
CUBLAS_CHECK(f3(args...));
} else if constexpr (std::is_same<T, complex128>::value) {
CUBLAS_CHECK(f4(args...));
} else if constexpr (std::is_same<T, real16>::value) {
CUBLAS_CHECK(f5(args...));
} else {
CT_ERROR(true, "This BLAS function is not callable with that type");
}
}
@ -227,7 +263,7 @@ template <typename T>
StreamID GEMV(const T alpha, const Array<T>& A, const Array<T>& x, const T beta, const Array<T>& y,
const StreamID& stream = DEF_CUBLAS_STREAM) {
BatchInfo bi = Check<T>::isBroadcastable(A, x, y, "A", "x", "y");
BatchInfo bi = Check::isBroadcastable(A, x, y, "A", "x", "y");
CT_ERROR_IF(x.shape().cols(), !=, 1, "x must be a column vector");
CT_ERROR_IF(y.shape().cols(), !=, 1, "x must be a column vector");
@ -241,7 +277,6 @@ StreamID GEMV(const T alpha, const Array<T>& A, const Array<T>& x, const T beta,
Manager::get()->cublasHandle(), CUBLAS_OP_N, rows, cols, CAST(&a),
CAST(A.dataDevice()), rows, CAST(x.dataDevice()), 1, CAST(&b),
CAST(y.dataDevice()), 1);
} else { // Greater than 2, so broadcast.
invoke<T>(cublasSgemvStridedBatched, cublasDgemvStridedBatched, cublasCgemvStridedBatched,
cublasZgemvStridedBatched, Manager::get()->cublasHandle(), CUBLAS_OP_N, rows,
@ -269,11 +304,11 @@ StreamID GEMV(const T alpha, const Array<T>& A, const Array<T>& x, const T beta,
* Computes the matrix-matrix product: \f$ C = \alpha AB + \beta C \f$. It will automatically
* broadcast the operation if applicable.
*/
template <typename T>
StreamID GEMM(const T alpha, const Array<T>& A, const Array<T>& B, const T beta, const Array<T>& C,
template <typename T, typename U, typename V>
StreamID GEMM(const T alpha, const Array<U>& A, const Array<U>& B, const T beta, const Array<V>& C,
const StreamID& stream = DEF_CUBLAS_STREAM) {
BatchInfo bi = Check<T>::isBroadcastable(A, B, C, "A", "B", "C");
BatchInfo bi = Check::isBroadcastable(A, B, C, "A", "B", "C");
// A is m x k, B is k x n.
uint32_t m = A.shape().rows();
uint32_t k = A.shape().cols();
@ -282,18 +317,19 @@ StreamID GEMM(const T alpha, const Array<T>& A, const Array<T>& B, const T beta,
T a = alpha, b = beta;
#ifdef CUDA
CUBLAS_CHECK(cublasSetStream(Manager::get()->cublasHandle(), Manager::get()->stream(stream)));
if (bi.size == 1) {
invoke<T>(cublasSgemm, cublasDgemm, cublasCgemm, cublasZgemm,
Manager::get()->cublasHandle(), CUBLAS_OP_N, CUBLAS_OP_N, m, n, k, CAST(&a),
CAST(A.dataDevice()), m, CAST(B.dataDevice()), k, CAST(&b), CAST(C.dataDevice()),
m);
invoke5<T>(cublasSgemm, cublasDgemm, cublasCgemm, cublasZgemm, cublasHgemm,
Manager::get()->cublasHandle(), CUBLAS_OP_N, CUBLAS_OP_N, m, n, k, CAST(&a),
CAST(A.dataDevice()), m, CAST(B.dataDevice()), k, CAST(&b), CAST(C.dataDevice()),
m);
} else { // Greater than 2, so broadcast.
invoke<T>(cublasSgemmStridedBatched, cublasDgemmStridedBatched, cublasCgemmStridedBatched,
cublasZgemmStridedBatched, Manager::get()->cublasHandle(), CUBLAS_OP_N,
CUBLAS_OP_N, m, n, k, CAST(&a), CAST(A.dataDevice()), m, bi.strideA,
CAST(B.dataDevice()), k, bi.strideB, CAST(&b), CAST(C.dataDevice()), m,
bi.strideC, bi.size);
invoke5<T>(cublasSgemmStridedBatched, cublasDgemmStridedBatched, cublasCgemmStridedBatched,
cublasZgemmStridedBatched, cublasHgemmStridedBatched,
Manager::get()->cublasHandle(), CUBLAS_OP_N, CUBLAS_OP_N, m, n, k, CAST(&a),
CAST(A.dataDevice()), m, bi.strideA, CAST(B.dataDevice()), k, bi.strideB,
CAST(&b), CAST(C.dataDevice()), m, bi.strideC, bi.size);
}
#else
@ -487,7 +523,7 @@ class PLUBatch : public Batch<T> {
* Constructor of a PLUBatch from a multi-dimensional array, batched across upper dimensions.
*/
PLUBatch(const Array<T>& arr) : Batch<T>(arr) {
Check<T>::isSquare(arr, "LU Array");
Check::isSquare(arr, "LU Array");
mPivotsBatch = Array<int32_t>({this->mBatchSize * this->mShape.rows()});
mInfoLU = Array<int32_t>({this->mBatchSize});

@ -81,6 +81,7 @@ class Manager {
#ifdef CUDACC
std::unordered_map<std::string, cudaStream_t> mStreams;
cublasHandle_t mCublas;
cusparseHandle_t mCusparse;
#endif
public:
/**
@ -94,6 +95,7 @@ class Manager {
#ifdef CUDACC
cudaStream_t stream(const StreamID& stream) const;
cublasHandle_t cublasHandle() const;
cusparseHandle_t cusparseHandle() const;
#endif
};
@ -391,6 +393,7 @@ Manager::Manager(const std::vector<std::string>& names) {
addStream(name);
}
CUBLAS_CHECK(cublasCreate(&mCublas));
CUSPARSE_CHECK(cusparseCreate(&mCusparse));
#endif
}
@ -400,6 +403,7 @@ Manager::~Manager() {
CUDA_CHECK(cudaStreamDestroy(it.second));
}
CUBLAS_CHECK(cublasDestroy(mCublas));
CUSPARSE_CHECK(cusparseDestroy(mCusparse));
#endif
}
@ -439,8 +443,10 @@ cudaStream_t Manager::stream(const StreamID& stream) const {
}
cublasHandle_t Manager::cublasHandle() const { return mCublas; };
cusparseHandle_t Manager::cusparseHandle() const { return mCusparse; };
Manager Manager::mManagerInstance = Manager({"defaultMemory", "defaultCublas", "defaultKernel"});
Manager Manager::mManagerInstance =
Manager({"defaultMemory", "defaultCublas", "defaultCusparse", "defaultKernel"});
#else
Manager Manager::mManagerInstance = Manager({""});
#endif
@ -674,37 +680,6 @@ void GraphManager::joinBranch(const StreamID& orig_stream, const StreamID& branc
orig_stream.wait(*event);
}
#ifdef CUDACC
const char* cublasGetErrorString(cublasStatus_t error) {
switch (error) {
case CUBLAS_STATUS_SUCCESS:
return "CUBLAS_STATUS_SUCCESS";
case CUBLAS_STATUS_NOT_INITIALIZED:
return "CUBLAS_STATUS_NOT_INITIALIZED";
case CUBLAS_STATUS_ALLOC_FAILED:
return "CUBLAS_STATUS_ALLOC_FAILED";
case CUBLAS_STATUS_INVALID_VALUE:
return "CUBLAS_STATUS_INVALID_VALUE";
case CUBLAS_STATUS_ARCH_MISMATCH:
return "CUBLAS_STATUS_ARCH_MISMATCH";
case CUBLAS_STATUS_MAPPING_ERROR:
return "CUBLAS_STATUS_MAPPING_ERROR";
case CUBLAS_STATUS_EXECUTION_FAILED:
return "CUBLAS_STATUS_EXECUTION_FAILED";
case CUBLAS_STATUS_INTERNAL_ERROR:
return "CUBLAS_STATUS_INTERNAL_ERROR";
}
return "<unknown>";
}
#endif
}; // namespace CudaTools
#endif // CUDATOOLS_IMPLEMENTATION

@ -9,9 +9,6 @@
#define CUDACC
#endif
using real32 = float; /**< Type alias for 32-bit floating point datatype. */
using real64 = double; /**< Type alias for 64-bit floating point datatype. */
#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ > 0)
#define DEVICE
#endif
@ -124,14 +121,19 @@ using real64 = double; /**< Type alias for 64-bit floating point datatype. */
#ifdef CUDACC
#include <cublas_v2.h>
#include <cuda_bf16.h>
#include <cuda_fp16.h>
#include <cuda_runtime.h>
#include <cusparse.h>
#define DEVICE_FUNC __device__
#define HD __host__ __device__
#define SHARED __shared__
#define KERNEL(call, ...) __global__ void call(__VA_ARGS__)
#else
#define DEVICE_FUNC
#define HD
#define SHARED
@ -139,8 +141,6 @@ using real64 = double; /**< Type alias for 64-bit floating point datatype. */
#endif // CUDACC
//#define KERNEL(call, settings, ...) CudaTools::runKernel(call, settings, __VA_ARGS__)
///////////////////
// DEVICE MACROS //
///////////////////
@ -252,8 +252,16 @@ using real64 = double; /**< Type alias for 64-bit floating point datatype. */
do { \
cublasStatus_t err = (call); \
if (err != CUBLAS_STATUS_SUCCESS) { \
printf("[cuBLAS] %s:%d\n | %s\n", __FILE__, __LINE__, \
CudaTools::cublasGetErrorString(err)); \
printf("[cuBLAS] %s:%d\n | %s\n", __FILE__, __LINE__, cublasGetStatusName(err)); \
throw std::exception(); \
} \
} while (0)
#define CUSPARSE_CHECK(call) \
do { \
cusparseStatus_t err = (call); \
if (err != CUSPARSE_STATUS_SUCCESS) { \
printf("[cuSPARSE] %s:%d\n | %s\n", __FILE__, __LINE__, cusparseGetErrorName(err)); \
throw std::exception(); \
} \
} while (0)

@ -7,7 +7,7 @@ INCLUDE :=
LIBS_DIR :=
LIBS_DIR_GPU := /usr/local/cuda/lib64
LIBS :=
LIBS_GPU := cuda cudart cublas
LIBS_GPU := cuda cudart cublas cusparse
TARGET = tests
SRC_DIR = .

@ -7,7 +7,7 @@ INCLUDE := <<Put extra include directories here, separated by a space>>
LIBS_DIR := <<Put library directories here, separated by a space>>
LIBS_DIR_GPU := /usr/local/cuda/lib64 <<Put extra include GPU library directories here, separated by a space>>
LIBS := <<Put the names of the libraries here, separated by a space>>
LIBS_GPU := cuda cudart cublas <<Put extra GPU libraries here, separated by a space>>
LIBS_GPU := cuda cudart cublas cusparse <<Put extra GPU libraries here, separated by a space>>
TARGET = <<Put the name of your target here>>
SRC_DIR = .

@ -0,0 +1,10 @@
#ifndef CUDATOOLS_SPARSE_H
#define CUDATOOLS_SPARSE_H
#include "Array.h"
#include "Core.h"
#include "Macros.h"
#include "Types.h"
#endif
#endif

@ -11,6 +11,25 @@
namespace CudaTools {
namespace Types {
using real32 = float; /**< Type alias for 32-bit floating point datatype. */
using real64 = double; /**< Type alias for 64-bit floating point datatype. */
#ifdef CUDACC
using real16 = __half;
using realb16 = __nv_bfloat16;
#else
using real16 = float; /**< Type alias for 16-bit floating point datatype, when using GPU. Otherwise,
defaults to float. */
using realb16 = float; /**< Type alias for the 16-bit bfloat datatype, when using GPU. Otherwise,
defaults to float. */
#endif // CUDACC
template <typename T> class complex {
private:
T r = 0;
@ -107,11 +126,10 @@ template complex<real64> operator*<real64>(const real64, const complex<real64>);
template complex<real32> operator/<real32>(const real32, const complex<real32>);
template complex<real64> operator/<real64>(const real64, const complex<real64>);
}; // namespace CudaTools
#ifdef CUDACC
using complex64 = complex<real32>;
using complex128 = complex<real64>;
#ifdef CUDA
using complex64 = CudaTools::complex<real32>;
using complex128 = CudaTools::complex<real64>;
#else
using complex64 = std::complex<real32>; /**< Type alias for 64-bit complex floating point datatype.
* This adapts depending on the CUDA compilation flag, and
@ -122,4 +140,27 @@ using complex128 =
* CudaTools::complex<real64>. */
#endif
/** Type alises and lots of metaprogramming definitions, primarily dealing with
* the different numeric types and overrides. */
template <typename T> struct ComplexUnderlying_S { typedef T type; };
template <> struct ComplexUnderlying_S<complex64> { typedef float type; };
template <> struct ComplexUnderlying_S<complex128> { typedef double type; };
template <typename T> using ComplexUnderlying = typename ComplexUnderlying_S<T>::type;
template <typename T> struct ComplexConversion_S { typedef T type; };
template <> struct ComplexConversion_S<complex64> { typedef std::complex<float> type; };
template <> struct ComplexConversion_S<complex128> { typedef std::complex<double> type; };
template <typename T> using ComplexConversion = typename ComplexConversion_S<T>::type;
template <typename T> inline constexpr bool is_int = std::is_integral<T>::value;
template <typename T> inline constexpr bool is_float = std::is_floating_point<T>::value;
template <typename T>
inline constexpr bool is_complex =
std::is_same<T, complex64>::value or std::is_same<T, complex128>::value;
template <typename T> inline constexpr bool is_host_num = is_int<T> or is_float<T> or is_complex<T>;
}; // namespace Types
}; // namespace CudaTools
#endif

@ -21,7 +21,7 @@ for your own project, after following a few rules.
The usage of this libary will be illustrated through examples, and further details
can be found in the other sections. The examples are given in the `samples <https://git.acem.ece.illinois.edu/kjao/CudaTools/src/branch/main/samples>`__ folder.
Throughout this documentation, there are a few common terms that may appear. First,we refer to the CPU as the host, and the GPU as the device. So, a host function refers
Throughout this documentation, there are a few common terms that may appear. First, we refer to the CPU as the host, and the GPU as the device. So, a host function refers
to a function runnable on the CPU, and a device function refers to a function that is runnable
on a device. A kernel is a specific function that the host can call to be run on the device.
@ -42,17 +42,17 @@ macros provided. For example,
return 0;
}
The ``DEFINE_KERNEL(name, ...)`` macro takes in the function name and its arguments.
The ``KERNEL(name, ...)`` macro takes in the function name and its arguments.
The second argument in the ``KERNEL()`` macro is are the launch parameters for
kernel. The launch parameters have several items, but for 'embarassingly parallel'
cases, we can simply generate the settings with the number of threads. More detail with
cases, we can simply generate the settings with the number of threads using ``CudaTools::Kernel::basic``. More detail with
creating launch parameters can be found :ref:`here <CudaTools::Kernel::Settings>`. In the above example,
there is only one thread. The rest of the arguments are just the kernel arguments. For more detail,
see :ref:`here <Macro Functions>`.
.. warning::
These kernel definitions must be in a file that will be compiled by ``nvcc``. Also,
for header files, there is an additional macro ``DECLARE_KERNEL(name, ...)`` to declare it
for header files, there is an additional macro ``KERNEL(name, ...)`` to declare it
and make it available to other files.
Since many applications used classes, a macro is provided to 'convert' a class into
@ -192,7 +192,8 @@ situations and with the ``CudaTools::Kernel::basic()`` launch parameters. If com
mark the loop with ``#pragma parallel for`` and attempt to use OpenMP for parallelism.
.. warning::
Notice that a view must be passed to the kernel, and not the original object. This
Notice that a view must be passed to the kernel, and not the original object, otherwise a copy
would be made.
The Array also supports other helpful functions, such as multi-dimensional indexing, slicing, and
a few other functions.

@ -90,6 +90,7 @@ int main() {
CudaTools::Array<uint32_t> A = CudaTools::Array<uint32_t>::constant({100}, 50);
CudaTools::Array<uint32_t> B = CudaTools::Array<uint32_t>::constant({100}, 0);
// Executes process without graph.
TIME(doFunc(A.view(), B.view()), ExecuteNoGraph);
std::cout << A.slice({{0, 10}}) << "\n";
@ -97,6 +98,7 @@ int main() {
A.setConstant(50);
B.setConstant(0);
// Executes process with graph.
CudaTools::GraphManager gm;
CudaTools::Graph graph("graphStream", myGraph, &gm, A.view(), B.view());
TIME(graph.execute().wait(), ExecuteGraph);

@ -2,13 +2,14 @@
#define CUDATOOLS_ARRAY_MAX_AXES 8
#include "Array.h"
#include "BLAS.h"
#include "Complex.h"
#include "Core.h"
#include "Types.h"
#include <Eigen/Core>
#include <chrono>
#include <complex>
using namespace CudaTools::Types;
namespace CT = CudaTools;
/////////////

Loading…
Cancel
Save