diff --git a/Array.h b/Array.h
index 4a98369..efb8eee 100644
--- a/Array.h
+++ b/Array.h
@@ -1,9 +1,9 @@
 #ifndef CUDATOOLS_ARRAY_H
 #define CUDATOOLS_ARRAY_H

-#include "Complex.h"
 #include "Core.h"
 #include "Macros.h"
+#include "Types.h"

 #include
 #include
 #include
@@ -18,10 +18,9 @@
 #define POINTER pHost
 #endif

-namespace CudaTools {
+using namespace CudaTools::Types;

-/** Type alises and lots of metaprogramming definitions, primarily dealing with
- * the different numeric types and overrides. */
+namespace CudaTools {

 template using EigenMat = Eigen::Matrix;
@@ -32,23 +31,6 @@ template struct EigenAdaptConst_S { typedef EigenMapMat type; };
 template struct EigenAdaptConst_S { typedef ConstEigenMapMat type; };
 template using EigenAdaptConst = typename EigenAdaptConst_S::type;

-template struct ComplexUnderlying_S { typedef T type; };
-template <> struct ComplexUnderlying_S { typedef float type; };
-template <> struct ComplexUnderlying_S { typedef double type; };
-template using ComplexUnderlying = typename ComplexUnderlying_S::type;
-
-template struct ComplexConversion_S { typedef T type; };
-template <> struct ComplexConversion_S { typedef std::complex type; };
-template <> struct ComplexConversion_S { typedef std::complex type; };
-template using ComplexConversion = typename ComplexConversion_S::type;
-
-template inline constexpr bool is_int = std::is_integral::value;
-template inline constexpr bool is_float = std::is_floating_point::value;
-template
-inline constexpr bool is_complex =
-    std::is_same::value or std::is_same::value;
-template inline constexpr bool is_num = is_int or is_float or is_complex;
-
 template class Array;

 using Slice = std::pair;
@@ -576,7 +558,7 @@ template class Array {
      * Sets the values of the entire Array to a constant. This is restricted to numerical types.
      */
     HD void setConstant(const T value) const {
-        static_assert(is_num, "Function only available on numeric types.");
+        static_assert(is_host_num, "Function only available on host-compatible numeric types.");
         for (auto it = begin(); it != end(); ++it) {
             *it = value;
         }
@@ -588,7 +570,7 @@ template class Array {
      * \brief Host only
      */
     void setRandom(const T min, const T max) const {
-        static_assert(is_num, "Function only available on numeric types.");
+        static_assert(is_host_num, "Function only available on host-compatible numeric types.");
         if constexpr (is_complex) {
             CT_ERROR_IF(max.real(), <, min.real(),
                         "Upper bound of range cannot be smaller than lower bound");
@@ -623,7 +605,7 @@ template class Array {
      * restricted to numerical types.
      */
     HD void setRange(T min, const T step = 1) const {
-        static_assert(is_num, "Function only available on numeric types.");
+        static_assert(is_host_num, "Function only available on host-compatible numeric types.");
         for (auto it = begin(); it != end(); ++it) {
             *it = min;
             min += step;
         }
@@ -650,7 +632,7 @@ template class Array {
      * \brief Host only
      */
     static Array constant(const Shape& shape, const T value) {
-        static_assert(is_num, "Function only available on numeric types.");
+        static_assert(is_host_num, "Function only available on host-compatible numeric types.");
         Array arr(shape);
         arr.setConstant(value);
         return arr;
     }
@@ -662,7 +644,7 @@ template class Array {
      * \brief Host only
      */
     static Array random(const Shape& shape, const T min, const T max) {
-        static_assert(is_num, "Function only available on numeric types.");
+        static_assert(is_host_num, "Function only available on host-compatible numeric types.");
         Array arr(shape);
         arr.setRandom(min, max);
         return arr;
     }
@@ -673,7 +655,7 @@ template class Array {
      * \brief Host only
      */
     static Array range(const T min, const T max, const T step = 1) {
-        static_assert(is_num, "Function only available on numeric types.");
+        static_assert(is_host_num, "Function only available on host-compatible numeric types.");
         CT_ERROR_IF(max, <, min, "Upper bound of range cannot be smaller than lower bound");
         Array arr({(uint32_t)((max - min) / step)});
         arr.setRange(min, step);
         return arr;
     }
@@ -698,7 +680,7 @@ template class Array {
      * \brief Host only
      */
     Array transposed() const {
-        static_assert(is_num, "Function only available on numeric types.");
+        static_assert(is_host_num, "Function only available on host-compatible numeric types.");
         CT_ERROR_IF(shape().axes(), !=, 2, "Transpose can only occur on two-dimensional arrays");
         Array new_arr({mShape.rows(), mShape.cols()});
         new_arr.eigenMap() = this->eigenMap().transpose().eval();
         return new_arr;
     }
@@ -711,7 +693,7 @@ template class Array {
      * \brief Host only
      */
     void transpose() {
-        static_assert(is_num, "Function only available on numeric types.");
+        static_assert(is_host_num, "Function only available on host-compatible numeric types.");
         CT_ERROR_IF(shape().axes(), !=, 2, "Transpose can only occur on two-dimensional arrays");
         Array new_arr(*this, {mShape.cols(), mShape.rows()});
         new_arr.eigenMap() = this->eigenMap().transpose().eval();
diff --git a/BLAS.h b/BLAS.h
index b270fb2..8369eb1 100644
--- a/BLAS.h
+++ b/BLAS.h
@@ -2,16 +2,13 @@
 #define CUDATOOLS_BLAS_H

 #include "Array.h"
-#include "Complex.h"
 #include "Core.h"
 #include "Macros.h"
+#include "Types.h"

-#ifdef CUDACC
-#include
-#endif
+using namespace CudaTools::Types;

 namespace CudaTools {
-
 namespace BLAS {

 struct BatchInfo {
     uint32_t size;
 };

-template struct Check {
+struct Check {
+    template
     static void isAtLeast2D(const Array& arr, const std::string& name = "Array") {
         CT_ERROR_IF(arr.shape().axes(), <, 2, (name + " needs to be at least 2D").c_str());
     };

+    template
     static void isSquare(const Array& arr, const std::string& name = "Array") {
         isAtLeast2D(arr, name);
         CT_ERROR_IF(arr.shape().rows(), !=, arr.shape().cols(), (name + " is not square").c_str())
     };

-    static void isValidMatmul(const Array& A, const Array& B, const Array& C,
+    template
+    static void isValidMatmul(const Array& A, const Array& B, const Array& C,
                               const std::string& nameA = "A", const std::string& nameB = "B",
                               const std::string nameC = "C") {
         isAtLeast2D(A, nameA);
@@ -46,7 +46,7 @@ template struct Check {
             ("The shape of " + nameA + nameB + " does not match the shape of " + nameC).c_str());
     };

-    static uint32_t getUpperItems(const Array& arr) {
+    template static uint32_t getUpperItems(const Array& arr) {
         uint32_t upperItems = 1;
         for (uint32_t iAxis = 0; iAxis < arr.shape().axes() - 2; ++iAxis) {
             upperItems *= arr.shape().dim(iAxis);
@@ -54,7 +54,8 @@ template struct Check {
         return upperItems;
     };

-    static void matchUpperShape(const Array& A, const Array& B,
+    template
+    static void matchUpperShape(const Array& A, const Array& B,
                                 const std::string& nameA = "A", const std::string& nameB = "B") {
         CT_ERROR_IF(A.shape().axes(), !=, B.shape().axes(),
                     (nameA + " and " + nameB + " shapes do not match for broadcasting").c_str());
@@ -67,7 +68,8 @@ template struct Check {
         }
     };

-    static BatchInfo isBroadcastable(const Array& A, const Array& B, const Array& C,
+    template
+    static BatchInfo isBroadcastable(const Array& A, const Array& B, const Array& C,
                                      const std::string& nameA = "A", const std::string& nameB = "B",
                                      const std::string nameC = "C") {
         isValidMatmul(A, B, C, nameA, nameB, nameC);
@@ -130,7 +132,7 @@ template class Batch {
     Batch(const Array& arr) {
         CT_ERROR(arr.isView(), "Array cannot be a view");
         mShape = Shape({arr.shape().rows(), arr.shape().cols()});
-        mBatchSize = mCount = Check::getUpperItems(arr);
+        mBatchSize = mCount = Check::getUpperItems(arr);

         mBatch = Array({mBatchSize});
@@ -159,7 +161,7 @@ template class Batch {
 #endif
         if (mCount == 0) {
             mShape = arr.shape();
-            mBatchSize = mCount = Check::getUpperItems(arr);
+            mBatchSize = mCount = Check::getUpperItems(arr);
         } else {
             CT_ERROR_IF(arr.shape(), !=, mShape, "Cannot add matrix of different shape to batch");
         }
@@ -195,15 +197,30 @@ template struct CudaComplexConversion_S { typedef T type; };
 #ifdef CUDACC
 template <> struct CudaComplexConversion_S { typedef cuComplex type; };
 template <> struct CudaComplexConversion_S { typedef cuDoubleComplex type; };
+#else
+
 #endif

 template using CudaComplexConversion = typename CudaComplexConversion_S::type;

+template struct CublasTypeLetter_S { char letter; };
+template <> struct CublasTypeLetter_S { char letter = 'S'; };
+template <> struct CublasTypeLetter_S { char letter = 'D'; };
+template <> struct CublasTypeLetter_S { char letter = 'C'; };
+template <> struct CublasTypeLetter_S { char letter = 'Z'; };
+#ifdef CUDACC
+template <> struct CublasTypeLetter_S { char letter = 'H'; };
+#endif
+
+template char CublasTypeLetter = CublasTypeLetter_S::letter;
+
 // Shorthands to reduce clutter.
 #define CAST(var) reinterpret_cast*>(var)
 #define DCAST(var) reinterpret_cast**>(var)

+#define cublas(T, func) cublas##CublasTypeLetter##func
+
 template
 constexpr void invoke(F1 f1, F2 f2, F3 f3, F4 f4, Args&&... args) {
     if constexpr (std::is_same::value) {
@@ -215,7 +232,26 @@ constexpr void invoke(F1 f1, F2 f2, F3 f3, F4 f4, Args&&... args) {
     } else if constexpr (std::is_same::value) {
         CUBLAS_CHECK(f4(args...));
     } else {
-        CT_ERROR(true, "BLAS functions are not callable with that type");
+        CT_ERROR(true, "This BLAS function is not callable with that type");
+    }
+}
+
+// If someone can think of a better solution, please tell me.
+template
+constexpr void invoke5(F1 f1, F2 f2, F3 f3, F4 f4, F5 f5, Args&&... args) {
+    if constexpr (std::is_same::value) {
+        CUBLAS_CHECK(f1(args...));
+    } else if constexpr (std::is_same::value) {
+        CUBLAS_CHECK(f2(args...));
+    } else if constexpr (std::is_same::value) {
+        CUBLAS_CHECK(f3(args...));
+    } else if constexpr (std::is_same::value) {
+        CUBLAS_CHECK(f4(args...));
+    } else if constexpr (std::is_same::value) {
+        CUBLAS_CHECK(f5(args...));
+    } else {
+        CT_ERROR(true, "This BLAS function is not callable with that type");
     }
 }
@@ -227,7 +263,7 @@ template
 StreamID GEMV(const T alpha, const Array& A, const Array& x, const T beta, const Array& y,
               const StreamID& stream = DEF_CUBLAS_STREAM) {
-    BatchInfo bi = Check::isBroadcastable(A, x, y, "A", "x", "y");
+    BatchInfo bi = Check::isBroadcastable(A, x, y, "A", "x", "y");

     CT_ERROR_IF(x.shape().cols(), !=, 1, "x must be a column vector");
     CT_ERROR_IF(y.shape().cols(), !=, 1, "y must be a column vector");
@@ -241,7 +277,6 @@ StreamID GEMV(const T alpha, const Array& A, const Array& x, const T beta,
                Manager::get()->cublasHandle(), CUBLAS_OP_N, rows, cols, CAST(&a),
                CAST(A.dataDevice()), rows, CAST(x.dataDevice()), 1, CAST(&b), CAST(y.dataDevice()), 1);
-
     } else { // Greater than 2, so broadcast.
         invoke(cublasSgemvStridedBatched, cublasDgemvStridedBatched, cublasCgemvStridedBatched,
                cublasZgemvStridedBatched, Manager::get()->cublasHandle(), CUBLAS_OP_N, rows,
@@ -269,11 +304,11 @@ StreamID GEMV(const T alpha, const Array& A, const Array& x, const T beta,
 * Computes the matrix-matrix product: \f$ C = \alpha AB + \beta C \f$. It will automatically
 * broadcast the operation if applicable.
 */
-template
-StreamID GEMM(const T alpha, const Array& A, const Array& B, const T beta, const Array& C,
+template
+StreamID GEMM(const T alpha, const Array& A, const Array& B, const T beta, const Array& C,
               const StreamID& stream = DEF_CUBLAS_STREAM) {
-    BatchInfo bi = Check::isBroadcastable(A, B, C, "A", "B", "C");
+    BatchInfo bi = Check::isBroadcastable(A, B, C, "A", "B", "C");

     // A is m x k, B is k x n.
     uint32_t m = A.shape().rows();
     uint32_t k = A.shape().cols();
@@ -282,18 +317,19 @@ StreamID GEMM(const T alpha, const Array& A, const Array& B, const T beta,
     uint32_t n = B.shape().cols();
     T a = alpha, b = beta;
 #ifdef CUDA
     CUBLAS_CHECK(cublasSetStream(Manager::get()->cublasHandle(), Manager::get()->stream(stream)));
+
     if (bi.size == 1) {
-        invoke(cublasSgemm, cublasDgemm, cublasCgemm, cublasZgemm,
-               Manager::get()->cublasHandle(), CUBLAS_OP_N, CUBLAS_OP_N, m, n, k, CAST(&a),
-               CAST(A.dataDevice()), m, CAST(B.dataDevice()), k, CAST(&b), CAST(C.dataDevice()),
-               m);
+        invoke5(cublasSgemm, cublasDgemm, cublasCgemm, cublasZgemm, cublasHgemm,
+                Manager::get()->cublasHandle(), CUBLAS_OP_N, CUBLAS_OP_N, m, n, k, CAST(&a),
+                CAST(A.dataDevice()), m, CAST(B.dataDevice()), k, CAST(&b), CAST(C.dataDevice()),
+                m);
     } else { // Greater than 2, so broadcast.
-        invoke(cublasSgemmStridedBatched, cublasDgemmStridedBatched, cublasCgemmStridedBatched,
-               cublasZgemmStridedBatched, Manager::get()->cublasHandle(), CUBLAS_OP_N,
-               CUBLAS_OP_N, m, n, k, CAST(&a), CAST(A.dataDevice()), m, bi.strideA,
-               CAST(B.dataDevice()), k, bi.strideB, CAST(&b), CAST(C.dataDevice()), m,
-               bi.strideC, bi.size);
+        invoke5(cublasSgemmStridedBatched, cublasDgemmStridedBatched, cublasCgemmStridedBatched,
+                cublasZgemmStridedBatched, cublasHgemmStridedBatched,
+                Manager::get()->cublasHandle(), CUBLAS_OP_N, CUBLAS_OP_N, m, n, k, CAST(&a),
+                CAST(A.dataDevice()), m, bi.strideA, CAST(B.dataDevice()), k, bi.strideB,
+                CAST(&b), CAST(C.dataDevice()), m, bi.strideC, bi.size);
     }
 #else
@@ -487,7 +523,7 @@ class PLUBatch : public Batch {
     * Constructor of a PLUBatch from a multi-dimensional array, batched across upper dimensions.
     */
     PLUBatch(const Array& arr) : Batch(arr) {
-        Check::isSquare(arr, "LU Array");
+        Check::isSquare(arr, "LU Array");

         mPivotsBatch = Array({this->mBatchSize * this->mShape.rows()});
         mInfoLU = Array({this->mBatchSize});
diff --git a/Core.h b/Core.h
index 82df51b..f6f10f6 100644
--- a/Core.h
+++ b/Core.h
@@ -81,6 +81,7 @@ class Manager {
 #ifdef CUDACC
     std::unordered_map mStreams;
     cublasHandle_t mCublas;
+    cusparseHandle_t mCusparse;
 #endif
   public:
     /**
@@ -94,6 +95,7 @@ class Manager {
 #ifdef CUDACC
     cudaStream_t stream(const StreamID& stream) const;
     cublasHandle_t cublasHandle() const;
+    cusparseHandle_t cusparseHandle() const;
 #endif
 };
@@ -391,6 +393,7 @@ Manager::Manager(const std::vector& names) {
         addStream(name);
     }
     CUBLAS_CHECK(cublasCreate(&mCublas));
+    CUSPARSE_CHECK(cusparseCreate(&mCusparse));
 #endif
 }
@@ -400,6 +403,7 @@ Manager::~Manager() {
         CUDA_CHECK(cudaStreamDestroy(it.second));
     }
     CUBLAS_CHECK(cublasDestroy(mCublas));
+    CUSPARSE_CHECK(cusparseDestroy(mCusparse));
 #endif
 }
@@ -439,8 +443,10 @@ cudaStream_t Manager::stream(const StreamID& stream) const {
 }

 cublasHandle_t Manager::cublasHandle() const { return mCublas; };
+cusparseHandle_t Manager::cusparseHandle() const { return mCusparse; };

-Manager Manager::mManagerInstance = Manager({"defaultMemory", "defaultCublas", "defaultKernel"});
+Manager Manager::mManagerInstance =
+    Manager({"defaultMemory", "defaultCublas", "defaultCusparse", "defaultKernel"});
 #else
 Manager Manager::mManagerInstance = Manager({""});
 #endif
@@ -674,37 +680,6 @@ void GraphManager::joinBranch(const StreamID& orig_stream, const StreamID& branc
     orig_stream.wait(*event);
 }

-#ifdef CUDACC
-const char* cublasGetErrorString(cublasStatus_t error) {
-    switch (error) {
-    case CUBLAS_STATUS_SUCCESS:
-        return "CUBLAS_STATUS_SUCCESS";
-
-    case CUBLAS_STATUS_NOT_INITIALIZED:
-        return "CUBLAS_STATUS_NOT_INITIALIZED";
-
-    case CUBLAS_STATUS_ALLOC_FAILED:
-        return "CUBLAS_STATUS_ALLOC_FAILED";
-
-    case CUBLAS_STATUS_INVALID_VALUE:
-        return "CUBLAS_STATUS_INVALID_VALUE";
-
-    case CUBLAS_STATUS_ARCH_MISMATCH:
-        return "CUBLAS_STATUS_ARCH_MISMATCH";
-
-    case CUBLAS_STATUS_MAPPING_ERROR:
-        return "CUBLAS_STATUS_MAPPING_ERROR";
-
-    case CUBLAS_STATUS_EXECUTION_FAILED:
-        return "CUBLAS_STATUS_EXECUTION_FAILED";
-
-    case CUBLAS_STATUS_INTERNAL_ERROR:
-        return "CUBLAS_STATUS_INTERNAL_ERROR";
-    }
-
-    return "";
-}
-#endif
 }; // namespace CudaTools

 #endif // CUDATOOLS_IMPLEMENTATION
diff --git a/Macros.h b/Macros.h
index 1586404..abf3e42 100644
--- a/Macros.h
+++ b/Macros.h
@@ -9,9 +9,6 @@
 #define CUDACC
 #endif

-using real32 = float; /**< Type alias for 32-bit floating point datatype. */
-using real64 = double; /**< Type alias for 64-bit floating point datatype. */
-
 #if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ > 0)
 #define DEVICE
 #endif
@@ -124,14 +121,19 @@ using real64 = double; /**< Type alias for 64-bit floating point datatype. */
 #ifdef CUDACC
 #include
+#include
+#include
 #include
+#include

+#define DEVICE_FUNC
 #define HD __host__ __device__
 #define SHARED __shared__

 #define KERNEL(call, ...) __global__ void call(__VA_ARGS__)

 #else

+#define DEVICE_FUNC
 #define HD
 #define SHARED
@@ -139,8 +141,6 @@ using real64 = double; /**< Type alias for 64-bit floating point datatype. */

 #endif // CUDACC

-//#define KERNEL(call, settings, ...) CudaTools::runKernel(call, settings, __VA_ARGS__)
-
 ///////////////////
 // DEVICE MACROS //
 ///////////////////
@@ -252,8 +252,16 @@ using real64 = double; /**< Type alias for 64-bit floating point datatype. */
     do {                                                                                           \
         cublasStatus_t err = (call);                                                               \
         if (err != CUBLAS_STATUS_SUCCESS) {                                                        \
-            printf("[cuBLAS] %s:%d\n | %s\n", __FILE__, __LINE__,                                  \
-                   CudaTools::cublasGetErrorString(err));                                          \
+            printf("[cuBLAS] %s:%d\n | %s\n", __FILE__, __LINE__, cublasGetStatusName(err));       \
+            throw std::exception();                                                                \
+        }                                                                                          \
+    } while (0)
+
+#define CUSPARSE_CHECK(call)                                                                       \
+    do {                                                                                           \
+        cusparseStatus_t err = (call);                                                             \
+        if (err != CUSPARSE_STATUS_SUCCESS) {                                                      \
+            printf("[cuSPARSE] %s:%d\n | %s\n", __FILE__, __LINE__, cusparseGetErrorName(err));    \
             throw std::exception();                                                                \
         }                                                                                          \
     } while (0)
diff --git a/Makefile b/Makefile
index 3a3b631..8bf16d1 100644
--- a/Makefile
+++ b/Makefile
@@ -7,7 +7,7 @@ INCLUDE :=
 LIBS_DIR :=
 LIBS_DIR_GPU := /usr/local/cuda/lib64
 LIBS :=
-LIBS_GPU := cuda cudart cublas
+LIBS_GPU := cuda cudart cublas cusparse

 TARGET = tests
 SRC_DIR = .
diff --git a/Makefile.template b/Makefile.template
index 48f72f8..83fc049 100644
--- a/Makefile.template
+++ b/Makefile.template
@@ -7,7 +7,7 @@ INCLUDE := <>
 LIBS_DIR := <>
 LIBS_DIR_GPU := /usr/local/cuda/lib64 <>
 LIBS := <>
-LIBS_GPU := cuda cudart cublas <>
+LIBS_GPU := cuda cudart cublas cusparse <>

 TARGET = <>
 SRC_DIR = .
diff --git a/Sparse.h b/Sparse.h
new file mode 100644
index 0000000..ca0baf3
--- /dev/null
+++ b/Sparse.h
@@ -0,0 +1,9 @@
+#ifndef CUDATOOLS_SPARSE_H
+#define CUDATOOLS_SPARSE_H
+
+#include "Array.h"
+#include "Core.h"
+#include "Macros.h"
+#include "Types.h"
+
+#endif
diff --git a/Complex.h b/Types.h
similarity index 67%
rename from Complex.h
rename to Types.h
index e01235f..c165b92 100644
--- a/Complex.h
+++ b/Types.h
@@ -11,6 +11,25 @@
 namespace CudaTools {

+namespace Types {
+
+using real32 = float; /**< Type alias for 32-bit floating point datatype. */
+using real64 = double; /**< Type alias for 64-bit floating point datatype. */
+
+#ifdef CUDACC
+
+using real16 = __half;
+using realb16 = __nv_bfloat16;
+
+#else
+
+using real16 = float; /**< Type alias for the 16-bit floating point datatype when compiling for GPU;
+                           otherwise, defaults to float. */
+using realb16 = float; /**< Type alias for the 16-bit bfloat datatype when compiling for GPU;
+                            otherwise, defaults to float. */
+
+#endif // CUDACC
+
 template class complex {
 private:
     T r = 0;
@@ -107,11 +126,10 @@ template complex operator*(const real64, const complex);
 template complex operator/(const real32, const complex);
 template complex operator/(const real64, const complex);

-}; // namespace CudaTools
+#ifdef CUDACC
+using complex64 = complex;
+using complex128 = complex;

-#ifdef CUDA
-using complex64 = CudaTools::complex;
-using complex128 = CudaTools::complex;
 #else
 using complex64 = std::complex; /**< Type alias for 64-bit complex floating point datatype.
                                  * This adapts depending on the CUDA compilation flag, and
@@ -122,4 +140,27 @@ using complex128 =
                                  * CudaTools::complex. */
 #endif

+/** Type aliases and lots of metaprogramming definitions, primarily dealing with
+ * the different numeric types and overrides. */
+
+template struct ComplexUnderlying_S { typedef T type; };
+template <> struct ComplexUnderlying_S { typedef float type; };
+template <> struct ComplexUnderlying_S { typedef double type; };
+template using ComplexUnderlying = typename ComplexUnderlying_S::type;
+
+template struct ComplexConversion_S { typedef T type; };
+template <> struct ComplexConversion_S { typedef std::complex type; };
+template <> struct ComplexConversion_S { typedef std::complex type; };
+template using ComplexConversion = typename ComplexConversion_S::type;
+
+template inline constexpr bool is_int = std::is_integral::value;
+template inline constexpr bool is_float = std::is_floating_point::value;
+template
+inline constexpr bool is_complex =
+    std::is_same::value or std::is_same::value;
+template inline constexpr bool is_host_num = is_int or is_float or is_complex;
+
+}; // namespace Types
+}; // namespace CudaTools
+
 #endif
diff --git a/docs/source/usage.rst b/docs/source/usage.rst
index 7d8b0e7..b2c38db 100644
--- a/docs/source/usage.rst
+++ b/docs/source/usage.rst
@@ -21,7 +21,7 @@ for your own project, after following a few rules.
 The usage of this library will be illustrated through examples, and further details
 can be found in the other sections. The examples are given in the
 `samples `__ folder.

-Throughout this documentation, there are a few common terms that may appear. First,we refer to the CPU as the host, and the GPU as the device. So, a host function refers
+Throughout this documentation, there are a few common terms that may appear. First, we refer to the CPU as the host, and the GPU as the device. So, a host function refers
 to a function runnable on the CPU, and a device function refers to a function that is runnable
 on a device. A kernel is a specific function that the host can call to be run on the device.
@@ -42,17 +42,17 @@ macros provided. For example,
     return 0;
 }

-The ``DEFINE_KERNEL(name, ...)`` macro takes in the function name and its arguments.
+The ``KERNEL(name, ...)`` macro takes in the function name and its arguments.
 The second argument of the ``KERNEL()`` macro specifies the launch parameters for the kernel.
 The launch parameters have several items, but for 'embarrassingly parallel'
-cases, we can simply generate the settings with the number of threads. More detail with
+cases, we can simply generate the settings with the number of threads using ``CudaTools::Kernel::basic``. More detail on
 creating launch parameters can be found :ref:`here `. In the above example,
 there is only one thread. The rest of the arguments are just the kernel arguments. For more
 detail, see :ref:`here `.

.. warning::
    These kernel definitions must be in a file that will be compiled by ``nvcc``. Also,
-    for header files, there is an additional macro ``DECLARE_KERNEL(name, ...)`` to declare it
+    for header files, the same ``KERNEL(name, ...)`` macro can be used to declare it
     and make it available to other files.

 Since many applications use classes, a macro is provided to 'convert' a class into
@@ -192,7 +192,8 @@ situations and with the ``CudaTools::Kernel::basic()`` launch parameters. If com
 mark the loop with ``#pragma parallel for`` and attempt to use OpenMP for parallelism.

 .. warning::
-    Notice that a view must be passed to the kernel, and not the original object. This
+    Notice that a view must be passed to the kernel, and not the original object, otherwise a copy
+    would be made. The

 Array also supports other helpful functions, such as multi-dimensional indexing, slicing,
 and more.
diff --git a/samples/5_SimpleGraph/main.cu.cpp b/samples/5_SimpleGraph/main.cu.cpp
index f9b2afb..a457d84 100644
--- a/samples/5_SimpleGraph/main.cu.cpp
+++ b/samples/5_SimpleGraph/main.cu.cpp
@@ -90,6 +90,7 @@ int main() {
     CudaTools::Array A = CudaTools::Array::constant({100}, 50);
     CudaTools::Array B = CudaTools::Array::constant({100}, 0);

+    // Executes process without graph.
     TIME(doFunc(A.view(), B.view()), ExecuteNoGraph);

     std::cout << A.slice({{0, 10}}) << "\n";
@@ -97,6 +98,7 @@ int main() {
     A.setConstant(50);
     B.setConstant(0);

+    // Executes process with graph.
     CudaTools::GraphManager gm;
     CudaTools::Graph graph("graphStream", myGraph, &gm, A.view(), B.view());
     TIME(graph.execute().wait(), ExecuteGraph);
diff --git a/tests.cu.cpp b/tests.cu.cpp
index c224bd6..46a75b4 100644
--- a/tests.cu.cpp
+++ b/tests.cu.cpp
@@ -2,13 +2,14 @@
 #define CUDATOOLS_ARRAY_MAX_AXES 8
 #include "Array.h"
 #include "BLAS.h"
-#include "Complex.h"
 #include "Core.h"
+#include "Types.h"

 #include
 #include
 #include

+using namespace CudaTools::Types;
 namespace CT = CudaTools;

 /////////////
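
To make the intent of this patch concrete, here is a minimal host-side sketch of how the relocated type traits in CudaTools::Types and the GEMM wrapper are meant to be used together. It assumes a CPU-only build (no nvcc), where real16 falls back to float and GEMM takes the host fallback path; the Array::constant and GEMM calls mirror code shown above in this diff, while the overall program is illustrative and not part of the patch.

#include <iostream>

#include "Array.h"
#include "BLAS.h"
#include "Types.h"

using namespace CudaTools::Types;
namespace CT = CudaTools;

int main() {
    // The numeric traits now live in CudaTools::Types. The GPU-only half types
    // are deliberately *not* host numbers, which is why the Array helpers are
    // now gated on is_host_num rather than the old is_num.
    static_assert(is_host_num<real32>, "real32 is a host-compatible numeric type");
    static_assert(is_host_num<complex64>, "complex64 is a host-compatible numeric type");

    // C = 1 * A * B + 0 * C through the GEMM wrapper. In a host-only build this
    // dispatches to the CPU path; under CUDACC it would go through invoke5 to
    // the matching cublas*gemm call instead.
    CT::Array<real32> A = CT::Array<real32>::constant({2, 2}, 1.0f);
    CT::Array<real32> B = CT::Array<real32>::constant({2, 2}, 2.0f);
    CT::Array<real32> C = CT::Array<real32>::constant({2, 2}, 0.0f);
    CT::BLAS::GEMM(1.0f, A, B, 0.0f, C);

    std::cout << C << "\n"; // expect every entry to be 4
    return 0;
}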