From 00a27b66c32e23c65e93c3c270c8f1fb488d0eac Mon Sep 17 00:00:00 2001 From: Kenneth Jao Date: Wed, 24 May 2023 00:17:39 -0500 Subject: [PATCH] Added complex number support --- Array.h | 98 ++++++++++++++++++++----------- BLAS.h | 132 +++++++++++++++++++----------------------- Complex.h | 125 +++++++++++++++++++++++++++++++++++++++ Macros.h | 3 + Makefile.template | 95 ++++++++++++++++++++++++++++++ docs/source/core.rst | 16 +++-- docs/source/usage.rst | 20 ++++++- tests.cu.cpp | 19 ++++-- 8 files changed, 393 insertions(+), 115 deletions(-) create mode 100644 Complex.h create mode 100644 Makefile.template diff --git a/Array.h b/Array.h index 4efe07e..fc16666 100644 --- a/Array.h +++ b/Array.h @@ -1,11 +1,13 @@ -#ifndef ARRAY_H -#define ARRAY_H +#ifndef CUDATOOLS_ARRAY_H +#define CUDATOOLS_ARRAY_H +#include "Complex.h" #include "Core.h" #include "Macros.h" #include +#include +#include #include -#include #include #include @@ -17,18 +19,34 @@ namespace CudaTools { +/** Type alises and lots of metaprogramming definitions, primarily dealing with + * the different numeric types and overrides. */ + template using EigenMat = Eigen::Matrix; template using EigenMapMat = Eigen::Map>; template using ConstEigenMapMat = Eigen::Map>; -template struct EigenAdaptConst { typedef EigenMapMat type; }; -template struct EigenAdaptConst { typedef ConstEigenMapMat type; }; +template struct EigenAdaptConst_S { typedef EigenMapMat type; }; +template struct EigenAdaptConst_S { typedef ConstEigenMapMat type; }; +template using EigenAdaptConst = typename EigenAdaptConst_S::type; + +template struct ComplexUnderlying_S { typedef T type; }; +template <> struct ComplexUnderlying_S { typedef float type; }; +template <> struct ComplexUnderlying_S { typedef double type; }; +template using ComplexUnderlying = typename ComplexUnderlying_S::type; -#define ENABLE_IF(X) std::enable_if_t -#define IS_INT(T) std::is_integral::value -#define IS_FLOAT(T) std::is_floating_point::value -#define IS_NUM(T) IS_INT(T) or IS_FLOAT(T) +template struct ComplexConversion_S { typedef T type; }; +template <> struct ComplexConversion_S { typedef std::complex type; }; +template <> struct ComplexConversion_S { typedef std::complex type; }; +template using ComplexConversion = typename ComplexConversion_S::type; + +template inline constexpr bool is_int = std::is_integral::value; +template inline constexpr bool is_float = std::is_floating_point::value; +template +inline constexpr bool is_complex = + std::is_same::value or std::is_same::value; +template inline constexpr bool is_num = is_int or is_float or is_complex; template class Array; using Slice = std::pair; @@ -99,11 +117,11 @@ template class ArrayIterator { */ HD void advance(const int32_t amount) { if (amount < 0) { - for (uint32_t i = 0; i < abs(amount); ++i) { + for (uint32_t i = 0; i < std::abs(amount); ++i) { prev(); } } else { - for (uint32_t i = 0; i < abs(amount); ++i) { + for (uint32_t i = 0; i < std::abs(amount); ++i) { next(); } } @@ -211,7 +229,7 @@ template class Array { pHost = new T[shape.items()]; calcEnd(); if (noDevice) return; - pDevice = (T*)CudaTools::malloc(shape.items() * sizeof(T)); + pDevice = reinterpret_cast(CudaTools::malloc(shape.items() * sizeof(T))); }; /** @@ -226,7 +244,7 @@ template class Array { calcEnd(); #ifndef DEVICE if (noDevice) return; - pDevice = (T*)CudaTools::malloc(shape.items() * sizeof(T)); + pDevice = reinterpret_cast(CudaTools::malloc(shape.items() * sizeof(T))); #endif }; @@ -492,12 +510,13 @@ template class Array { /** * Returns the Eigen::Map of this Array. */ - typename EigenAdaptConst::type eigenMap() const { + EigenAdaptConst> eigenMap() const { uint32_t total_dim = mShape.mAxes; CT_ERROR(mIsSlice, "Mapping to an Eigen array cannot occur on slices") CT_ERROR_IF(total_dim, !=, 2, "Mapping to an Eigen array can only occur on two-dimensional arrays"); - return typename EigenAdaptConst::type(POINTER, mShape.rows(), mShape.cols()); + return EigenAdaptConst>((ComplexConversion*)POINTER, mShape.rows(), + mShape.cols()); }; /** @@ -508,7 +527,7 @@ template class Array { /** * Gets the pointer to this array, depending on host or device. */ - HD T* data() const { return POINTER; }; + HD ComplexConversion* data() const { return (ComplexConversion*)POINTER; }; /** * Returns the device pointer regardless of host or device. @@ -556,7 +575,7 @@ template class Array { * Sets the values of the entire Array to a constant. This is restricted to numerical types. */ HD void setConstant(const T value) const { - static_assert(IS_NUM(T), "Function only available on numeric types."); + static_assert(is_num, "Function only available on numeric types."); for (auto it = begin(); it != end(); ++it) { *it = value; } @@ -568,20 +587,33 @@ template class Array { * \brief Host only */ void setRandom(const T min, const T max) const { - static_assert(IS_NUM(T), "Function only available on numeric types."); - CT_ERROR_IF(max, <, min, "Upper bound of range cannot be larger than lower bound"); + static_assert(is_num, "Function only available on numeric types."); + if constexpr (is_complex) { + CT_ERROR_IF(max.real(), <, min.real(), + "Upper bound of range cannot be larger than lower bound"); + CT_ERROR_IF(max.imag(), <, min.imag(), + "Upper bound of range cannot be larger than lower bound"); + } else { + CT_ERROR_IF(max, <, min, "Upper bound of range cannot be larger than lower bound"); + } std::random_device rd; std::mt19937 mt(rd()); - if constexpr (IS_INT(T)) { + if constexpr (is_int) { std::uniform_int_distribution dist(min, max); for (auto it = begin(); it != end(); ++it) { *it = dist(mt); } - } else if constexpr (IS_FLOAT(T)) { + } else if constexpr (is_float) { std::uniform_real_distribution dist(min, max); for (auto it = begin(); it != end(); ++it) { *it = dist(mt); } + } else if constexpr (is_complex) { + std::uniform_real_distribution> distr(min.real(), max.real()); + std::uniform_real_distribution> disti(min.imag(), max.imag()); + for (auto it = begin(); it != end(); ++it) { + *it = T(distr(mt), disti(mt)); + } } }; @@ -590,7 +622,7 @@ template class Array { * restricted to numerical types. */ HD void setRange(T min, const T step = 1) const { - static_assert(IS_NUM(T), "Function only available on numeric types."); + static_assert(is_num, "Function only available on numeric types."); for (auto it = begin(); it != end(); ++it) { *it = min; min += step; @@ -601,7 +633,7 @@ template class Array { * to floating point types. */ HD void setLinspace(const T min, const T max) const { - static_assert(IS_FLOAT(T), "Function only available on numeric floating types."); + static_assert(is_float, "Function only available on numeric floating types."); CT_ERROR_IF(max, <, min, "Upper bound of range cannot be larger than lower bound"); T i = 0; T d = max - min; @@ -617,7 +649,7 @@ template class Array { * \brief Host only */ static Array constant(const Shape& shape, const T value) { - static_assert(IS_NUM(T), "Function only available on numeric types."); + static_assert(is_num, "Function only available on numeric types."); Array arr(shape); arr.setConstant(value); return arr; @@ -629,7 +661,7 @@ template class Array { * \brief Host only */ static Array random(const Shape& shape, const T min, const T max) { - static_assert(IS_NUM(T), "Function only available on numeric types."); + static_assert(is_num, "Function only available on numeric types."); Array arr(shape); arr.setRandom(min, max); return arr; @@ -640,7 +672,7 @@ template class Array { * \brief Host only */ static Array range(const T min, const T max, const T step = 1) { - static_assert(IS_NUM(T), "Function only available on numeric types."); + static_assert(is_num, "Function only available on numeric types."); CT_ERROR_IF(max, <, min, "Upper bound of range cannot be larger than lower bound"); Array arr({(uint32_t)((max - min) / step)}); arr.setRange(min, step); @@ -653,7 +685,7 @@ template class Array { * \brief Host only */ static Array linspace(const T min, const T max, const uint32_t size) { - static_assert(IS_FLOAT(T), "Function only available on numeric floating types."); + static_assert(is_float, "Function only available on numeric floating types."); Array arr({size}); arr.setLinspace(min, max); return arr; @@ -665,7 +697,7 @@ template class Array { * \brief Host only */ Array transposed() const { - static_assert(IS_NUM(T), "Function only available on numeric types."); + static_assert(is_num, "Function only available on numeric types."); CT_ERROR_IF(shape().axes(), !=, 2, "Tranpose can only occur on two-dimensional arrays"); Array new_arr({mShape.rows(), mShape.cols()}); new_arr.eigenMap() = this->eigenMap().transpose().eval(); @@ -678,7 +710,7 @@ template class Array { * \brief Host only */ void transpose() { - static_assert(IS_NUM(T), "Function only available on numeric types."); + static_assert(is_num, "Function only available on numeric types."); CT_ERROR_IF(shape().axes(), !=, 2, "Tranpose can only occur on two-dimensional arrays"); Array new_arr(*this, {mShape.cols(), mShape.rows()}); new_arr.eigenMap() = this->eigenMap().transpose().eval(); @@ -686,7 +718,7 @@ template class Array { }; void inverse() const { - static_assert(IS_FLOAT(T), "Function only available on floating numeric types."); + static_assert(is_float, "Function only available on floating numeric types."); CT_ERROR_IF(shape().axes(), !=, 2, "Inverse can only occur on two-dimensional arrays"); CT_ERROR_IF(shape().rows(), !=, shape().cols(), "Inverse can only occur on square matrices"); @@ -736,7 +768,7 @@ void printAxis(std::ostream& out, const Array& arr, const uint32_t axis, size } else { out << std::setw((i == 0) ? width - 1 : width); } - out << (T)arr[i] << ((i == arr.shape().items() - 1) ? "]" : ","); + out << static_cast(arr[i]) << ((i == arr.shape().items() - 1) ? "]" : ","); } } else if (arr.shape().axes() == 2) { for (uint32_t i = 0; i < arr.shape().dim(0); ++i) { @@ -756,7 +788,7 @@ void printAxis(std::ostream& out, const Array& arr, const uint32_t axis, size template std::ostream& operator<<(std::ostream& out, const Array& arr) { size_t width = 0; - if constexpr (IS_NUM(T)) { + if constexpr (is_num) { T max_val = 0; bool negative = false; for (auto it = arr.begin(); it != arr.end(); ++it) { @@ -765,7 +797,7 @@ template std::ostream& operator<<(std::ostream& out, const Array } width = std::to_string(max_val).size() + 1; width += (negative) ? 1 : 0; - } else if constexpr (IS_FLOAT(T)) { + } else if constexpr (is_float) { T max_val = 0; bool negative = false; for (auto it = arr.begin(); it != arr.end(); ++it) { diff --git a/BLAS.h b/BLAS.h index b79f4e5..1ccf347 100644 --- a/BLAS.h +++ b/BLAS.h @@ -1,10 +1,15 @@ -#ifndef BLAS_H -#define BLAS_H +#ifndef CUDATOOLS_BLAS_H +#define CUDATOOLS_BLAS_H #include "Array.h" +#include "Complex.h" #include "Core.h" #include "Macros.h" +#ifdef CUDACC +#include +#endif + namespace CudaTools { namespace BLAS { @@ -186,12 +191,29 @@ template class Batch { // cuBLAS API // //////////////// -template -constexpr void invoke(F1 f1, F2 f2, Args&&... args) { - if constexpr (std::is_same::value) { +template struct CudaComplexConversion_S { typedef T type; }; +#ifdef CUDACC +template <> struct CudaComplexConversion_S { typedef cuComplex type; }; +template <> struct CudaComplexConversion_S { typedef cuDoubleComplex type; }; +#endif + +template using CudaComplexConversion = typename CudaComplexConversion_S::type; + +// Shorthands to reduce clutter. + +#define CAST(var) reinterpret_cast*>(var) +#define DCAST(var) reinterpret_cast**>(var) + +template +constexpr void invoke(F1 f1, F2 f2, F3 f3, F4 f4, Args&&... args) { + if constexpr (std::is_same::value) { CUBLAS_CHECK(f1(args...)); - } else if constexpr (std::is_same::value) { + } else if constexpr (std::is_same::value) { CUBLAS_CHECK(f2(args...)); + } else if constexpr (std::is_same::value) { + CUBLAS_CHECK(f3(args...)); + } else if constexpr (std::is_same::value) { + CUBLAS_CHECK(f4(args...)); } else { CT_ERROR(true, "BLAS functions are not callable with that type"); } @@ -216,14 +238,16 @@ StreamID GEMV(const T alpha, const Array& A, const Array& x, const T beta, CUBLAS_CHECK( cublasSetStream(Manager::get()->cublasHandle(), Manager::get()->stream(stream.id))); if (bi.size == 1) { - invoke(cublasSgemv, cublasDgemv, Manager::get()->cublasHandle(), CUBLAS_OP_N, rows, cols, - &a, A.dataDevice(), rows, x.dataDevice(), 1, &b, y.dataDevice(), 1); + invoke(cublasSgemv, cublasDgemv, cublasCgemv, cublasZgemv, + Manager::get()->cublasHandle(), CUBLAS_OP_N, rows, cols, CAST(&a), + CAST(A.dataDevice()), rows, CAST(x.dataDevice()), 1, CAST(&b), + CAST(y.dataDevice()), 1); } else { // Greater than 2, so broadcast. - invoke(cublasSgemvStridedBatched, cublasDgemvStridedBatched, - Manager::get()->cublasHandle(), CUBLAS_OP_N, rows, cols, &a, A.dataDevice(), rows, - bi.strideA, x.dataDevice(), 1, bi.strideB, &b, y.dataDevice(), 1, bi.strideC, - bi.size); + invoke(cublasSgemvStridedBatched, cublasDgemvStridedBatched, cublasCgemvStridedBatched, + cublasZgemvStridedBatched, Manager::get()->cublasHandle(), CUBLAS_OP_N, rows, + cols, CAST(&a), CAST(A.dataDevice()), rows, bi.strideA, CAST(x.dataDevice()), 1, + bi.strideB, CAST(&b), CAST(y.dataDevice()), 1, bi.strideC, bi.size); } #else @@ -261,15 +285,17 @@ StreamID GEMM(const T alpha, const Array& A, const Array& B, const T beta, CUBLAS_CHECK( cublasSetStream(Manager::get()->cublasHandle(), Manager::get()->stream(stream.id))); if (bi.size == 1) { - invoke(cublasSgemm, cublasDgemm, Manager::get()->cublasHandle(), CUBLAS_OP_N, - CUBLAS_OP_N, m, n, k, &a, A.dataDevice(), m, B.dataDevice(), k, &b, - C.dataDevice(), m); + invoke(cublasSgemm, cublasDgemm, cublasCgemm, cublasZgemm, + Manager::get()->cublasHandle(), CUBLAS_OP_N, CUBLAS_OP_N, m, n, k, CAST(&a), + CAST(A.dataDevice()), m, CAST(B.dataDevice()), k, CAST(&b), CAST(C.dataDevice()), + m); } else { // Greater than 2, so broadcast. - invoke(cublasSgemmStridedBatched, cublasDgemmStridedBatched, - Manager::get()->cublasHandle(), CUBLAS_OP_N, CUBLAS_OP_N, m, n, k, &a, - A.dataDevice(), m, bi.strideA, B.dataDevice(), k, bi.strideB, &b, C.dataDevice(), - m, bi.strideC, bi.size); + invoke(cublasSgemmStridedBatched, cublasDgemmStridedBatched, cublasCgemmStridedBatched, + cublasZgemmStridedBatched, Manager::get()->cublasHandle(), CUBLAS_OP_N, + CUBLAS_OP_N, m, n, k, CAST(&a), CAST(A.dataDevice()), m, bi.strideA, + CAST(B.dataDevice()), k, bi.strideB, CAST(&b), CAST(C.dataDevice()), m, + bi.strideC, bi.size); } #else @@ -314,8 +340,9 @@ StreamID DGMM(const Array& A, const Array& X, const Array& C, const boo auto mode = (left) ? CUBLAS_SIDE_LEFT : CUBLAS_SIDE_RIGHT; CUBLAS_CHECK( cublasSetStream(Manager::get()->cublasHandle(), Manager::get()->stream(stream.id))); - invoke(cublasSdgmm, cublasDdgmm, Manager::get()->cublasHandle(), m, n, A.dataDevice(), - A.shape().rows(), X.dataDevice(), 1, C.dataDevice(), m); + invoke(cublasSdgmm, cublasDdgmm, cublasCdgmm, cublasZdgmm, Manager::get()->cublasHandle(), m, + n, CAST(A.dataDevice()), A.shape().rows(), CAST(X.dataDevice()), 1, + CAST(C.dataDevice()), m); #else if (left) { C.eigenMap() = X.eigenMap().asDiagonal() * A.eigenMap(); @@ -341,13 +368,14 @@ template static Array empty({1, 1}); template static EigenMapMat empty_map = empty.eigenMap(); }; // namespace internal -template class PLUArray; +template or is_complex, bool> = true> class PLUArray; // This is a wrapper class for Eigen's class so we have more controlled access to // the underlying data. template class PartialPivLU : public Eigen::PartialPivLU>> { private: using Base = Eigen::PartialPivLU>>; - template friend class PLUArray; + template or is_complex, bool>> + friend class PLUArray; EigenMapMat mMapLU; EigenMapMat mMapPivots; @@ -382,7 +410,7 @@ template static PartialPivLU BlankPPLU = PartialPivLU(); /** * Class for storing the PLU decomposition an Array. This is restricted to floating point types. */ -template class PLUArray { +template or is_complex, bool>> class PLUArray { private: Array mLU; Array mPivots; @@ -443,7 +471,7 @@ template class PLUArray { * This is a batch version of PLUArray, to enable usage of the cuBLAS API. This is restricted to * floating point types. */ -template ::value, bool> = true> +template or is_complex, bool> = true> class PLUBatch : public Batch { private: Array mPivotsBatch; @@ -487,9 +515,10 @@ class PLUBatch : public Batch { uint32_t n = this->mShape.rows(); CUBLAS_CHECK( cublasSetStream(Manager::get()->cublasHandle(), Manager::get()->stream(stream.id))); - invoke(cublasSgetrfBatched, cublasDgetrfBatched, Manager::get()->cublasHandle(), n, - this->mBatch.dataDevice(), n, mPivotsBatch.dataDevice(), mInfoLU.dataDevice(), - this->mBatchSize); + invoke(cublasSgetrfBatched, cublasDgetrfBatched, cublasCgetrfBatched, + cublasZgetrfBatched, Manager::get()->cublasHandle(), n, + DCAST(this->mBatch.dataDevice()), n, mPivotsBatch.dataDevice(), + mInfoLU.dataDevice(), this->mBatchSize); #else #pragma omp parallel for @@ -518,9 +547,10 @@ class PLUBatch : public Batch { uint32_t nrhs = b.shape().cols(); CUBLAS_CHECK( cublasSetStream(Manager::get()->cublasHandle(), Manager::get()->stream(stream.id))); - invoke(cublasSgetrsBatched, cublasDgetrsBatched, Manager::get()->cublasHandle(), - CUBLAS_OP_N, n, nrhs, this->mBatch.dataDevice(), n, mPivotsBatch.dataDevice(), - b.batch().dataDevice(), n, &mInfoSolve, this->mBatchSize); + invoke(cublasSgetrsBatched, cublasDgetrsBatched, cublasCgetrsBatched, + cublasZgetrsBatched, Manager::get()->cublasHandle(), CUBLAS_OP_N, n, nrhs, + DCAST(this->mBatch.dataDevice()), n, mPivotsBatch.dataDevice(), + DCAST(b.batch().dataDevice()), n, &mInfoSolve, this->mBatchSize); #else #pragma omp parallel for @@ -554,46 +584,6 @@ class PLUBatch : public Batch { int32_t validSolve() const { return mInfoSolve == 0; } }; -// /** -// * Gets the inverse of each A[i], using an already PLU factorized A[i]. -// * Only available if compiling with CUDA. -// */ -// template -// void inverseBatch(const Array& batchA, const Array& batchC, const Array& -// pivots, -// const Array& info, const Shape shapeA, const Shape shapeC, -// const uint stream = 0) { -// #ifdef CUDA -// CT_ERROR_IF(shapeA.rows(), !=, shapeA.cols(), -// "'A' needs to be square, rows() and column need to match."); -// CT_ERROR_IF(shapeA.rows(), !=, shapeC.cols(), "'A' needs to be the same shape as -// 'C'."); CT_ERROR_IF(shapeA.rows(), !=, shapeC.rows(), "'A' needs to be the same shape -// as 'C'."); - -// CT_ERROR_IF(shapeA.rows(), !=, pivots.shape().rows(), -// "Rows()/columns of 'A' and rows() of pivots need to match."); -// CT_ERROR_IF(batchA.shape().rows(), !=, pivots.shape().cols(), -// "Batch size and columns of pivots need to match."); -// CT_ERROR_IF(info.shape().cols(), !=, 1, "Info needs to be a column vector.") -// CT_ERROR_IF(batchA.shape().rows(), !=, info.shape().rows(), -// "Batch size and length of info need to match."); -// CT_ERROR_IF(batchA.shape().rows(), !=, batchC.shape().rows(), -// "Batches 'A[i]' and 'C[i]' need to match."); - -// std::string s = "cublas" + std::to_string(stream); -// CUBLAS_CHECK( -// cublasSetStream(Manager::get()->cublasHandle(), -// Manager::get()->stream(s))); -// invoke(cublasSgetriBatched, cublasDgetriBatched, -// Manager::get()->cublasHandle(), -// shapeA.rows(), batchA.dataDevice(), shapeA.rows(), pivots.dataDevice(), -// batchC.dataDevice(), shapeC.rows(), info.dataDevice(), -// batchA.shape().rows()); -// #else -// CT_ERROR_IF(true, ==, true, "inverseBatch is not callable without CUDA."); -// #endif -// } - }; // namespace BLAS }; // namespace CudaTools diff --git a/Complex.h b/Complex.h new file mode 100644 index 0000000..e01235f --- /dev/null +++ b/Complex.h @@ -0,0 +1,125 @@ +#ifndef CUDATOOLS_COMPLEX_H +#define CUDATOOLS_COMPLEX_H + +#include "Macros.h" +#include +#include + +/** + * This is directly adapated from cuComplex.h, except placed into a C++ friendly format. + */ + +namespace CudaTools { + +template class complex { + private: + T r = 0; + T i = 0; + + public: + HD complex() = default; + HD complex(T real, T imag) : r(real), i(imag){}; + HD complex(T x) : r(x), i(0){}; + + HD complex operator+(const complex z) const { return complex(r + z.r, i + z.i); }; + HD complex operator-(const complex z) const { return complex(r - z.r, i - z.i); }; + HD complex operator*(const T y) const { return complex(r * y, i * y); }; + HD complex operator/(const T y) const { return complex(r / y, i / y); }; + + HD complex operator*(const complex z) const { + return complex(r * z.r - i * z.i, r * z.i + i * z.r); + }; + HD complex operator/(const complex z) const { + T s = std::abs(z.r) + std::abs(z.i); + T oos = 1.0f / s; + T ars = r * oos, ais = i * oos, brs = z.r * oos, bis = z.i * oos; + s = (brs * brs) + (bis * bis); + oos = 1.0f / s; + return complex(ars * brs + ais * bis, ais * brs - ars * bis) * oos; + }; + + HD void operator+=(const complex z) { + r += z.r; + i += z.i; + }; + HD void operator-=(const complex z) { + r -= z.r; + i -= z.i; + }; + HD void operator*=(const T y) { + r *= y; + i *= y; + }; + HD void operator/=(const T y) { + r /= y; + i /= y; + }; + + HD void operator*=(const complex z) { + T a = r * z.r - i * z.i, b = r * z.i + i * z.r; + r = a; + i = b; + } + + HD void operator/=(const complex z) { + T s = std::abs(z.r) + std::abs(z.i); + T oos = 1.0f / s; + T ars = r * oos, ais = i * oos, brs = z.r * oos, bis = z.i * oos; + s = (brs * brs) + (bis * bis); + oos = 1.0f / s; + r = (ars * brs + ais * bis) * oos; + i = (ais * brs - ars * bis) * oos; + }; + + HD T abs() const { + T a = std::abs(r), b = std::abs(i); + T v, w; + if (a > b) { + v = a; + w = b; + } else { + v = b; + w = a; + } + T t = w / v; + t = 1.0f + t * t; + t = v * std::sqrt(t); + if ((v == 0.0f) || (v > 3.402823466e38f) || (w > 3.402823466e38f)) { + t = v + w; + } + return t; + } + + HD complex conj() const { return complex(r, -1 * i); } + + HD T real() const { return r; }; + HD T imag() const { return i; }; +}; + +template class complex; +template class complex; + +template complex operator*(const T y, const complex z) { return z * y; }; +template complex operator/(const T y, const complex z) { return z / y; }; + +template complex operator*(const real32, const complex); +template complex operator*(const real64, const complex); +template complex operator/(const real32, const complex); +template complex operator/(const real64, const complex); + +}; // namespace CudaTools + +#ifdef CUDA +using complex64 = CudaTools::complex; +using complex128 = CudaTools::complex; +#else +using complex64 = std::complex; /**< Type alias for 64-bit complex floating point datatype. + * This adapts depending on the CUDA compilation flag, and + * will automatically switch CudaTools::complex. */ +using complex128 = + std::complex; /**< Type alias for 128-bit complex floating point datatype. This adapts + * depending on the CUDA compilation flag, and will automatically switch + * CudaTools::complex. */ +#endif + +#endif diff --git a/Macros.h b/Macros.h index 4ffce1a..3c9d467 100644 --- a/Macros.h +++ b/Macros.h @@ -9,6 +9,9 @@ #define CUDACC #endif +using real32 = float; /**< Type alias for 32-bit floating point datatype. */ +using real64 = double; /**< Type alias for 64-bit floating point datatype. */ + #if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ > 0) #define DEVICE #endif diff --git a/Makefile.template b/Makefile.template new file mode 100644 index 0000000..48f72f8 --- /dev/null +++ b/Makefile.template @@ -0,0 +1,95 @@ +CC := g++-10 +NVCC := nvcc +CFLAGS := -Wall -std=c++17 -fopenmp -MMD +NVCC_FLAGS := -MMD -w -Xcompiler + +INCLUDE := <> +LIBS_DIR := <> +LIBS_DIR_GPU := /usr/local/cuda/lib64 <> +LIBS := <> +LIBS_GPU := cuda cudart cublas <> + +TARGET = <> +SRC_DIR = . +BUILD_DIR = build + +# Should not need to modify below. + +CPU_BUILD_DIR = $(BUILD_DIR)/cpu +GPU_BUILD_DIR = $(BUILD_DIR)/gpu + +SRC = $(wildcard $(SRC_DIR)/*/*.cpp) $(wildcard $(SRC_DIR)/*.cpp) + +# Get source files and object files. +GCC_SRC = $(filter-out %.cu.cpp ,$(SRC)) +NVCC_SRC = $(filter %.cu.cpp, $(SRC)) +GCC_OBJ = $(GCC_SRC:$(SRC_DIR)/%.cpp=%.o) +NVCC_OBJ = $(NVCC_SRC:$(SRC_DIR)/%.cpp=%.o) + +# If compiling for CPU, all go to GCC. Otherwise, they are split. +CPU_OBJ = $(addprefix $(CPU_BUILD_DIR)/,$(GCC_OBJ)) $(addprefix $(CPU_BUILD_DIR)/,$(NVCC_OBJ)) +GPU_GCC_OBJ = $(addprefix $(GPU_BUILD_DIR)/,$(GCC_OBJ)) +GPU_NVCC_OBJ = $(addprefix $(GPU_BUILD_DIR)/,$(NVCC_OBJ)) + +# $(info $$GCC_SRC is [${GCC_SRC}]) +# $(info $$NVCC_SRC is [${NVCC_SRC}]) +# $(info $$GCC_OBJ is [${GCC_OBJ}]) +# $(info $$NVCC_OBJ is [${NVCC_OBJ}]) + +# $(info $$CPU_OBJ is [${CPU_OBJ}]) +# $(info $$GPU_GCC_OBJ is [${GPU_GCC_OBJ}]) +# $(info $$GPU_NVCC_OBJ is [${GPU_NVCC_OBJ}]) + +HEADER = $(wildcard $(SRC_DIR)/*/*.h) $(wildcard $(SRC_DIR)/*.h) +CPU_DEPS = $(wildcard $(CPU_BUILD_DIR)/*.d) +GPU_DEPS = $(wildcard $(GPU_BUILD_DIR)/*.d) + +INC := $(INCLUDE:%=-I%) +LIB := $(LIBS_DIR:%=-L%) +LIB_GPU := $(LIBS_DIR_GPU:%=-L%) +LD := $(LIBS:%=-l%) +LD_GPU := $(LIBS_GPU:%=-l%) + +# Reminder: +# $< = first prerequisite +# $@ = the target which matched the rule +# $^ = all prerequisites + +.PHONY: all clean + +all : cpu gpu + +cpu: $(TARGET)CPU +gpu: $(TARGET)GPU + +$(TARGET)CPU: $(CPU_OBJ) + $(CC) $(CFLAGS) $^ -o $@ $(INC) $(LIB) $(LDFLAGS) + +$(CPU_BUILD_DIR)/%.o $(CPU_BUILD_DIR)/%.cu.o: $(SRC_DIR)/%.cpp | $(CPU_BUILD_DIR) + $(CC) $(CFLAGS) -c -o $@ $< $(INC) + +# For GPU, we need to build the NVCC objects, the NVCC linked object, and the +# regular ones. Then, we link them all together. +$(TARGET)GPU: $(GPU_BUILD_DIR)/link.o $(GPU_GCC_OBJ) | $(GPU_BUILD_DIR) + $(CC) -g -DCUDA $(CFLAGS) $(GPU_NVCC_OBJ) $^ -o $@ $(INC) $(LIB) $(LIB_GPU) $(LD) $(LD_GPU) + +$(GPU_BUILD_DIR)/link.o: $(GPU_NVCC_OBJ) | $(GPU_BUILD_DIR) + $(NVCC) --device-link $^ -o $@ + +$(GPU_BUILD_DIR)/%.cu.o: $(SRC_DIR)/%.cu.cpp | $(GPU_BUILD_DIR) + $(NVCC) $(NVCC_FLAGS) -DCUDA -x cu --device-c -o $@ $< $(INC) + +$(GPU_BUILD_DIR)/%.o: $(SRC_DIR)/%.cpp | $(GPU_BUILD_DIR) + $(CC) $(CFLAGS) -g -DCUDA -c -o $@ $< $(INC) + +-include $(CPU_DEPS) +-include $(GPU_DEPS) + +$(CPU_BUILD_DIR): + mkdir -p $@ + +$(GPU_BUILD_DIR): + mkdir -p $@ + +clean: + rm -Rf $(BUILD_DIR) $(TARGET)CPU $(TARGET)GPU diff --git a/docs/source/core.rst b/docs/source/core.rst index d6e2874..f698e4d 100644 --- a/docs/source/core.rst +++ b/docs/source/core.rst @@ -2,12 +2,20 @@ Core.h ====== -The ``Core.h`` header file defines several compiler flags and macros along with +The ``Core.h`` header file defines some useful types and some macros along with a few core classes. -Flags +Types ===== +.. doxygentypedef:: real32 +.. doxygentypedef:: real64 +.. doxygentypedef:: complex64 +.. doxygentypedef:: complex128 + +Macro Definitions +================= + Device Indicators ----------------- .. doxygendefine:: CUDACC @@ -22,8 +30,8 @@ Compilation Options ------------------- .. doxygendefine:: CUDATOOLS_ARRAY_MAX_AXES -Macros -====== +Macro Functions +=============== Kernel ------ diff --git a/docs/source/usage.rst b/docs/source/usage.rst index fdd26a8..372e557 100644 --- a/docs/source/usage.rst +++ b/docs/source/usage.rst @@ -10,6 +10,7 @@ compilation and linking framework: #. :ref:`Array Examples` #. :ref:`BLAS Examples` #. :ref:`Compilation and Linking` +#. :ref:`Notes` The ``Core.h`` header contains the necessary macros, flags and objects for interfacing with basic kernel launching and the CUDA Runtime API. The ``Array.h`` header contains the ``CudaTools::Array`` @@ -47,7 +48,7 @@ kernel. The launch parameters have several items, but for 'embarassingly paralle cases, we can simply generate the settings with the number of threads. More detail with creating launch parameters can be found :ref:`here `. In the above example, there is only one thread. The rest of the arguments are just the kernel arguments. For more detail, -see :ref:`here `. +see :ref:`here `. .. warning:: These kernel definitions must be in a file that will be compiled by ``nvcc``. Also, @@ -297,3 +298,20 @@ file for the first example: The lines above are the first few lines of the ``Makefile``, which are the only lines you should need to modify, consisting of libraries and flags, as well as the name of the target. + +Notes +===== + +Complex Numbers +--------------- +Dealing with complex numbers is slightly complicated, trying to enforce compatability between +two systems and several different libraries which many not have the right support. We +create a simple barebones host and device compatible complex number class following +the same as ``cuComplex.h``, but with proper C++ operator overloading and class structure. However, +while the underlying data structure is identical to all other complex number structures, there +is a lot of type-casting done underneath the hood to get cuBLAS and Eigen to work well +together, while maintaining one 'unified' complex type. + +As a result, there could be some issues and lack of functionality with this at the moment. +For now, it's recommended to use the given ``complex64`` and ``complex128`` types which +should properly adapt and work. diff --git a/tests.cu.cpp b/tests.cu.cpp index d3e179c..9466c32 100644 --- a/tests.cu.cpp +++ b/tests.cu.cpp @@ -2,6 +2,7 @@ #define CUDATOOLS_ARRAY_MAX_AXES 8 #include "Array.h" #include "BLAS.h" +#include "Complex.h" #include "Core.h" #include @@ -47,8 +48,10 @@ template struct Type; REGISTER_PARSE_TYPE(uint8_t); REGISTER_PARSE_TYPE(int16_t); REGISTER_PARSE_TYPE(int32_t); -REGISTER_PARSE_TYPE(float); -REGISTER_PARSE_TYPE(double); +REGISTER_PARSE_TYPE(real32); +REGISTER_PARSE_TYPE(real64); +REGISTER_PARSE_TYPE(complex64); +REGISTER_PARSE_TYPE(complex128); std::string box(std::string str) { std::string tops(str.size() + 6, '#'); @@ -433,6 +436,8 @@ template struct BLASTests { template <> double BLASTests::thres = 10e-1; template <> double BLASTests::thres = 10e-8; +template <> double BLASTests::thres = 10e-1; +template <> double BLASTests::thres = 10e-8; uint32_t doMacroTests() { uint32_t failed = 0; @@ -478,13 +483,15 @@ int main() { failed += doArrayTests(); failed += doArrayTests(); failed += doArrayTests(); - failed += doArrayTests(); + failed += doArrayTests(); std::cout << box("BLAS Tests") << "\n"; - failed += doBLASTests(); - failed += doBLASTests(); + failed += doBLASTests(); + failed += doBLASTests(); + failed += doBLASTests(); + failed += doBLASTests(); - constexpr uint32_t tests = 2 + 4 * 5 + 13 * 2; + constexpr uint32_t tests = 2 + 4 * 5 + 13 * 4; std::ostringstream msg; msg << ((failed == 0) ? "\033[1;32mPASS \033[0m(" : "\033[1;31mFAIL \033[0m(") << (tests - failed) << "/" << tests << ")";