commit b4e4a49d44b7ea1671c022d29ce6d24d058c9f74 Author: Kenneth Jao Date: Tue Apr 18 03:23:12 2023 -0500 Initial commit with first version of library diff --git a/.clang-format b/.clang-format new file mode 100644 index 0000000..92cb202 --- /dev/null +++ b/.clang-format @@ -0,0 +1,10 @@ +--- +BasedOnStyle: LLVM +IndentWidth: 4 +ColumnLimit: 100 +AllowShortIfStatementsOnASingleLine: true +--- +Language: Cpp +DerivePointerAlignment: false +PointerAlignment: Left +--- diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..b9062a2 --- /dev/null +++ b/.gitignore @@ -0,0 +1,4 @@ +build +*CPU +*GPU +.venv diff --git a/Array.h b/Array.h new file mode 100644 index 0000000..69b9a06 --- /dev/null +++ b/Array.h @@ -0,0 +1,777 @@ +#ifndef ARRAY_H +#define ARRAY_H + +#include "Core.h" +#include "Macros.h" +#include +#include +#include +#include +#include + +#ifdef DEVICE +#define POINTER pDevice +#else +#define POINTER pHost +#endif + +namespace CudaTools { + +template +using EigenMat = Eigen::Matrix; +template using EigenMapMat = Eigen::Map>; +template using ConstEigenMapMat = Eigen::Map>; + +template struct EigenAdaptConst { typedef EigenMapMat type; }; +template struct EigenAdaptConst { typedef ConstEigenMapMat type; }; + +#define ENABLE_IF(X) std::enable_if_t +#define IS_INT(T) std::is_integral::value +#define IS_FLOAT(T) std::is_floating_point::value +#define IS_NUM(T) IS_INT(T) or IS_FLOAT(T) + +template class Array; +using Slice = std::pair; + +/** + * Iterator over the elements of an Array. It tracks a raw data pointer together with the + * Shape being traversed and a per-axis index counter (mIndices). + */ +template class ArrayIterator { + private: + template + friend std::ostream& operator<<(std::ostream& out, const ArrayIterator& it); + T* pData; + Shape mShape; + uint32_t mIndices[CUDATOOLS_ARRAY_MAX_AXES] = {0}; + + public: + /** + * Constructs an iterator at pointer p with all per-axis indices set to zero. + * \param p pointer to the element the iterator starts at + * \param shape the Shape whose dims and strides drive next() and prev() + */ + HD ArrayIterator(T* p, const Shape& shape) : pData(p), mShape(shape){}; + + /** + * Moves the iterator to the next value. 
+ */ + HD void next() { + bool carry = false; + uint32_t offset = 0; + // Walk from the last axis to the first; the loop ends when the unsigned counter wraps. + for (uint32_t iAxis = mShape.axes() - 1; iAxis < mShape.axes(); --iAxis) { + if (mIndices[iAxis] == mShape.dim(iAxis) - 1) { + mIndices[iAxis] = 0; + offset += mShape.stride(iAxis) * (mShape.dim(iAxis) - 1); + carry = true; + } else { + pData += mShape.stride(iAxis); + mIndices[iAxis] += 1; + carry = false; + } + + if (not carry) { + pData -= offset; + return; + } + } + pData += 1; // "Overflow" occurred, so we reached end of array. + } + + /** + * Moves the iterator to the previous value. + */ + HD void prev() { + bool carry = false; + uint32_t offset = 0; + for (uint32_t iAxis = mShape.axes() - 1; iAxis < mShape.axes(); --iAxis) { + if (mIndices[iAxis] == 0) { + mIndices[iAxis] = mShape.dim(iAxis) - 1; + offset += mShape.stride(iAxis) * (mShape.dim(iAxis) - 1); + carry = true; + } else { + // Step this axis back by one; the index must decrease to stay in sync with pData. + pData -= mShape.stride(iAxis); + mIndices[iAxis] -= 1; + carry = false; + } + if (not carry) { + pData += offset; + return; + } + } + pData -= 1; // "Underflow" occurred, so we moved before the first element. + } + + /** + * Moves the iterator a specified value away. Negative amounts step backwards with prev(), + * non-negative amounts step forwards with next(). + * \param amount the amount to advance by + */ + HD void advance(const int32_t amount) { + if (amount < 0) { + for (uint32_t i = 0; i < abs(amount); ++i) { + prev(); + } + } else { + for (uint32_t i = 0; i < abs(amount); ++i) { + next(); + } + } + } + + HD void operator++() { next(); }; /**< Prefix increment operator. */ + HD void operator--() { prev(); }; /**< Prefix decrement operator. */ + + /** Addition operator. Returns an advanced copy; this iterator is unchanged. */ + HD ArrayIterator operator+(const int32_t v) const { + ArrayIterator it = *this; + it.advance(v); + return it; + }; + + /** Subtraction operator. Returns a copy moved backwards; this iterator is unchanged. */ + HD ArrayIterator operator-(const int32_t v) const { + ArrayIterator it = *this; + it.advance(-v); + return it; + }; + HD void operator+=(const int32_t v) { advance(v); }; /**< In-place advance by v. */ + HD void operator-=(const int32_t v) { advance(-v); }; /**< In-place advance by -v. */ + + HD T& operator*() { return *pData; }; /**< Dereference operator. 
*/ + HD const T& operator*() const { return *pData; }; /**< Const dereference operator. */ + + /** + * Equals operator. Compares only the underlying data pointers. + */ + HD bool operator==(const ArrayIterator& it) const { return pData == it.pData; } + + /** + * Not equals operator. Compares only the underlying data pointers. + */ + HD bool operator!=(const ArrayIterator& it) const { return pData != it.pData; } +}; + +template std::ostream& operator<<(std::ostream& out, const ArrayIterator& it) { + return out << it.pData; +} + +/** + * Helper returned by Array::operator<< that writes successive values through an iterator + * each time the comma operator is applied, and errors once the end iterator is reached. + */ +template class ArrayLoader { + private: + ArrayIterator mIterator; + ArrayIterator mIteratorEnd; + + public: + HD ArrayLoader(const ArrayIterator& it, const ArrayIterator& it_end) + : mIterator(it), mIteratorEnd(it_end){}; + HD ArrayLoader &operator,(const T value) { + CT_ERROR_IF(mIterator, ==, mIteratorEnd, "Cannot assign more values than Array size"); + *mIterator = value; + ++mIterator; + return *this; + } +}; + +/** + * A container that holds a N-dimensional array, stored column major. To set the + * maximum N, there is a compiler macro CUDATOOLS_ARRAY_MAX_AXES whose default value is 4. + * It adapts to operations between host and device to ease memory management. + */ +template class Array { + private: + template friend std::ostream& operator<<(std::ostream&, const Array&); + + Shape mShape; + T* pHost = nullptr; + T* pDevice = nullptr; + + bool mIsView = false; + bool mIsSlice = false; + + uint32_t mEndOffset = 0; + + /** Frees the host and device buffers, but only when this Array is not a view. Compiled out on the device. */ + void freeArrays() { +#ifndef DEVICE + if (not mIsView) { + if (pDevice != nullptr) CudaTools::free(pDevice); + if (pHost != nullptr) delete[] pHost; + } +#endif + }; + + /** Recomputes mEndOffset: the offset of the last element (per the shape's dims and strides) plus one. */ + HD void calcEnd() { + uint32_t offset = 0; + for (uint32_t i = 0; i < shape().axes(); ++i) { + offset += (shape().dim(i) - 1) * shape().stride(i); + } + mEndOffset = offset + 1; + }; + + public: + HD Array() = default; + + /** + * Constructor for an Array that creates and allocates an array with + * the specified Shape. Construction in this format is disabled on the device. 
+ * \brief Host only + * \param shape the shape of the array + * \param noDevice if true, the device buffer is not allocated (host buffer only) + */ + Array(const Shape& shape, const bool noDevice = false) : mShape(shape), mIsView(false) { + pHost = new T[shape.items()]; + calcEnd(); + if (noDevice) return; + pDevice = (T*)CudaTools::malloc(shape.items() * sizeof(T)); + }; + + /** + * Constructor for an Array from an existing (preallocated) pointer. The result is marked + * as a view, so the given pointer is never freed by this Array. + * NOTE(review): on the host, unless noDevice is set, a device buffer is allocated here, + * but freeArrays() skips views, so this Array never releases it. Confirm who owns it. + * \param pointer the pointer to use + * \param shape the shape of the array + * \param noDevice if true, the device buffer is not allocated + */ + HD Array(T* const pointer, const Shape& shape, const bool noDevice = false) + : mShape(shape), mIsView(true), mIsSlice(false) { + POINTER = pointer; + calcEnd(); +#ifndef DEVICE + if (noDevice) return; + pDevice = (T*)CudaTools::malloc(shape.items() * sizeof(T)); +#endif + }; + + /** + * Constructor for making an Array view from another Array, + * given an offset and shape. + * \param arr the original Array + * \param shape the shape of the new array + * \param offset the element offset, applied to both host and device pointers, where the view starts + */ + HD Array(const Array& arr, const Shape& shape, const uint32_t offset = 0) + : mShape(shape), pHost(arr.pHost), pDevice(arr.pDevice), mIsView(true), + mIsSlice(arr.mIsSlice) { + calcEnd(); + if (pHost != nullptr) pHost += offset; + if (pDevice != nullptr) pDevice += offset; + }; + + /** + * The copy-constructor for an Array. If the source is a view, the new Array is a view of + * the same memory. Otherwise the host data is deep-copied element by element and, if the + * source has a device buffer, a new device buffer is allocated. On the device, it is always + * treated like a view. + */ + HD Array(const Array& arr) : mShape(arr.mShape), mIsView(arr.mIsView), mIsSlice(arr.mIsSlice) { + calcEnd(); + if (mIsView) { // If the other array was a view (and now this one), just assign. + pHost = arr.pHost; + pDevice = arr.pDevice; + return; + } + + // Otherwise, we assume this needs to own data. 
+ pHost = new T[mShape.items()]; + auto arr_it = arr.begin(); + for (auto it = begin(); it != end(); ++it) { + *it = *arr_it; + ++arr_it; + } + +#ifndef DEVICE + if (arr.pDevice != nullptr) { + pDevice = (T*)CudaTools::malloc(mShape.items() * sizeof(T)); + } +#endif + }; + + /** + * The move-constructor for a Array. + */ + HD Array(Array&& arr) + : mShape(arr.mShape), pHost(arr.pHost), pDevice(arr.pDevice), mIsView(arr.mIsView), + mIsSlice(arr.mIsSlice) { + calcEnd(); + // Make other object empty. + arr.pHost = nullptr; + arr.pDevice = nullptr; + arr.mIsView = true; + }; + + HD ~Array() { freeArrays(); }; + + /** + * The copy-assignment operator for a Array. If this is not a view, + * then the currently owned data will be freed, and a deep copy of the data will + * be performed on both host and device. On the device, it is always treated like a view. + */ + HD Array& operator=(const Array& arr) { + if (this == &arr) return *this; + + if (mIsView) { // If this array is a view, we assign data from the right-hand side. + auto arr_it = arr.begin(); + for (auto it = begin(); it != end() and arr_it != arr.end(); ++it) { + *it = *arr_it; + ++arr_it; + } + return *this; + } + + // Otherwise, it is implied to be object reassignment. + mShape = arr.mShape; + mIsView = arr.mIsView; + mIsSlice = arr.mIsSlice; + calcEnd(); + + // Regardless if the right-hand side is a view, we create a new copy. + // In case that the right-hand side is a view of this array, we + // allocate memory to copy first. Keep in mind that the right-hand side + // array will then become undefined. + + // We can only do this on the host. 
+#ifndef DEVICE + // The new device buffer is only allocated (when this Array already had one); no device data is copied. + T* new_pDevice = nullptr; + if (pDevice != nullptr) { + new_pDevice = (T*)CudaTools::malloc(mShape.items() * sizeof(T)); + } + + // NOTE(review): memcpy reads items() contiguous elements starting at arr.pHost; confirm this + // is valid when arr is a strided slice. + T* new_pHost = new T[mShape.items()]; + memcpy(new_pHost, arr.pHost, mShape.items() * sizeof(T)); + + // Free the old buffers only after the copy, in case arr viewed them. + freeArrays(); + pHost = new_pHost; + pDevice = new_pDevice; +#else + pHost = arr.pHost; + pDevice = arr.pDevice; +#endif + return *this; + }; + + /** + * The move-assignment operator for an Array. If this Array is a view, elements are copied + * from the right-hand side instead. Otherwise the right-hand side must not be a view; its + * buffers are taken over and it is left as an empty view. + */ + HD Array& operator=(Array&& arr) { + if (this == &arr) return *this; + + if (mIsView) { // If this array is a view, we assign data from the right-hand side. + auto arr_it = arr.begin(); + for (auto it = begin(); it != end() and arr_it != arr.end(); ++it) { + *it = *arr_it; + ++arr_it; + } + return *this; + } + + CT_ERROR(arr.mIsView, + "Cannot move-assign view to a non-view (owner). This would lead to undefined " + "behavior."); + + // Otherwise, it is implied to be object reassignment. + freeArrays(); + mShape = arr.mShape; + pHost = arr.pHost; + pDevice = arr.pDevice; + mIsView = arr.mIsView; + mIsSlice = arr.mIsSlice; + calcEnd(); + + // Make other array empty. + arr.pHost = nullptr; + arr.pDevice = nullptr; + arr.mIsView = true; + return *this; + }; + + /** + * Used for indexing the Array. Returns a view with the first axis removed, starting at + * index * stride(0). + * \param index index of the first dimension + */ + HD Array operator[](const uint32_t index) const { + CT_ERROR_IF(index, >=, shape().dim(0), "Index exceeds axis size"); + return Array(*this, shape().subshape(1), index * shape().stride(0)); + }; + + /** + * Used for indexing the Array. 
+ * \param indices a list of indices to index the Array + */ + HD Array operator[](const std::initializer_list indices) const { + CT_ERROR_IF(indices.size(), >, shape().axes(), + "Number of indices cannot exceed number of axes"); + auto it = indices.begin(); + uint32_t offset = 0; + for (uint32_t i = 0; i < indices.size(); ++i) { + uint32_t index = *it; + CT_ERROR_IF(index, >=, shape().dim(i), "Index exceeds axis size"); + offset += index * shape().stride(i); + ++it; + } + return Array(*this, shape().subshape(indices.size()), offset); + }; + + /** + * Starts a comma-separated fill: writes value to the first element and returns an + * ArrayLoader positioned at the second element. + */ + HD ArrayLoader operator<<(const T value) { + auto it = begin(); + *it = value; + ++it; + return ArrayLoader(it, end()); + }; + + HD T operator=(const T& value) { return POINTER[0] = value; }; /**< Assigns a scalar to the first element. */ + HD operator T&() { return POINTER[0]; }; /**< Converts to a reference to the first element. */ + HD operator const T&() const { return POINTER[0]; }; /**< Const conversion to the first element. */ + + /** + * Used to create slices of the Array. Each slice is a half-open (first, second) index pair + * for the corresponding axis; axes without a slice keep their full extent. + * \param slices a list of slices to slice the Array + */ + HD Array slice(const std::initializer_list slices) const { + CT_ERROR_IF(slices.size(), >, shape().axes(), + "Number of slices cannot exceed number of axes"); + + uint32_t offset = 0; + Shape new_shape = mShape; + auto it = slices.begin(); + for (uint32_t i = 0; i < slices.size(); ++i) { + uint32_t from_index = it->first; + uint32_t to_index = it->second; + CT_ERROR_IF(from_index, >, to_index, + "Slice start cannot be greater than than slice end"); + CT_ERROR_IF(from_index, >=, shape().dim(i), "Slice start exceeds axis size"); + // to_index == 0 wraps around in the unsigned subtraction and is rejected here as well. + CT_ERROR_IF(to_index - 1, >=, shape().dim(i), "Slice end exceeds axis size"); + + offset += from_index * shape().stride(i); + new_shape.mAxisDim[i] = to_index - from_index; + ++it; + } + new_shape.mItems = 1; + for (uint32_t i = 0; i < shape().axes(); ++i) { + new_shape.mItems *= new_shape.dim(i); + } + + Array arr(*this, new_shape, offset); + arr.mIsSlice = true; + return arr; + }; + + /** + * Returns this Array with a different Shape. Its self assigning version is reshape. 
+ * If this Array is a slice of another, then it will perform a deep copy, and return + * a new non-view array. + */ + HD Array reshaped(const Shape& new_shape) const { + CT_ERROR_IF(shape().items(), !=, new_shape.items(), + "New shape cannot have a different number of terms"); + if (mIsSlice) { + // Return the owning copy itself. A view of this local would dangle as soon as the + // local is destroyed at the end of the function. + Array arr = this->copy(); + arr.mShape = new_shape; + return arr; + } + Array arr = view(); + arr.mShape = new_shape; + return arr; + }; + + /** + * Changes the Shape of this Array in place. Not allowed on slices; use reshaped instead. + */ + HD void reshape(const Shape& new_shape) { + CT_ERROR_IF(shape().items(), !=, new_shape.items(), + "New shape cannot have a different number of terms"); + CT_ERROR(mIsSlice, "Cannot reshape slice, a new array must be made. (Try reshaped instead)") + mShape = new_shape; + }; + + /** + * Gets a view that has at least two dimensions. Useful for promoting + * single vectors to their 2D counterparts. + */ + HD Array atLeast2D() const { + return (shape().axes() == 1) ? Array(*this, {shape().length(), 1}) : view(); + }; + + /** + * Flattens the Array into one dimension. Follows the rules of reshaped: a view is returned + * unless this Array is a slice, in which case a copy is returned. + */ + HD Array flatten() const { return reshaped({mShape.mItems}); }; + + /** + * Returns the Eigen::Map of this Array. Only valid for two-dimensional, non-slice Arrays. + */ + typename EigenAdaptConst::type eigenMap() const { + uint32_t total_dim = mShape.mAxes; + CT_ERROR(mIsSlice, "Mapping to an Eigen array cannot occur on slices") + CT_ERROR_IF(total_dim, !=, 2, + "Mapping to an Eigen array can only occur on two-dimensional arrays"); + return typename EigenAdaptConst::type(POINTER, mShape.rows(), mShape.cols()); + }; + + /** + * Gets the Shape of the Array. + */ + HD Shape shape() const { return mShape; }; + + /** + * Gets the pointer to this array, depending on host or device. + */ + HD T* data() const { return POINTER; }; + + /** + * Returns the device pointer regardless of host or device. + */ + HD T* dataDevice() const { return pDevice; }; + + HD bool isView() const { return mIsView; }; /**< Gets whether this Array is a view. */ + HD bool isSlice() const { return mIsSlice; }; /**< Gets whether this Array is a slice. 
*/ + + /** + * Gets a view of this Array. + */ + HD Array view() const { return Array(*this, mShape); } + + /** + * Copies this Array and returns a new owning Array with the same shape and contents. + * The host data is copied element by element; if a device buffer exists, it is copied too. + */ + HD Array copy() const { + Array arr(mShape, (pDevice == nullptr)); + + auto arr_it = arr.begin(); + for (auto it = begin(); it != end(); ++it) { + *arr_it = *it; + ++arr_it; + } +#ifndef DEVICE + if (pDevice != nullptr) { + CudaTools::deviceCopy(pDevice, arr.dataDevice(), mShape.items() * sizeof(T)).wait(); + } +#endif + return arr; + }; + + /** + * Gets the iterator to the beginning of this Array. + */ + HD ArrayIterator begin() const { return ArrayIterator(POINTER, mShape); }; + + /** + * Gets the iterator to the end of this Array. + */ + HD ArrayIterator end() const { return ArrayIterator(POINTER + mEndOffset, mShape); }; + + /** + * Sets the values of the entire Array to a constant. This is restricted to numerical types. + */ + HD void setConstant(const T value) const { + static_assert(IS_NUM(T), "Function only available on numeric types."); + for (auto it = begin(); it != end(); ++it) { + *it = value; + } + }; + + /** + * Sets the Array values with uniform random values in a specified range. This is restricted to + * numerical types. + * \brief Host only + * \param min lower bound of the range + * \param max upper bound of the range, must not be smaller than min + */ + void setRandom(const T min, const T max) const { + static_assert(IS_NUM(T), "Function only available on numeric types."); + CT_ERROR_IF(max, <, min, "Upper bound of range cannot be smaller than lower bound"); + std::random_device rd; + std::mt19937 mt(rd()); + if constexpr (IS_INT(T)) { + std::uniform_int_distribution dist(min, max); + for (auto it = begin(); it != end(); ++it) { + *it = dist(mt); + } + } else if constexpr (IS_FLOAT(T)) { + std::uniform_real_distribution dist(min, max); + for (auto it = begin(); it != end(); ++it) { + *it = dist(mt); + } + } + }; + + /** + * Sets the Array values to start from a value and increment by a specified step. This is + * restricted to numerical types. 
+ */ + HD void setRange(T min, const T step = 1) const { + static_assert(IS_NUM(T), "Function only available on numeric types."); + for (auto it = begin(); it != end(); ++it) { + *it = min; + min += step; + } + } + /** + * Sets the Array values to be evenly spaced numbers over a given interval, with both + * endpoints included. A single-element Array is set to min. This is restricted + * to floating point types. + */ + HD void setLinspace(const T min, const T max) const { + static_assert(IS_FLOAT(T), "Function only available on numeric floating types."); + CT_ERROR_IF(max, <, min, "Upper bound of range cannot be smaller than lower bound"); + T i = 0; + T d = max - min; + T items = (T)(shape().items() - 1); + for (auto it = begin(); it != end(); ++it) { + // With a single element items is 0; use min directly instead of dividing 0 by 0. + *it = (items > 0) ? min + d * (i / items) : min; + i += 1; + } + }; + + /** + * Returns array of given shape with constant values. This is restricted to numerical types. + * \brief Host only + */ + static Array constant(const Shape& shape, const T value) { + static_assert(IS_NUM(T), "Function only available on numeric types."); + Array arr(shape); + arr.setConstant(value); + return arr; + }; + + /** + * Returns array of given shape with random values in given interval. This is restricted to + * numerical types. + * \brief Host only + */ + static Array random(const Shape& shape, const T min, const T max) { + static_assert(IS_NUM(T), "Function only available on numeric types."); + Array arr(shape); + arr.setRandom(min, max); + return arr; + }; + + /** + * Returns evenly spaced values within a given interval. The number of values is + * (max - min) / step truncated to an integer, starting at min. This is restricted to + * numerical types. + * \brief Host only + */ + static Array range(const T min, const T max, const T step = 1) { + static_assert(IS_NUM(T), "Function only available on numeric types."); + CT_ERROR_IF(max, <, min, "Upper bound of range cannot be smaller than lower bound"); + Array arr({(uint32_t)((max - min) / step)}); + arr.setRange(min, step); + return arr; + } + + /** + * Returns evenly spaced values within a given interval. This is restricted to floating point + * types. 
+ * \brief Host only + */ + static Array linspace(const T min, const T max, const uint32_t size) { + static_assert(IS_FLOAT(T), "Function only available on numeric floating types."); + Array arr({size}); + arr.setLinspace(min, max); + return arr; + } + + /** + * Returns a new Array holding the transpose of this one; this Array is left unchanged. + * Its self assigning version is transpose. This is restricted to numerical types. + * \brief Host only + */ + Array transposed() const { + static_assert(IS_NUM(T), "Function only available on numeric types."); + CT_ERROR_IF(shape().axes(), !=, 2, "Tranpose can only occur on two-dimensional arrays"); + // The result has swapped dimensions; a fixed-size map cannot take a differently shaped matrix. + Array new_arr({mShape.cols(), mShape.rows()}); + new_arr.eigenMap() = this->eigenMap().transpose().eval(); + return new_arr; + }; + + /** + * Transposes the internal data in place. Its non-assigning version is transposed. + * This is restricted to numerical types. + * \brief Host only + */ + void transpose() { + static_assert(IS_NUM(T), "Function only available on numeric types."); + CT_ERROR_IF(shape().axes(), !=, 2, "Tranpose can only occur on two-dimensional arrays"); + Array new_arr(*this, {mShape.cols(), mShape.rows()}); + new_arr.eigenMap() = this->eigenMap().transpose().eval(); + mShape = Shape({mShape.cols(), mShape.rows()}); + }; + + /** + * Computes the inverse of a square two-dimensional Array. + * NOTE(review): the result is written to a local Array that is discarded on return, so + * the caller never sees it. Confirm whether this should return it or work in place. + * \brief Host only + */ + void inverse() const { + static_assert(IS_FLOAT(T), "Function only available on floating numeric types."); + CT_ERROR_IF(shape().axes(), !=, 2, "Inverse can only occur on two-dimensional arrays"); + CT_ERROR_IF(shape().rows(), !=, shape().cols(), + "Inverse can only occur on square matrices"); + Array inv(shape()); + inv.eigenMap() = this->eigenMap().inverse(); + }; + + /** + * Pins the memory (page locks) for faster memory transfer in concurrent + * transfers. + * \brief Host only + */ + void pinMemory() const { CudaTools::pin(pHost, mShape.items() * sizeof(T)); }; + + /** + * Updates the host copy by copying the device data back to the host. 
+ * \brief Host only + */ + StreamID updateHost(const StreamID& stream = DEF_MEM_STREAM) const { + CT_ERROR(mIsView, "Cannot update host on a view"); + CudaTools::pull(pHost, pDevice, mShape.items() * sizeof(T), stream); + return stream; + }; + + /** + * Updates the device copy by copying the host data to the device. + * \brief Host only + */ + StreamID updateDevice(const StreamID& stream = DEF_MEM_STREAM) const { + CT_ERROR(mIsView, "Cannot update device on a view"); + CudaTools::push(pHost, pDevice, mShape.items() * sizeof(T), stream); + return stream; + }; +}; + +/** + * Recursively prints an Array. One-dimensional Arrays are printed as a bracketed row, + * two-dimensional Arrays one row per line, and higher dimensions as nested blocks. + * \param axis the current nesting depth, used for indentation + * \param width the field width per element, or 0 to separate elements with a space + */ +template +void printAxis(std::ostream& out, const Array& arr, const uint32_t axis, size_t width) { + std::string space = std::string(2 * axis, ' '); + if (arr.shape().axes() == 1) { + out << "["; + for (uint32_t i = 0; i < arr.shape().items(); ++i) { + if constexpr (std::is_floating_point::value) { + out << std::scientific << std::setprecision(6); + } + if (width == 0) { + out << ((i == 0) ? "" : " "); + } else { + out << std::setw((i == 0) ? width - 1 : width); + } + out << (T)arr[i] << ((i == arr.shape().items() - 1) ? "]" : ","); + } + } else if (arr.shape().axes() == 2) { + for (uint32_t i = 0; i < arr.shape().dim(0); ++i) { + out << space << ((i == 0) ? "[" : " "); + printAxis(out, arr[i], axis + 1, width); + out << ((i == arr.shape().dim(0) - 1) ? "]" : ",\n"); + } + } else { + out << space << "[\n"; + for (uint32_t i = 0; i < arr.shape().dim(0); ++i) { + printAxis(out, arr[i], axis + 1, width); + out << ((i == arr.shape().dim(0) - 1) ? "\n" : ",\n\n"); + } + out << space << "]"; + } +} + +/** + * Stream output for an Array. Computes a common field width from the contents and then + * prints with printAxis. + */ +template std::ostream& operator<<(std::ostream& out, const Array& arr) { + size_t width = 0; + // Integers only: testing IS_NUM here would also capture floating point types and make the + // floating point branch that follows unreachable. + if constexpr (IS_INT(T)) { + T max_val = 0; + bool negative = false; + for (auto it = arr.begin(); it != arr.end(); ++it) { + if (*it < 0) negative = true; + max_val = (abs(*it) > max_val) ? abs(*it) : max_val; + } + width = std::to_string(max_val).size() + 1; + width += (negative) ? 
1 : 0; + } else if constexpr (IS_FLOAT(T)) { + T max_val = 0; + bool negative = false; + for (auto it = arr.begin(); it != arr.end(); ++it) { + if (*it < 0) negative = true; + int exp = 0; + frexp(*it, &exp); + max_val = (exp > max_val) ? exp : max_val; + } + width = std::to_string(max_val).size() + 5; + width += (negative) ? 1 : 0; + } + + printAxis(out, arr, 0, (arr.shape().axes() == 1) ? 0 : width); + return out; +} + +}; // namespace CudaTools + +#endif // ARRAY_H diff --git a/BLAS.h b/BLAS.h new file mode 100644 index 0000000..b79f4e5 --- /dev/null +++ b/BLAS.h @@ -0,0 +1,600 @@ +#ifndef BLAS_H +#define BLAS_H + +#include "Array.h" +#include "Core.h" +#include "Macros.h" + +namespace CudaTools { + +namespace BLAS { + +/** Per-operand element strides between consecutive batch items, plus the number of batch items. */ +struct BatchInfo { + uint32_t strideA, strideB, strideC; + uint32_t size; +}; + +/** Shape validation helpers used by the BLAS routines. */ +template struct Check { + /** Errors unless arr has at least two axes. */ + static void isAtLeast2D(const Array& arr, const std::string& name = "Array") { + CT_ERROR_IF(arr.shape().axes(), <, 2, (name + " needs to be at least 2D").c_str()); + }; + + /** Errors unless arr is at least 2D and its rows and columns are equal. */ + static void isSquare(const Array& arr, const std::string& name = "Array") { + isAtLeast2D(arr, name); + CT_ERROR_IF(arr.shape().rows(), !=, arr.shape().cols(), (name + " is not square").c_str()) + }; + + /** + * Errors unless A, B and C are at least 2D, the columns of A match the rows of B, and C + * has the shape of the product AB. + */ + static void isValidMatmul(const Array& A, const Array& B, const Array& C, + const std::string& nameA = "A", const std::string& nameB = "B", + const std::string nameC = "C") { + isAtLeast2D(A, nameA); + isAtLeast2D(B, nameB); + isAtLeast2D(C, nameC); + CT_ERROR_IF(A.shape().cols(), !=, B.shape().rows(), + (nameA + nameB + " is not a valid matrix multiplication").c_str()); + + Shape ABshape({A.shape().rows(), B.shape().cols()}); + Shape Cshape({C.shape().rows(), C.shape().cols()}); + + CT_ERROR_IF( + ABshape, !=, Cshape, + ("The shape of " + nameA + nameB + " does not match the shape of " + nameC).c_str()); + }; + + /** + * Returns the product of all axis sizes except the last two. The loop bound is an + * unsigned subtraction, so callers must pass an Array with at least two axes. + */ + static uint32_t getUpperItems(const Array& arr) { + uint32_t upperItems = 1; + for (uint32_t iAxis = 0; iAxis < arr.shape().axes() - 2; ++iAxis) { + upperItems 
*= arr.shape().dim(iAxis); + } + return upperItems; + }; + + /** Errors unless A and B have the same number of axes and equal sizes on all but the last two axes. */ + static void matchUpperShape(const Array& A, const Array& B, + const std::string& nameA = "A", const std::string& nameB = "B") { + CT_ERROR_IF(A.shape().axes(), !=, B.shape().axes(), + (nameA + " and " + nameB + " shapes do not match for broadcasting").c_str()); + for (uint32_t iAxis = 0; iAxis < A.shape().axes() - 2; ++iAxis) { + uint32_t Adim = A.shape().dim(iAxis); + uint32_t Bdim = B.shape().dim(iAxis); + CT_ERROR_IF( + Adim, !=, Bdim, + (nameA + " and " + nameB + " shapes do not match for broadcasting").c_str()); + } + }; + + /** + * Validates a possibly batched product C = AB and returns the batch strides and size. + * Either A and B have the same number of batch items, or one of them is a single matrix + * that is reused for every batch item (stride 0). C must match the batched operand. + */ + static BatchInfo isBroadcastable(const Array& A, const Array& B, const Array& C, + const std::string& nameA = "A", const std::string& nameB = "B", + const std::string nameC = "C") { + isValidMatmul(A, B, C, nameA, nameB, nameC); + uint32_t itemsA = getUpperItems(A); + uint32_t itemsB = getUpperItems(B); + uint32_t itemsC = getUpperItems(C); + + uint32_t Asize = A.shape().rows() * A.shape().cols(); + uint32_t Bsize = B.shape().rows() * B.shape().cols(); + uint32_t Csize = C.shape().rows() * C.shape().cols(); + + if (itemsA == itemsB) { + CT_ERROR_IF(itemsA, !=, itemsC, + ("Incorrect dimensions to broadcast to output " + nameC).c_str()); + matchUpperShape(A, B, nameA, nameB); + matchUpperShape(A, C, nameA, nameC); + return BatchInfo{Asize, Bsize, Csize, itemsC}; + } else if (itemsA > itemsB) { + CT_ERROR_IF( + itemsB, !=, 1, + ("Cannot broadcast operation to " + nameB + " with non-matching " + nameA).c_str()); + CT_ERROR_IF(itemsA, !=, itemsC, + ("Incorrect dimensions to broadcast to output " + nameC).c_str()); + matchUpperShape(A, C, nameA, nameC); + return BatchInfo{Asize, 0, Csize, itemsC}; + } else { + CT_ERROR_IF( + itemsA, !=, 1, + ("Cannot broadcast operation to " + nameA + " with non-matching " + nameB).c_str()); + // A is the single broadcast matrix here, so the output must match the batch count of B. + CT_ERROR_IF(itemsB, !=, itemsC, + ("Incorrect dimensions to broadcast to output " + nameC).c_str()); + matchUpperShape(B, C, nameB, nameC); + return BatchInfo{0, 
Bsize, Csize, itemsC}; + } + }; +}; + +/** + * Represents a Batch of Arrays with the same shape. Mainly used for cuBLAS functions. + */ +template class Batch { + protected: + Array mBatch; + Shape mShape; + + uint32_t mCount = 0; + uint32_t mBatchSize; + + public: + Batch() = delete; + + /** + * Constructs an empty batch with room for the given number of matrices, to be filled + * with add(). + */ + Batch(const uint32_t size) : mBatch({size}), mBatchSize(size){}; + + /** + * Constructs a batch from a non-view Array. The last two axes are the matrix shape and + * all leading axes are flattened into the batch index. + */ + Batch(const Array& arr) { + CT_ERROR(arr.isView(), "Array cannot be a view"); + mShape = Shape({arr.shape().rows(), arr.shape().cols()}); + mBatchSize = mCount = Check::getUpperItems(arr); + + mBatch = Array({mBatchSize}); + + Array batch = arr.reshaped({mBatchSize, mShape.rows(), mShape.cols()}); + for (uint32_t i = 0; i < mBatchSize; ++i) { +#ifdef CUDA + mBatch[i] = batch[i].dataDevice(); +#else + mBatch[i] = batch[i].data(); +#endif + } + + mBatch.updateDevice().wait(); + }; + + /** + * Adds a matrix to the batch. Array must be a view. The first matrix added fixes the shape + * that all later matrices must match. Once the batch is full, the pointer array is pushed + * to the device. + */ + void add(const Array& arr) { + CT_ERROR(not arr.isView(), "Cannot add non-view Arrays"); + CT_ERROR_IF(mCount, ==, mBatchSize, "Batch is full, cannot add more arrays"); +#ifdef CUDA + mBatch[mCount] = arr.dataDevice(); +#else + mBatch[mCount] = arr.data(); +#endif + if (mCount == 0) { + // Only record the shape. The batch size was fixed at construction and the count is + // advanced below. + mShape = arr.shape(); + } else { + CT_ERROR_IF(arr.shape(), !=, mShape, "Cannot add matrix of different shape to batch"); + } + ++mCount; + + if (mCount == mBatchSize) { + mBatch.updateDevice().wait(); + } + }; + + /** + * Indexing operator which returns a view of the Array in the Batch at the given index. + */ + Array operator[](const uint32_t index) const { + CT_ERROR_IF(index, >=, mBatchSize, "Index exceeds batch size"); + return Array(mBatch[index], {mShape.rows(), mShape.cols()}); + }; + + /** + * Returns the batch Array of pointers. 
+ */ + Array batch() const { return mBatch.view(); }; + Shape shape() const { return mShape; } /**< Gets the shape of the matrices in the batch. */ + uint32_t size() const { return mBatchSize; } /**< Gets the batch size.*/ + bool full() const { return mBatchSize == mCount; }; /**< Gets if the batch is full. */ +}; + +//////////////// +// cuBLAS API // +//////////////// + +template +constexpr void invoke(F1 f1, F2 f2, Args&&... args) { + if constexpr (std::is_same::value) { + CUBLAS_CHECK(f1(args...)); + } else if constexpr (std::is_same::value) { + CUBLAS_CHECK(f2(args...)); + } else { + CT_ERROR(true, "BLAS functions are not callable with that type"); + } +} + +/** + * Computes the matrix-vector product: \f$ y = \alpha Ax + \beta y \f$. It will automatically + * broadcast the operation if applicable. + */ +template +StreamID GEMV(const T alpha, const Array& A, const Array& x, const T beta, const Array& y, + const StreamID& stream = DEF_CUBLAS_STREAM) { + + BatchInfo bi = Check::isBroadcastable(A, x, y, "A", "x", "y"); + CT_ERROR_IF(x.shape().cols(), !=, 1, "x must be a column vector"); + CT_ERROR_IF(y.shape().cols(), !=, 1, "x must be a column vector"); + + uint32_t rows = A.shape().rows(); + uint32_t cols = A.shape().cols(); + T a = alpha, b = beta; +#ifdef CUDA + CUBLAS_CHECK( + cublasSetStream(Manager::get()->cublasHandle(), Manager::get()->stream(stream.id))); + if (bi.size == 1) { + invoke(cublasSgemv, cublasDgemv, Manager::get()->cublasHandle(), CUBLAS_OP_N, rows, cols, + &a, A.dataDevice(), rows, x.dataDevice(), 1, &b, y.dataDevice(), 1); + + } else { // Greater than 2, so broadcast. 
+ invoke(cublasSgemvStridedBatched, cublasDgemvStridedBatched, + Manager::get()->cublasHandle(), CUBLAS_OP_N, rows, cols, &a, A.dataDevice(), rows, + bi.strideA, x.dataDevice(), 1, bi.strideB, &b, y.dataDevice(), 1, bi.strideC, + bi.size); + } + +#else + if (bi.size == 1) { + y.eigenMap() = a * (A.eigenMap() * x.eigenMap()) + b * y.eigenMap(); + } else { // More than one batch item, so broadcast. + // Each batch item is a view at i * stride; a stride of 0 reuses the same operand every time. +#pragma omp parallel for + for (uint32_t i = 0; i < bi.size; ++i) { + auto Ai = Array(A, {rows, cols}, i * bi.strideA).eigenMap(); + auto xi = Array(x, {cols, 1}, i * bi.strideB).eigenMap(); + auto yi = Array(y, {rows, 1}, i * bi.strideC).eigenMap(); + yi = a * (Ai * xi) + b * yi; + } + } +#endif + return StreamID{stream}; +} + +/** + * Computes the matrix-matrix product: \f$ C = \alpha AB + \beta C \f$. It will automatically + * broadcast the operation if applicable. + * \param alpha scalar multiplying AB + * \param A the left matrix, or batch of matrices + * \param B the right matrix, or batch of matrices + * \param beta scalar multiplying the existing contents of C + * \param C the output matrix, or batch of matrices + * \param stream the stream to run the operation on + */ +template +StreamID GEMM(const T alpha, const Array& A, const Array& B, const T beta, const Array& C, + const StreamID& stream = DEF_CUBLAS_STREAM) { + + BatchInfo bi = Check::isBroadcastable(A, B, C, "A", "B", "C"); + // A is m x k, B is k x n. + uint32_t m = A.shape().rows(); + uint32_t k = A.shape().cols(); + uint32_t n = B.shape().cols(); + + T a = alpha, b = beta; +#ifdef CUDA + CUBLAS_CHECK( + cublasSetStream(Manager::get()->cublasHandle(), Manager::get()->stream(stream.id))); + if (bi.size == 1) { + invoke(cublasSgemm, cublasDgemm, Manager::get()->cublasHandle(), CUBLAS_OP_N, + CUBLAS_OP_N, m, n, k, &a, A.dataDevice(), m, B.dataDevice(), k, &b, + C.dataDevice(), m); + + } else { // More than one batch item, so broadcast. 
+#pragma omp parallel for + for (uint32_t i = 0; i < bi.size; ++i) { + auto Ai = Array(A, {m, k}, i * bi.strideA).eigenMap(); + auto Bi = Array(B, {k, n}, i * bi.strideB).eigenMap(); + auto Ci = Array(C, {m, n}, i * bi.strideC).eigenMap(); + Ci = a * (Ai * Bi) + b * Ci; + } + } +#endif + return StreamID{stream}; +} + +/** + * Computes the diagonal matrix multiplication: \f$ C = A\mathrm{diag}(X) \f$, or \f$ C = + * \mathrm{diag}(X)A \f$ if left = true. X must be a column vector whose length matches the + * columns of A (or the rows of A if left = true), and C must have the same rows and columns as A. + * The result is written into C. + */ +template +StreamID DGMM(const Array& A, const Array& X, const Array& C, const bool left = false, + const StreamID& stream = DEF_CUBLAS_STREAM) { + CT_ERROR_IF(X.shape().cols(), !=, 1, "'x' must be a column vector."); + if (left) { + CT_ERROR_IF(A.shape().rows(), !=, X.shape().rows(), + "Rows of 'A' and length of 'x' need to match."); + } else { + CT_ERROR_IF(A.shape().cols(), !=, X.shape().rows(), + "Columns of 'A' and length of 'x' need to match."); + } + CT_ERROR_IF(A.shape().rows(), !=, C.shape().rows(), + "Rows of 'A' and rows() of 'C' need to match."); + CT_ERROR_IF(A.shape().cols(), !=, C.shape().cols(), + "Columns of 'A' and columns of 'C' need to match."); + +#ifdef CUDA + uint32_t m = C.shape().rows(); + uint32_t n = C.shape().cols(); + auto mode = (left) ? CUBLAS_SIDE_LEFT : CUBLAS_SIDE_RIGHT; // Side on which diag(X) is applied. + CUBLAS_CHECK( + cublasSetStream(Manager::get()->cublasHandle(), Manager::get()->stream(stream.id))); + // cublas<t>dgmm takes the side mode directly after the handle. + invoke(cublasSdgmm, cublasDdgmm, Manager::get()->cublasHandle(), mode, m, n, A.dataDevice(), + A.shape().rows(), X.dataDevice(), 1, C.dataDevice(), m); +#else + if (left) { + C.eigenMap() = X.eigenMap().asDiagonal() * A.eigenMap(); + } else { + C.eigenMap() = A.eigenMap() * X.eigenMap().asDiagonal(); + } +#endif + return StreamID{stream}; +} + +////////////////////////////// +// PLUArray Related Objects // +////////////////////////////// + +/////////////////////////// +// PartialPivLU Wrapper // +/////////////////////////// + +// This class is just a workaround to use Eigen's internals directly.
+template class PartialPivLU; +namespace internal { +template static Array empty({1, 1}); +template static EigenMapMat empty_map = empty.eigenMap(); +}; // namespace internal + +template class PLUArray; +// This is a wrapper class for Eigen's class so we have more controlled access to +// the underlying data. +template class PartialPivLU : public Eigen::PartialPivLU>> { + private: + using Base = Eigen::PartialPivLU>>; + template friend class PLUArray; + + EigenMapMat mMapLU; + EigenMapMat mMapPivots; + + public: + PartialPivLU() + : Base(internal::empty_map), mMapLU(internal::empty_map), + mMapPivots(internal::empty_map){}; + + /** + * Re-points the internal maps (and the base class members m_lu and m_p) at the storage of + * lu and pivots using placement new, then flags the base object as initialized. + */ + void make(const Array& lu, const Array& pivots) { + + new (&mMapLU) EigenMapMat(lu.eigenMap()); + new (&mMapPivots) EigenMapMat(pivots.atLeast2D().eigenMap()); + + new (&this->m_lu) decltype(Base::m_lu)(mMapLU.derived()); + new (&this->m_p) decltype(Base::m_p)(mMapPivots.derived()); + + // new (&this->m_rowsTranspositions) decltype(Base::m_rowsTranspositions)( + // mMapPivots.derived()); + + this->m_l1_norm = 0; + this->m_det_p = 0; + this->m_isInitialized = true; + }; +}; + +namespace internal { +// We only create one and copy-construct to avoid the re-initialization. +template static PartialPivLU BlankPPLU = PartialPivLU(); +}; // namespace internal + +/** + * Class for storing the PLU decomposition of an Array. This is restricted to floating point types. + */ +template class PLUArray { + private: + Array mLU; + Array mPivots; + PartialPivLU mPPLU = internal::BlankPPLU; + + public: + PLUArray() = delete; + + /** + * Constructor for a PLUArray given the matrix dimension. + */ + PLUArray(const uint32_t n) : mLU({n, n}), mPivots({n}) { mPPLU.make(mLU, mPivots); }; + + /** + * Constructor for a PLUArray given an existing array. The array must be a square 2D matrix. + */ + PLUArray(const Array& arr) + : mLU((arr.isView()) ?
arr.view() : arr), mPivots({arr.shape().rows()}) { + CT_ERROR_IF(mLU.shape().axes(), !=, 2, "Array must be a 2D matrix"); + CT_ERROR_IF(mLU.shape().rows(), !=, mLU.shape().cols(), "Matrix must be square"); + mPPLU.make(mLU, mPivots); + }; + + /** + * Constructor for a PLUArray given an existing location in memory for both the matrix and + * the pivots. Views of both arrays are stored; the matrix must be a square 2D matrix. + */ + PLUArray(const Array& arr, const Array pivots) + : mLU(arr.view()), mPivots(pivots.view()) { + CT_ERROR_IF(mLU.shape().axes(), !=, 2, "Array must be a 2D matrix"); + CT_ERROR_IF(mLU.shape().rows(), !=, mLU.shape().cols(), "Matrix must be square"); + mPPLU.make(mLU, mPivots); + }; + + uint32_t rank() { return mLU.shape().rows(); }; /**< Gets the dimension (number of rows) of the LU matrix. */ + Array LU() const { return mLU.view(); }; /**< Gets a view of the LU matrix. */ + Array pivots() const { return mPivots.view(); }; /**< Gets a view of the pivots. */ + + /** + * Computes the in-place LU factorization for this array on CPU, then stores the indices of + * the resulting permutation in the pivots. + */ + void computeLU() { + mPPLU.compute(); + mPPLU.mMapPivots = mPPLU.permutationP().indices(); + }; + + /** + * Solves the system \f$ LUx = b \f$ and returns \f$x\f$ as a new array with the shape of b. + */ + Array solve(const Array& b) { + Array x(b.shape()); + x.eigenMap() = mPPLU.solve(b.eigenMap()); + return x; + }; +}; + +/** + * This is a batch version of PLUArray, to enable usage of the cuBLAS API. This is restricted to + * floating point types. + */ +template ::value, bool> = true> +class PLUBatch : public Batch { + private: + Array mPivotsBatch; + Array mInfoLU; + int32_t mInfoSolve; + + bool mInitialized = false; + + public: + /** + * Constructor of a PLUBatch from a given batch size. + */ + PLUBatch(const uint32_t size) : Batch(size), mInfoLU({size}){}; + + /** + * Constructor of a PLUBatch from a multi-dimensional array, batched across upper dimensions.
+ */ + PLUBatch(const Array& arr) : Batch(arr) { + Check::isSquare(arr, "LU Array"); + + mPivotsBatch = Array({this->mBatchSize * this->mShape.rows()}); + mInfoLU = Array({this->mBatchSize}); + }; + + /** + * Indexing operator which returns the PLUArray in the PLUBatch at the given index. The + * PLUArray is built on the batch and pivot storage belonging to that index. An index at or + * beyond the batch size is an error. + */ + PLUArray operator[](const uint32_t index) const { + CT_ERROR_IF(index, >=, this->mBatchSize, "Index exceeds batch size"); + Array lu(this->mBatch[index], {this->mShape.rows(), this->mShape.cols()}); + Array pivots(mPivotsBatch.data() + index * this->mShape.rows(), + {this->mShape.rows()}); + return PLUArray(lu, pivots); + }; + + /** + * Computes the in-place PLU decomposition of each matrix in the batch. With CUDA defined this + * uses the batched cuBLAS getrf routine on the given stream; otherwise it calls computeLU() + * on every PLUArray of the batch inside an OpenMP loop. + */ + StreamID computeLU(const StreamID& stream = DEF_CUBLAS_STREAM) { +#ifdef CUDA + uint32_t n = this->mShape.rows(); + CUBLAS_CHECK( + cublasSetStream(Manager::get()->cublasHandle(), Manager::get()->stream(stream.id))); + invoke(cublasSgetrfBatched, cublasDgetrfBatched, Manager::get()->cublasHandle(), n, + this->mBatch.dataDevice(), n, mPivotsBatch.dataDevice(), mInfoLU.dataDevice(), + this->mBatchSize); + +#else +#pragma omp parallel for + for (uint32_t i = 0; i < this->mBatchSize; ++i) { + (*this)[i].computeLU(); + } +#endif + mInitialized = true; + return stream; + }; + + /** + * Solves the batched system \f$LUx = b\f$ in place. The solution \f$x\f$ is written back into + * \f$b\f$.
+ */ + StreamID solve(const Batch& b, const StreamID& stream = DEF_CUBLAS_STREAM) { + CT_ERROR(not mInitialized, + "Cannot solve system if PLUBatch has not yet computed its LU decomposition"); + CT_ERROR_IF(b.size(), !=, this->mBatchSize, + "Upper dimensions of b do not match batch size"); + CT_ERROR_IF(b.shape().rows(), !=, this->mShape.rows(), + "The length of each column of b must match the matrix rank"); + +#ifdef CUDA + uint32_t n = b.shape().rows(); + uint32_t nrhs = b.shape().cols(); + CUBLAS_CHECK( + cublasSetStream(Manager::get()->cublasHandle(), Manager::get()->stream(stream.id))); + invoke(cublasSgetrsBatched, cublasDgetrsBatched, Manager::get()->cublasHandle(), + CUBLAS_OP_N, n, nrhs, this->mBatch.dataDevice(), n, mPivotsBatch.dataDevice(), + b.batch().dataDevice(), n, &mInfoSolve, this->mBatchSize); + +#else +#pragma omp parallel for + for (uint32_t i = 0; i < this->mBatchSize; ++i) { + b[i] = (*this)[i].solve(b[i]); + } +#endif + return stream; + }; + + /** + * Gets the pivots data from the device to the host. Does nothing for CPU. + */ + StreamID getPivots(const StreamID& stream = DEF_MEM_STREAM) const { + mPivotsBatch.updateHost(stream); + return stream; + }; + + /** + * Gets the info array for the LU decomposition from the device to the host, waiting for the + * transfer to finish before returning it. Does not return useful information for CPU. + */ + Array getLUInfo() const { + mInfoLU.updateHost().wait(); + return mInfoLU; + }; + + /** + * Checks validity of the solve operation: nonzero when the info value recorded by solve() is + * 0. Does not return useful information for CPU. + */ + int32_t validSolve() const { return mInfoSolve == 0; } +}; + +// /** +// * Gets the inverse of each A[i], using an already PLU factorized A[i]. +// * Only available if compiling with CUDA.
+// */ +// template +// void inverseBatch(const Array& batchA, const Array& batchC, const Array& +// pivots, +// const Array& info, const Shape shapeA, const Shape shapeC, +// const uint stream = 0) { +// #ifdef CUDA +// CT_ERROR_IF(shapeA.rows(), !=, shapeA.cols(), +// "'A' needs to be square, rows() and column need to match."); +// CT_ERROR_IF(shapeA.rows(), !=, shapeC.cols(), "'A' needs to be the same shape as +// 'C'."); CT_ERROR_IF(shapeA.rows(), !=, shapeC.rows(), "'A' needs to be the same shape +// as 'C'."); + +// CT_ERROR_IF(shapeA.rows(), !=, pivots.shape().rows(), +// "Rows()/columns of 'A' and rows() of pivots need to match."); +// CT_ERROR_IF(batchA.shape().rows(), !=, pivots.shape().cols(), +// "Batch size and columns of pivots need to match."); +// CT_ERROR_IF(info.shape().cols(), !=, 1, "Info needs to be a column vector.") +// CT_ERROR_IF(batchA.shape().rows(), !=, info.shape().rows(), +// "Batch size and length of info need to match."); +// CT_ERROR_IF(batchA.shape().rows(), !=, batchC.shape().rows(), +// "Batches 'A[i]' and 'C[i]' need to match."); + +// std::string s = "cublas" + std::to_string(stream); +// CUBLAS_CHECK( +// cublasSetStream(Manager::get()->cublasHandle(), +// Manager::get()->stream(s))); +// invoke(cublasSgetriBatched, cublasDgetriBatched, +// Manager::get()->cublasHandle(), +// shapeA.rows(), batchA.dataDevice(), shapeA.rows(), pivots.dataDevice(), +// batchC.dataDevice(), shapeC.rows(), info.dataDevice(), +// batchA.shape().rows()); +// #else +// CT_ERROR_IF(true, ==, true, "inverseBatch is not callable without CUDA."); +// #endif +// } + +}; // namespace BLAS +}; // namespace CudaTools + +#endif diff --git a/Core.h b/Core.h new file mode 100644 index 0000000..2f878d8 --- /dev/null +++ b/Core.h @@ -0,0 +1,544 @@ +#ifndef CUDATOOLS_H +#define CUDATOOLS_H + +#include "Macros.h" +#include +#include +#include +#include + +namespace CudaTools { + +/** + * Simple wrapper for the name of a stream. 
Its purpose is to allow + * 'streams' to be passed in host code, and to allow simple syntax + * for waiting. + */ +struct StreamID { + public: + std::string id; + StreamID() : id(""){}; + /** + * The constructor for a StreamID. + */ + StreamID(const std::string& id_) : id(id_){}; + StreamID(const char* id_) : id(id_){}; + + /** + * Waits for the stream with this stream ID. + */ + void wait() const; +}; + +static const StreamID DEF_MEM_STREAM = StreamID{"defaultMemory"}; +static const StreamID DEF_CUBLAS_STREAM = StreamID{"defaultCublas"}; +static const StreamID DEF_KERNEL_STREAM = StreamID{"defaultKernel"}; + +/** + * Allocates memory on the device. + */ +void* malloc(const size_t size); + +/** + * Pins memory on the host. + */ +void pin(void* const pHost, const size_t size); + +/** + * Pushes memory from the host to the device. + */ +StreamID push(void* const pHost, void* const pDevice, const size_t size, + const StreamID& stream = DEF_MEM_STREAM); +/** + * Pulls memory from the device back to the host. + */ +StreamID pull(void* const pHost, void* const pDevice, const size_t size, + const StreamID& stream = DEF_MEM_STREAM); +/** + * Copies memory on the device to another location on the device. + */ +StreamID deviceCopy(void* const pSrc, void* const pDest, const size_t size, + const StreamID& stream = DEF_MEM_STREAM); + +/** + * Frees memory on the device. + */ +void free(void* const pDevice); + +#ifdef CUDACC +cudaDeviceProp getDeviceProp(); +static cudaDeviceProp DeviceProperties = getDeviceProp(); +const char* cublasGetErrorString(cublasStatus_t status); +#endif + +/** + * A class that manages various CUDA Runtime components, such as + * streams, events, and handles. + */ +class Manager { + private: + static Manager mManagerInstance; + Manager(const std::vector& names); + ~Manager(); +#ifdef CUDACC + std::unordered_map mStreams; + cublasHandle_t mCublas; +#endif + public: + /** + * Used to get the global CudaTools::Manager instance.
+ */ + static Manager* get() { return &mManagerInstance; }; + + void waitFor(const StreamID& stream) const; /**< Waits for the stream provided. */ + void sync() const; /**< Waits until all device code has finished. */ + void addStream(const std::string& name); /**< Creates a stream with the given name. */ +#ifdef CUDACC + cudaStream_t stream(const StreamID& stream) const; /**< Gets the stream registered under the given ID. */ + cublasHandle_t cublasHandle() const; /**< Gets the cuBLAS handle. */ +#endif +}; + +namespace Kernel { + +/** + * A struct that contains the kernel launch parameters. + */ +struct Settings { + public: +#ifdef CUDACC + dim3 blockGrid; + dim3 threadBlock; + size_t sharedMemoryBytes = 0; +#else + size_t threads; +#endif + StreamID stream; + + Settings() = default; + + void setGridDim(const size_t x); /**< Sets the Grid dimensions. */ + void setGridDim(const size_t x, const size_t y); /**< Sets the Grid dimensions. */ + void setGridDim(const size_t x, const size_t y, + const size_t z); /**< Sets the Grid dimensions. */ + void setBlockDim(const size_t x); /**< Sets the Thread Block dimensions. */ + void setBlockDim(const size_t x, const size_t y); /**< Sets the Thread Block dimensions. */ + void setBlockDim(const size_t x, const size_t y, + const size_t z); /**< Sets the Thread Block dimensions. */ + + void setSharedMemSize(const size_t bytes); /**< Sets the static shared memory size. */ + void setStream(const StreamID& stream); /**< Sets the stream. */ +}; + +/** + * Returns kernel launch parameters based on the number of threads, and optionally + * a stream. Should only be used for 'embarrassingly parallel' situations, or where + * each thread corresponds to some sort of index. + */ +Settings basic(const size_t threads, const StreamID& stream = DEF_KERNEL_STREAM); + +}; // namespace Kernel + +template class Array; + +/** + * A class that holds information about an Array.
+ */ +class Shape { + private: + template friend class Array; + uint32_t mAxes; + uint32_t mItems; + uint32_t mAxisDim[CUDATOOLS_ARRAY_MAX_AXES] = {0}; + uint32_t mStride[CUDATOOLS_ARRAY_MAX_AXES] = {0}; + + public: + HD Shape() : mAxes(0), mItems(1){}; + /** + * The constructor for a Shape. + * \param dims an initializer list of the dimensions. + */ + HD Shape(const std::initializer_list dims); + + HD uint32_t axes() const; /**< Gets the number of axes. */ + HD uint32_t items() const; /**< Gets the total number of items. */ + + HD uint32_t length() const; /**< For 1D shapes, gets the length. In general, gets the dimension + of the last axis. */ + HD uint32_t rows() const; /**< For 2D shapes, gets the number of rows. In general, gets the + dimension of the second to last axis. */ + HD uint32_t cols() const; /**< For 2D shapes, gets the number of columns. In general, gets the + dimension of the last axis. */ + + HD uint32_t + dim(const uint32_t axis) const; /**< Gets the dimension size of the specified axis. */ + HD uint32_t stride(const uint32_t axis) const; /**< Gets the stride of the specified axis. */ + + /** + * Gets the shape at a specific axis of this shape. + * \param axis the axis of where the new shape starts. + */ + HD Shape subshape(const uint32_t axis) const; + + HD bool operator==(const Shape& s) const; /**< Equals operator. Compares the number of axes and each axis dimension. */ + HD bool operator!=(const Shape& s) const; /**< Not equals operator. */ +}; + +std::ostream& operator<<(std::ostream& out, const Shape& s); + +}; // namespace CudaTools + +#ifdef CUDATOOLS_IMPLEMENTATION + +namespace CudaTools { + +template +StreamID runKernel(T func, const Kernel::Settings& sett, Args...
args) { +#ifdef CUDA + func<<stream(sett.stream.id)>>>(args...); +#else + func(args...); +#endif + return sett.stream; +} + +//////////////////// +// Memory Methods // +//////////////////// + +void StreamID::wait() const { Manager::get()->waitFor(id); } + +void* malloc(const size_t size) { +#ifdef CUDACC + void* pDevice; + CUDA_CHECK(cudaMalloc(&pDevice, size)); + return pDevice; +#else + return nullptr; +#endif +} + +void free(void* const pDevice) { +#ifdef CUDACC + if (pDevice != nullptr) CUDA_CHECK(cudaFree(pDevice)); +#endif +} + +// Asynchronous host-to-device copy on the given stream. Does nothing without CUDACC. +StreamID push(void* const pHost, void* const pDevice, const size_t size, const StreamID& stream) { +#ifdef CUDACC + CUDA_CHECK(cudaMemcpyAsync(pDevice, pHost, size, cudaMemcpyHostToDevice, + Manager::get()->stream(stream.id))); +#endif + return stream; +} + +// Asynchronous device-to-host copy on the given stream. Does nothing without CUDACC. +StreamID pull(void* const pHost, void* const pDevice, const size_t size, const StreamID& stream) { +#ifdef CUDACC + CUDA_CHECK(cudaMemcpyAsync(pHost, pDevice, size, cudaMemcpyDeviceToHost, + Manager::get()->stream(stream.id))); +#endif + return stream; +} + +// Asynchronous device-to-device copy on the given stream. Does nothing without CUDACC. +StreamID deviceCopy(void* const pSrc, void* const pDest, const size_t size, + const StreamID& stream) { +#ifdef CUDACC + CUDA_CHECK(cudaMemcpyAsync(pDest, pSrc, size, cudaMemcpyDeviceToDevice, + Manager::get()->stream(stream.id))); +#endif + return stream; +} + +void pin(void* const pHost, const size_t size) { +#ifdef CUDACC + CUDA_CHECK(cudaHostRegister(pHost, size, cudaHostRegisterDefault)); +#endif +} + +#ifdef CUDACC +cudaDeviceProp getDeviceProp() { + cudaSetDevice(0); + cudaDeviceProp deviceProp{}; // Value-initialized so a failed query does not return garbage. + cudaGetDeviceProperties(&deviceProp, 0); + return deviceProp; +} +#endif + +///////////////////// +// Manager Methods // +///////////////////// + +Manager::Manager(const std::vector& names) { +#ifdef CUDACC + for (const auto& name : names) { + addStream(name); + } + CUBLAS_CHECK(cublasCreate(&mCublas)); +#endif +} + +Manager::~Manager() { +#ifdef CUDACC + for (auto& it : mStreams) { + // Destructors must not throw, so report a failure instead of using CUDA_CHECK. + if (cudaStreamDestroy(it.second) != cudaSuccess) { + printf("[CUDA] Failed to destroy stream '%s'.\n", it.first.c_str()); + } + } +
CUBLAS_CHECK(cublasDestroy(mCublas)); +#endif +} + +void Manager::waitFor(const StreamID& stream) const { +#ifdef CUDACC + auto it = mStreams.find(stream.id); + if (it != mStreams.end()) { + CUDA_CHECK(cudaStreamSynchronize(it->second)); + } else { + CT_ERROR(true, ("Invalid stream " + stream.id).c_str()); + } +#endif +} + +void Manager::sync() const { +#ifdef CUDACC + CUDA_CHECK(cudaDeviceSynchronize()); +#endif +} + +void Manager::addStream(const std::string& name) { +#ifdef CUDACC + cudaStream_t s; + CUDA_CHECK(cudaStreamCreate(&s)); + mStreams[name] = s; +#endif +} + +#ifdef CUDACC +cudaStream_t Manager::stream(const StreamID& stream) const { + auto it = mStreams.find(stream.id); + if (it != mStreams.end()) { + return it->second; + } else { + CT_ERROR(true, ("Invalid stream " + stream.id).c_str()); + } +} + +cublasHandle_t Manager::cublasHandle() const { return mCublas; }; + +Manager Manager::mManagerInstance = Manager({"defaultMemory", "defaultCublas", "defaultKernel"}); +#else +Manager Manager::mManagerInstance = Manager({""}); +#endif + +//////////////////// +// Kernel Methods // +//////////////////// + +namespace Kernel { + +void Settings::setGridDim(const size_t x) { +#ifdef CUDACC + CT_ERROR_IF(x, >, DeviceProperties.maxGridSize[0], "Total grid size too large") + blockGrid.x = x; + blockGrid.y = 1; + blockGrid.z = 1; +#endif +} + +void Settings::setGridDim(const size_t x, const size_t y) { +#ifdef CUDACC + CT_ERROR_IF(x * y, >, DeviceProperties.maxGridSize[0], "Total grid size too large."); + CT_ERROR_IF(x, >, DeviceProperties.maxGridSize[0], "Grid dimension 'x' too large."); + CT_ERROR_IF(y, >, DeviceProperties.maxGridSize[1], "Grid dimension 'y' too large."); + blockGrid.x = x; + blockGrid.y = y; + blockGrid.z = 1; +#endif +} + +void Settings::setGridDim(const size_t x, const size_t y, const size_t z) { +#ifdef CUDACC + CT_ERROR_IF(x * y * z, >, DeviceProperties.maxGridSize[0], "Total grid size too large."); + CT_ERROR_IF(x, >, 
DeviceProperties.maxGridSize[0], "Grid dimension 'x' too large."); + CT_ERROR_IF(y, >, DeviceProperties.maxGridSize[1], "Grid dimension 'y' too large."); + CT_ERROR_IF(z, >, DeviceProperties.maxGridSize[2], "Grid dimension 'z' too large."); + blockGrid.x = x; + blockGrid.y = y; + blockGrid.z = z; +#endif +} + +void Settings::setBlockDim(const size_t x) { +#ifdef CUDACC + CT_ERROR_IF(x, >, DeviceProperties.maxThreadsDim[0], "Total block size too large."); + threadBlock.x = x; + threadBlock.y = 1; + threadBlock.z = 1; +#endif +} + +void Settings::setBlockDim(const size_t x, const size_t y) { +#ifdef CUDACC + // The total thread count is bounded by maxThreadsPerBlock; maxThreadsDim holds the + // per-dimension limits checked below. + CT_ERROR_IF(x * y, >, DeviceProperties.maxThreadsPerBlock, "Total block size too large."); + CT_ERROR_IF(x, >, DeviceProperties.maxThreadsDim[0], "Block dimension 'x' too large."); + CT_ERROR_IF(y, >, DeviceProperties.maxThreadsDim[1], "Block dimension 'y' too large."); + threadBlock.x = x; + threadBlock.y = y; + threadBlock.z = 1; +#endif +} + +void Settings::setBlockDim(const size_t x, const size_t y, const size_t z) { +#ifdef CUDACC + // The total thread count is bounded by maxThreadsPerBlock; maxThreadsDim holds the + // per-dimension limits checked below. + CT_ERROR_IF(x * y * z, >, DeviceProperties.maxThreadsPerBlock, "Total block size too large."); + CT_ERROR_IF(x, >, DeviceProperties.maxThreadsDim[0], "Block dimension 'x' too large."); + CT_ERROR_IF(y, >, DeviceProperties.maxThreadsDim[1], "Block dimension 'y' too large."); + CT_ERROR_IF(z, >, DeviceProperties.maxThreadsDim[2], "Block dimension 'z' too large."); + threadBlock.x = x; + threadBlock.y = y; + threadBlock.z = z; +#endif +} + +void Settings::setSharedMemSize(const size_t bytes) { +#ifdef CUDACC + sharedMemoryBytes = bytes; +#endif +} + +void Settings::setStream(const StreamID& stream_) { +#ifdef CUDACC + stream.id = stream_.id; +#endif +} + +Settings basic(const size_t threads, const StreamID& stream) { + Settings sett; +#ifdef CUDACC + auto max_threads = DeviceProperties.maxThreadsPerBlock; + size_t grid_blocks = (threads + max_threads - 1) / max_threads; // ceil(threads / max_threads) + size_t block_threads = (threads +
grid_blocks - 1) / grid_blocks; // ceil(threads / grid_blocks) + sett.setGridDim(grid_blocks); + sett.setBlockDim(block_threads); + sett.setStream(stream); +#else + sett.threads = threads; +#endif + return sett; +} +} // namespace Kernel + +///////////////////// +// Shape Functions // +///////////////////// + +HD Shape::Shape(const std::initializer_list dims) : mAxes(dims.size()), mItems(1) { + CT_ERROR_IF(dims.size(), >, CUDATOOLS_ARRAY_MAX_AXES, "Number of axes exceeds max axes"); + mAxes = dims.size(); + if (mAxes == 0) return; + + auto it = dims.end() - 1; + mItems = 1; + // Walks the axes from last to first. iAxis is unsigned, so decrementing past 0 wraps around + // and fails the iAxis < mAxes test, which ends the loop. + for (uint32_t iAxis = mAxes - 1; iAxis < mAxes; --iAxis) { + uint32_t dim = *it; + CT_ERROR_IF(dim, ==, 0, "Axis dimension cannot be 0"); + + mAxisDim[iAxis] = dim; + mStride[iAxis] = mItems; + mItems *= dim; + --it; + } + + if (mAxes == 1) return; + // Swap last two, for column major storage. + mStride[mAxes - 2] = 1; + mStride[mAxes - 1] = mAxisDim[mAxes - 2]; +} + +HD uint32_t Shape::axes() const { return mAxes; }; +HD uint32_t Shape::items() const { return mItems; }; +// NOTE(review): length() and cols() index mAxes - 1, which wraps around when mAxes is 0; confirm +// callers never use them on an empty shape. +HD uint32_t Shape::length() const { return mAxisDim[mAxes - 1]; } + +// NOTE(review): mAxes - 2 wraps around for shapes with fewer than two axes; confirm callers only +// use rows() on shapes with at least two axes. +HD uint32_t Shape::rows() const { return mAxisDim[mAxes - 2]; } + +HD uint32_t Shape::cols() const { return mAxisDim[mAxes - 1]; } + +HD uint32_t Shape::dim(const uint32_t axis) const { return mAxisDim[axis]; } +HD uint32_t Shape::stride(const uint32_t axis) const { return mStride[axis]; } + +// Two shapes are equal when they have the same number of axes and the same dimension on every +// axis; strides are not compared. +HD bool Shape::operator==(const Shape& s) const { + if (mAxes != s.mAxes) { + return false; + } + for (uint32_t iAxis = 0; iAxis < mAxes; ++iAxis) { + if (mAxisDim[iAxis] != s.mAxisDim[iAxis]) { + return false; + } + } + return true; +} + +HD bool Shape::operator!=(const Shape& s) const { return not(*this == s); } + +HD Shape Shape::subshape(const uint32_t axis) const { + CT_ERROR_IF(axis, >, mAxes, "Axis number exceeds number of axes."); + if (axis == mAxes) return Shape({1}); + + Shape new_shape({}); + new_shape.mAxes = mAxes - axis; + new_shape.mItems = mItems; + +
for (uint32_t iAxis = 0; iAxis < axis; iAxis++) { + new_shape.mItems /= mAxisDim[iAxis]; + } + for (uint32_t iAxis = axis; iAxis < mAxes; iAxis++) { + new_shape.mAxisDim[iAxis - axis] = mAxisDim[iAxis]; + new_shape.mStride[iAxis - axis] = mStride[iAxis]; + } + return new_shape; +} + +std::ostream& operator<<(std::ostream& out, const Shape& s) { + out << "("; + if (s.axes() == 0) return out << ")"; + for (uint32_t iAxis = 0; iAxis < s.axes() - 1; ++iAxis) { + out << s.dim(iAxis) << ", "; + } + return out << s.dim(s.axes() - 1) << ")"; +} + +#ifdef CUDACC +const char* cublasGetErrorString(cublasStatus_t error) { + switch (error) { + case CUBLAS_STATUS_SUCCESS: + return "CUBLAS_STATUS_SUCCESS"; + + case CUBLAS_STATUS_NOT_INITIALIZED: + return "CUBLAS_STATUS_NOT_INITIALIZED"; + + case CUBLAS_STATUS_ALLOC_FAILED: + return "CUBLAS_STATUS_ALLOC_FAILED"; + + case CUBLAS_STATUS_INVALID_VALUE: + return "CUBLAS_STATUS_INVALID_VALUE"; + + case CUBLAS_STATUS_ARCH_MISMATCH: + return "CUBLAS_STATUS_ARCH_MISMATCH"; + + case CUBLAS_STATUS_MAPPING_ERROR: + return "CUBLAS_STATUS_MAPPING_ERROR"; + + case CUBLAS_STATUS_EXECUTION_FAILED: + return "CUBLAS_STATUS_EXECUTION_FAILED"; + + case CUBLAS_STATUS_INTERNAL_ERROR: + return "CUBLAS_STATUS_INTERNAL_ERROR"; + } + + return ""; +} +#endif + +}; // namespace CudaTools +#endif // CUDATOOLS_IMPLEMENTATION + +#endif // CUDATOOLS_H diff --git a/Macros.h b/Macros.h new file mode 100644 index 0000000..4ffce1a --- /dev/null +++ b/Macros.h @@ -0,0 +1,297 @@ +#ifndef MACROS_H +#define MACROS_H + +#include +#include +#include + +#if defined(CUDA) && defined(__CUDACC__) +#define CUDACC +#endif + +#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ > 0) +#define DEVICE +#endif + +#ifdef CUDATOOLS_DOXYGEN +/** + * \def CUDACC + * This macro is defined when this code is being compiled by nvcc and the CUDA compilation + * flag is set. This should be used to enclose code where CUDA specific libraries and syntax are + * being used. 
+ */ +#define CUDACC + +/** + * \def DEVICE + * This macro is defined when this code is being compiled for the device. The difference between + * this and CUDACC is that this should exclusively be used to decide if code is being compiled + * to execute on the device. CUDACC only indicates which compiler is being used. + */ +#define DEVICE + +/** + * \def HD + * Prefix a function with this if it needs to be callable on both the + * CPU and CUDA device. + */ +#define HD + +/** + * \def SHARED + * Mark a variable as static shared memory. + */ +#define SHARED + +/** + * \def DECLARE_KERNEL(call, ...) + * Used to declare (in header) a CUDA kernel. + * \param call the name of the kernel + * \param ... the arguments of the kernel + */ +#define DECLARE_KERNEL(call, ...) + +/** + * \def DEFINE_KERNEL(call, ...) + * Used to define (in implementation) a CUDA kernel. + * \param call the name of the kernel + * \param ... the arguments of the kernel + */ +#define DEFINE_KERNEL(call, ...) + +/** + * \def KERNEL(call, settings, ...) + * Used to call a CUDA kernel. + * \param call the name of the kernel + * \param settings the associated CudaTools::Kernel::Settings to initialize the kernel with + * \param ... the arguments of the kernel + */ +#define KERNEL(call, settings, ...) + +/** + * \def BASIC_LOOP(N) + * Can be used in conjunction with CudaTools::Kernel::basic, which is mainly used for embarrassingly + * parallel situations. Exposes the loop/thread number as iThread. + * \param N number of iterations + */ +#define BASIC_LOOP(N) + +/** + * \def DEVICE_CLASS(name) + * Can be used inside a class declaration (header) to generate boilerplate code that allows this + * class to be used on the device. + * + * This macro creates a few functions:\n + * name* that(): returns the pointer to this instance on the device. + * + * void allocateDevice(): allocates the memory on the device for this class instance.
+ * + * CudaTools::StreamID updateHost(const CudaTools::StreamID& stream): updates the host instance + * of the class. + * + * CudaTools::StreamID updateDevice(const CudaTools::StreamID& stream): updates + * the device instance of the class. + * \param name the name of the class + */ +#define DEVICE_CLASS(name) + +/** + * \def CT_ERROR_IF(a, op, b, msg) + * Used for throwing runtime errors given a condition with an operator. The error is raised when + * the expression `a op b` is true. + */ +#define CT_ERROR_IF(a, op, b, msg) + +/** + * \def CT_ERROR(a, msg) + * Used for throwing runtime errors given a bool. The error is raised when `a` is true. + */ +#define CT_ERROR(a, msg) + +/** + * \def CUDA_CHECK(call) + * Gets the error generated by a CUDA function call if there is one. + * \param call CUDA function to check if there are errors when running. + */ +#define CUDA_CHECK(call) + +/** + * \def CUBLAS_CHECK(call) + * Gets the error generated by a cuBLAS function call if there is one. + * \param call cuBLAS function to check if there are errors when running. + */ +#define CUBLAS_CHECK(call) + +/** + * \def CUDA_MEM(call, name) + * Measures the GPU memory used by a function call and prints it. + * \param call function to measure memory usage. + * \param name an identifier to use as a variable and when printing. Must be a valid identifier. + */ +#define CUDA_MEM(call, name) +#endif + +/////////////////// +// KERNEL MACROS // +/////////////////// + +#ifdef CUDACC + +#include +#include + +#define HD __host__ __device__ +#define SHARED __shared__ + +#define DECLARE_KERNEL(call, ...) __global__ void call(__VA_ARGS__) + +#define DEFINE_KERNEL(call, ...)
\ + template CudaTools::StreamID CudaTools::runKernel( \ + void (*)(__VA_ARGS__), const CudaTools::Kernel::Settings&, __VA_ARGS__); \ + __global__ void call(__VA_ARGS__) + +#else +#define HD +#define SHARED + +#define DECLARE_KERNEL(call, ...) void call(__VA_ARGS__) + +#define DEFINE_KERNEL(call, ...) \ + template CudaTools::StreamID CudaTools::runKernel( \ + void (*)(__VA_ARGS__), const CudaTools::Kernel::Settings&, __VA_ARGS__); \ + void call(__VA_ARGS__) + +#endif // CUDACC + +#define KERNEL(call, settings, ...) CudaTools::runKernel(call, settings, __VA_ARGS__) + +/////////////////// +// DEVICE MACROS // +/////////////////// + +// N is parenthesized so that an expression argument keeps its meaning inside the comparison. +#ifdef DEVICE + +#define BASIC_LOOP(N) \ + uint32_t iThread = blockIdx.x * blockDim.x + threadIdx.x; \ + if (iThread < (N)) +#else +#define BASIC_LOOP(N) _Pragma("omp parallel for") for (uint32_t iThread = 0; iThread < (N); ++iThread) + +#endif + +////////////////// +// CLASS MACROS // +////////////////// + +// Generates updateHost()/updateDevice(), which copy sizeof(name) bytes between this instance and +// that() on the requested stream. +#define UPDATE_FUNC(name) \ + inline CudaTools::StreamID updateHost(const CudaTools::StreamID& stream = \ + CudaTools::DEF_MEM_STREAM) { \ + return CudaTools::pull(this, that(), sizeof(name), stream); \ + }; \ + inline CudaTools::StreamID updateDevice(const CudaTools::StreamID& stream = \ + CudaTools::DEF_MEM_STREAM) { \ + return CudaTools::push(this, that(), sizeof(name), stream); \ + } + +#ifdef CUDA + +#define DEVICE_CLASS(name) \ + private: \ + name* __deviceInstance__ = nullptr; \ + \ + public: \ + inline name* that() { return __deviceInstance__; } \ + inline void allocateDevice() { __deviceInstance__ = (name*)CudaTools::malloc(sizeof(name)); }; \ + UPDATE_FUNC(name) + +#else +#define DEVICE_CLASS(name) \ + public: \ + inline name* that() { return this; }; \ + inline void allocateDevice(){}; \ + UPDATE_FUNC(name) + +#endif + +#ifndef CUDATOOLS_ARRAY_MAX_AXES +/** + * \def CUDATOOLS_ARRAY_MAX_AXES + * The maximum number of axes/dimensions a CudaTools::Array can have. The default is + * set to 4, but can be manually set to fit the program's needs.
+ */ +#define CUDATOOLS_ARRAY_MAX_AXES 4 +#endif + +//////////////////// +// Error Checking // +//////////////////// + +#ifndef NO_DIMENSION_CHECK +#ifdef DEVICE +#define CT_ERROR_IF(a, op, b, msg) \ + if (a op b) { \ + printf("[ERROR] %s:%d\n | %s: (" #a ") " #op " (" #b ").\n", __FILE__, __LINE__, msg); \ + } + +#define CT_ERROR(a, msg) \ + if (a) { \ + printf("[ERROR] %s:%d\n | %s: " #a ".\n", __FILE__, __LINE__, msg); \ + } +#else + +#define CT_ERROR_IF(a, op, b, msg) \ + if (a op b) { \ + std::ostringstream os_a; \ + std::ostringstream os_b; \ + os_a << a; \ + os_b << b; \ + printf("[ERROR] %s:%d\n | %s: (" #a ")%s " #op " (" #b ")%s.\n", __FILE__, __LINE__, msg, \ + os_a.str().c_str(), os_b.str().c_str()); \ + throw std::exception(); \ + } + +#define CT_ERROR(a, msg) \ + if (a) { \ + printf("[ERROR] %s:%d\n | %s: " #a ".\n", __FILE__, __LINE__, msg); \ + throw std::exception(); \ + } +#endif + +#endif // NO_DIMENSION_CHECK + +#if defined(CUDACC) && !defined(NO_CUDA_CHECK) + +#define CUDA_CHECK(call) \ + do { \ + cudaError_t err = (call); \ + if (err != cudaSuccess) { \ + printf("[CUDA] %s:%d\n | %s\n", __FILE__, __LINE__, cudaGetErrorString(err)); \ + throw std::exception(); \ + } \ + } while (0) + +#define CUBLAS_CHECK(call) \ + do { \ + cublasStatus_t err = (call); \ + if (err != CUBLAS_STATUS_SUCCESS) { \ + printf("[cuBLAS] %s:%d\n | %s\n", __FILE__, __LINE__, \ + CudaTools::cublasGetErrorString(err)); \ + throw std::exception(); \ + } \ + } while (0) + +#define CUDA_MEM(call, name) \ + size_t free_bef_##name, free_aft_##name; \ + cudaMemGetInfo(&free_bef_##name, NULL); \ + call; \ + CudaTools::Manager::get()->sync(); \ + cudaMemGetInfo(&free_aft_##name, NULL); \ + printf("[%s] GPU Memory Usage: %iMiB\n", #name, \ + (free_bef_##name - free_aft_##name) / (1024 * 1024)); + +#else +#define CUDA_CHECK(call) (call) +#define CUBLAS_CHECK(call) (call) +#define CUDA_MEM(call, name) (call) +#endif + +#endif // MACROS_H diff --git a/Makefile b/Makefile new file mode 
100644 index 0000000..d1cabb7 --- /dev/null +++ b/Makefile @@ -0,0 +1,95 @@
# Toolchain and flags.
CC := g++-10
NVCC := nvcc
CFLAGS := -Wall -std=c++17 -fopenmp -MMD
# NOTE(review): the trailing -Xcompiler takes the next argument on the nvcc
# command line as its value (currently -DCUDA in the %.cu.o rule) -- confirm
# that this is intended.
NVCC_FLAGS := -MMD -w -Xcompiler

# Include directories, library directories and libraries (without -I/-L/-l).
INCLUDE :=
LIBS_DIR :=
LIBS_DIR_GPU := /usr/local/cuda/lib64
LIBS :=
LIBS_GPU := cuda cudart cublas

TARGET = tests
SRC_DIR = .
BUILD_DIR = build

# Should not need to modify below.

CPU_BUILD_DIR = $(BUILD_DIR)/cpu
GPU_BUILD_DIR = $(BUILD_DIR)/gpu

SRC = $(wildcard $(SRC_DIR)/*/*.cpp) $(wildcard $(SRC_DIR)/*.cpp)

# Get source files and object files.
# Files ending in .cu.cpp are the ones handed to NVCC in the GPU build.
GCC_SRC = $(filter-out %.cu.cpp ,$(SRC))
NVCC_SRC = $(filter %.cu.cpp, $(SRC))
GCC_OBJ = $(GCC_SRC:$(SRC_DIR)/%.cpp=%.o)
NVCC_OBJ = $(NVCC_SRC:$(SRC_DIR)/%.cpp=%.o)

# If compiling for CPU, all go to GCC. Otherwise, they are split.
CPU_OBJ = $(addprefix $(CPU_BUILD_DIR)/,$(GCC_OBJ)) $(addprefix $(CPU_BUILD_DIR)/,$(NVCC_OBJ))
GPU_GCC_OBJ = $(addprefix $(GPU_BUILD_DIR)/,$(GCC_OBJ))
GPU_NVCC_OBJ = $(addprefix $(GPU_BUILD_DIR)/,$(NVCC_OBJ))

# $(info $$GCC_SRC is [${GCC_SRC}])
# $(info $$NVCC_SRC is [${NVCC_SRC}])
# $(info $$GCC_OBJ is [${GCC_OBJ}])
# $(info $$NVCC_OBJ is [${NVCC_OBJ}])

# $(info $$CPU_OBJ is [${CPU_OBJ}])
# $(info $$GPU_GCC_OBJ is [${GPU_GCC_OBJ}])
# $(info $$GPU_NVCC_OBJ is [${GPU_NVCC_OBJ}])

HEADER = $(wildcard $(SRC_DIR)/*/*.h) $(wildcard $(SRC_DIR)/*.h)
CPU_DEPS = $(wildcard $(CPU_BUILD_DIR)/*.d)
GPU_DEPS = $(wildcard $(GPU_BUILD_DIR)/*.d)

# Turn the plain lists above into compiler/linker options.
INC := $(INCLUDE:%=-I%)
LIB := $(LIBS_DIR:%=-L%)
LIB_GPU := $(LIBS_DIR_GPU:%=-L%)
LD := $(LIBS:%=-l%)
LD_GPU := $(LIBS_GPU:%=-l%)

# Reminder:
# $< = first prerequisite
# $@ = the target which matched the rule
# $^ = all prerequisites

.PHONY: all cpu gpu clean

all : cpu gpu

cpu: $(TARGET)CPU
gpu: $(TARGET)GPU

# Link with the libraries listed in LIBS ($(LD)), as the GPU link does.
# $(LDFLAGS) is kept so that flags passed from the environment or the command
# line still apply.
$(TARGET)CPU: $(CPU_OBJ)
	$(CC) $(CFLAGS) $^ -o $@ $(INC) $(LIB) $(LD) $(LDFLAGS)

# A single %.o pattern also covers the .cu.o objects (stem "name.cu"). Listing
# %.cu.o as a second target would make this a grouped-target rule, and make
# would then assume name.cu.o was produced by the recipe that built name.o.
$(CPU_BUILD_DIR)/%.o: $(SRC_DIR)/%.cpp | $(CPU_BUILD_DIR)
	$(CC) $(CFLAGS) -c -o $@ $< $(INC)

# For GPU, we need to
build the NVCC objects, the NVCC linked object, and the +# regular ones. Then, we link them all together. +$(TARGET)GPU: $(GPU_BUILD_DIR)/link.o $(GPU_GCC_OBJ) | $(GPU_BUILD_DIR) + $(CC) -g -DCUDA $(CFLAGS) $(GPU_NVCC_OBJ) $^ -o $@ $(INC) $(LIB) $(LIB_GPU) $(LD) $(LD_GPU) + +$(GPU_BUILD_DIR)/link.o: $(GPU_NVCC_OBJ) | $(GPU_BUILD_DIR) + $(NVCC) --device-link $^ -o $@ + +$(GPU_BUILD_DIR)/%.cu.o: $(SRC_DIR)/%.cu.cpp | $(GPU_BUILD_DIR) + $(NVCC) $(NVCC_FLAGS) -DCUDA -x cu --device-c -o $@ $< $(INC) + +$(GPU_BUILD_DIR)/%.o: $(SRC_DIR)/%.cpp | $(GPU_BUILD_DIR) + $(CC) $(CFLAGS) -g -DCUDA -c -o $@ $< $(INC) + +-include $(CPU_DEPS) +-include $(GPU_DEPS) + +$(CPU_BUILD_DIR): + mkdir -p $@ + +$(GPU_BUILD_DIR): + mkdir -p $@ + +clean: + rm -Rf $(BUILD_DIR) $(TARGET)CPU $(TARGET)GPU diff --git a/README.rst b/README.rst new file mode 100644 index 0000000..24b9cfb --- /dev/null +++ b/README.rst @@ -0,0 +1,40 @@ +========= +CudaTools +========= +This is the documentation for CudaTools, a header-only library and framework +for the development of CPU-CUDA compatible applications. Using CudaTools enables +the creation of a single unified code that has both CPU and CUDA compilation targets with minimal need to +introduce ``#ifdef`` statements when code is essentially identical between the targets. + +For information on the library itself and its usage, view `documentation `__. The small code snippets and samples +seen in the documentation are in the folder ``samples``. + +Dependencies +============ +- Eigen + +In the future, we will make this dependency optional, but still provide support +for it. As of now, it is necessary. + +Building the Documentation +========================== +The documentation is built with `Doxygen <https://www.doxygen.nl/>`__ and `Sphinx <https://www.sphinx-doc.org/>`__. +So, first make sure you have Doxygen installed on your system, and make sure it is added +to your system path. Then, you will have to create a Python virtual environment +in the repository folder + +..
code-block:: bash + + $ python3 -m venv .venv + +After installing the required Python packages + +.. code-block:: bash + + $ pip install -r requirements + +you can now run the script + +.. code-block:: bash + + $ ./build_docs diff --git a/build_docs b/build_docs new file mode 100644 index 0000000..2c8fd33 --- /dev/null +++ b/build_docs @@ -0,0 +1,2 @@ +doxygen docs/Doxyfile +sphinx-build -b html docs/source docs/build/html diff --git a/docs/Doxyfile b/docs/Doxyfile new file mode 100644 index 0000000..1e208e6 --- /dev/null +++ b/docs/Doxyfile @@ -0,0 +1,2579 @@ +# Doxyfile 1.8.17 + +# This file describes the settings to be used by the documentation system +# doxygen (www.doxygen.org) for a project. +# +# All text after a double hash (##) is considered a comment and is placed in +# front of the TAG it is preceding. +# +# All text after a single hash (#) is considered a comment and will be ignored. +# The format is: +# TAG = value [value, ...] +# For lists, items can also be appended using: +# TAG += value [value, ...] +# Values that contain spaces should be placed between quotes (\" \"). + +#--------------------------------------------------------------------------- +# Project related configuration options +#--------------------------------------------------------------------------- + +# This tag specifies the encoding used for all characters in the configuration +# file that follow. The default is UTF-8 which is also the encoding used for all +# text before the first occurrence of this tag. Doxygen uses libiconv (or the +# iconv built into libc) for the transcoding. See +# https://www.gnu.org/software/libiconv/ for the list of possible encodings. +# The default value is: UTF-8. + +DOXYFILE_ENCODING = UTF-8 + +# The PROJECT_NAME tag is a single word (or a sequence of words surrounded by +# double-quotes, unless you are using Doxywizard) that should identify the +# project for which the documentation is generated. 
This name is used in the +# title of most generated pages and in a few other places. +# The default value is: My Project. + +PROJECT_NAME = "CudaTools" + +# The PROJECT_NUMBER tag can be used to enter a project or revision number. This +# could be handy for archiving the generated documentation or if some version +# control system is used. + +PROJECT_NUMBER = "0.0.1" + +# Using the PROJECT_BRIEF tag one can provide an optional one line description +# for a project that appears at the top of each page and should give viewer a +# quick idea about the purpose of the project. Keep the description short. + +PROJECT_BRIEF = + +# With the PROJECT_LOGO tag one can specify a logo or an icon that is included +# in the documentation. The maximum height of the logo should not exceed 55 +# pixels and the maximum width should not exceed 200 pixels. Doxygen will copy +# the logo to the output directory. + +PROJECT_LOGO = + +# The OUTPUT_DIRECTORY tag is used to specify the (relative or absolute) path +# into which the generated documentation will be written. If a relative path is +# entered, it will be relative to the location where doxygen was started. If +# left blank the current directory will be used. + +OUTPUT_DIRECTORY = "docs/build" + +# If the CREATE_SUBDIRS tag is set to YES then doxygen will create 4096 sub- +# directories (in 2 levels) under the output directory of each output format and +# will distribute the generated files over these directories. Enabling this +# option can be useful when feeding doxygen a huge amount of source files, where +# putting all generated files in the same directory would otherwise causes +# performance problems for the file system. +# The default value is: NO. + +CREATE_SUBDIRS = NO + +# If the ALLOW_UNICODE_NAMES tag is set to YES, doxygen will allow non-ASCII +# characters to appear in the names of generated files. If set to NO, non-ASCII +# characters will be escaped, for example _xE3_x81_x84 will be used for Unicode +# U+3044. 
+# The default value is: NO. + +ALLOW_UNICODE_NAMES = NO + +# The OUTPUT_LANGUAGE tag is used to specify the language in which all +# documentation generated by doxygen is written. Doxygen will use this +# information to generate all constant output in the proper language. +# Possible values are: Afrikaans, Arabic, Armenian, Brazilian, Catalan, Chinese, +# Chinese-Traditional, Croatian, Czech, Danish, Dutch, English (United States), +# Esperanto, Farsi (Persian), Finnish, French, German, Greek, Hungarian, +# Indonesian, Italian, Japanese, Japanese-en (Japanese with English messages), +# Korean, Korean-en (Korean with English messages), Latvian, Lithuanian, +# Macedonian, Norwegian, Persian (Farsi), Polish, Portuguese, Romanian, Russian, +# Serbian, Serbian-Cyrillic, Slovak, Slovene, Spanish, Swedish, Turkish, +# Ukrainian and Vietnamese. +# The default value is: English. + +OUTPUT_LANGUAGE = English + +# The OUTPUT_TEXT_DIRECTION tag is used to specify the direction in which all +# documentation generated by doxygen is written. Doxygen will use this +# information to generate all generated output in the proper direction. +# Possible values are: None, LTR, RTL and Context. +# The default value is: None. + +OUTPUT_TEXT_DIRECTION = None + +# If the BRIEF_MEMBER_DESC tag is set to YES, doxygen will include brief member +# descriptions after the members that are listed in the file and class +# documentation (similar to Javadoc). Set to NO to disable this. +# The default value is: YES. + +BRIEF_MEMBER_DESC = YES + +# If the REPEAT_BRIEF tag is set to YES, doxygen will prepend the brief +# description of a member or function before the detailed description +# +# Note: If both HIDE_UNDOC_MEMBERS and BRIEF_MEMBER_DESC are set to NO, the +# brief descriptions will be completely suppressed. +# The default value is: YES. + +REPEAT_BRIEF = YES + +# This tag implements a quasi-intelligent brief description abbreviator that is +# used to form the text in various listings. 
Each string in this list, if found +# as the leading text of the brief description, will be stripped from the text +# and the result, after processing the whole list, is used as the annotated +# text. Otherwise, the brief description is used as-is. If left blank, the +# following values are used ($name is automatically replaced with the name of +# the entity):The $name class, The $name widget, The $name file, is, provides, +# specifies, contains, represents, a, an and the. + +ABBREVIATE_BRIEF = "The $name class" \ + "The $name widget" \ + "The $name file" \ + is \ + provides \ + specifies \ + contains \ + represents \ + a \ + an \ + the + +# If the ALWAYS_DETAILED_SEC and REPEAT_BRIEF tags are both set to YES then +# doxygen will generate a detailed section even if there is only a brief +# description. +# The default value is: NO. + +ALWAYS_DETAILED_SEC = NO + +# If the INLINE_INHERITED_MEMB tag is set to YES, doxygen will show all +# inherited members of a class in the documentation of that class as if those +# members were ordinary class members. Constructors, destructors and assignment +# operators of the base classes will not be shown. +# The default value is: NO. + +INLINE_INHERITED_MEMB = NO + +# If the FULL_PATH_NAMES tag is set to YES, doxygen will prepend the full path +# before files name in the file list and in the header files. If set to NO the +# shortest path that makes the file name unique will be used +# The default value is: YES. + +FULL_PATH_NAMES = YES + +# The STRIP_FROM_PATH tag can be used to strip a user-defined part of the path. +# Stripping is only done if one of the specified strings matches the left-hand +# part of the path. The tag can be used to show relative paths in the file list. +# If left blank the directory from which doxygen is run is used as the path to +# strip. +# +# Note that you can specify absolute paths here, but also relative paths, which +# will be relative from the directory where doxygen is started. 
+# This tag requires that the tag FULL_PATH_NAMES is set to YES. + +STRIP_FROM_PATH = + +# The STRIP_FROM_INC_PATH tag can be used to strip a user-defined part of the +# path mentioned in the documentation of a class, which tells the reader which +# header file to include in order to use a class. If left blank only the name of +# the header file containing the class definition is used. Otherwise one should +# specify the list of include paths that are normally passed to the compiler +# using the -I flag. + +STRIP_FROM_INC_PATH = + +# If the SHORT_NAMES tag is set to YES, doxygen will generate much shorter (but +# less readable) file names. This can be useful is your file systems doesn't +# support long names like on DOS, Mac, or CD-ROM. +# The default value is: NO. + +SHORT_NAMES = NO + +# If the JAVADOC_AUTOBRIEF tag is set to YES then doxygen will interpret the +# first line (until the first dot) of a Javadoc-style comment as the brief +# description. If set to NO, the Javadoc-style will behave just like regular Qt- +# style comments (thus requiring an explicit @brief command for a brief +# description.) +# The default value is: NO. + +JAVADOC_AUTOBRIEF = NO + +# If the JAVADOC_BANNER tag is set to YES then doxygen will interpret a line +# such as +# /*************** +# as being the beginning of a Javadoc-style comment "banner". If set to NO, the +# Javadoc-style will behave just like regular comments and it will not be +# interpreted by doxygen. +# The default value is: NO. + +JAVADOC_BANNER = NO + +# If the QT_AUTOBRIEF tag is set to YES then doxygen will interpret the first +# line (until the first dot) of a Qt-style comment as the brief description. If +# set to NO, the Qt-style will behave just like regular Qt-style comments (thus +# requiring an explicit \brief command for a brief description.) +# The default value is: NO. 
+ +QT_AUTOBRIEF = NO + +# The MULTILINE_CPP_IS_BRIEF tag can be set to YES to make doxygen treat a +# multi-line C++ special comment block (i.e. a block of //! or /// comments) as +# a brief description. This used to be the default behavior. The new default is +# to treat a multi-line C++ comment block as a detailed description. Set this +# tag to YES if you prefer the old behavior instead. +# +# Note that setting this tag to YES also means that rational rose comments are +# not recognized any more. +# The default value is: NO. + +MULTILINE_CPP_IS_BRIEF = NO + +# If the INHERIT_DOCS tag is set to YES then an undocumented member inherits the +# documentation from any documented member that it re-implements. +# The default value is: YES. + +INHERIT_DOCS = YES + +# If the SEPARATE_MEMBER_PAGES tag is set to YES then doxygen will produce a new +# page for each member. If set to NO, the documentation of a member will be part +# of the file/class/namespace that contains it. +# The default value is: NO. + +SEPARATE_MEMBER_PAGES = NO + +# The TAB_SIZE tag can be used to set the number of spaces in a tab. Doxygen +# uses this value to replace tabs by spaces in code fragments. +# Minimum value: 1, maximum value: 16, default value: 4. + +TAB_SIZE = 4 + +# This tag can be used to specify a number of aliases that act as commands in +# the documentation. An alias has the form: +# name=value +# For example adding +# "sideeffect=@par Side Effects:\n" +# will allow you to put the command \sideeffect (or @sideeffect) in the +# documentation, which will result in a user-defined paragraph with heading +# "Side Effects:". You can put \n's in the value part of an alias to insert +# newlines (in the resulting output). You can put ^^ in the value part of an +# alias to insert a newline as if a physical newline was in the original file. 
+# When you need a literal { or } or , in the value part of an alias you have to +# escape them by means of a backslash (\), this can lead to conflicts with the +# commands \{ and \} for these it is advised to use the version @{ and @} or use +# a double escape (\\{ and \\}) + +ALIASES = + +# This tag can be used to specify a number of word-keyword mappings (TCL only). +# A mapping has the form "name=value". For example adding "class=itcl::class" +# will allow you to use the command class in the itcl::class meaning. + +TCL_SUBST = + +# Set the OPTIMIZE_OUTPUT_FOR_C tag to YES if your project consists of C sources +# only. Doxygen will then generate output that is more tailored for C. For +# instance, some of the names that are used will be different. The list of all +# members will be omitted, etc. +# The default value is: NO. + +OPTIMIZE_OUTPUT_FOR_C = NO + +# Set the OPTIMIZE_OUTPUT_JAVA tag to YES if your project consists of Java or +# Python sources only. Doxygen will then generate output that is more tailored +# for that language. For instance, namespaces will be presented as packages, +# qualified scopes will look different, etc. +# The default value is: NO. + +OPTIMIZE_OUTPUT_JAVA = NO + +# Set the OPTIMIZE_FOR_FORTRAN tag to YES if your project consists of Fortran +# sources. Doxygen will then generate output that is tailored for Fortran. +# The default value is: NO. + +OPTIMIZE_FOR_FORTRAN = NO + +# Set the OPTIMIZE_OUTPUT_VHDL tag to YES if your project consists of VHDL +# sources. Doxygen will then generate output that is tailored for VHDL. +# The default value is: NO. + +OPTIMIZE_OUTPUT_VHDL = NO + +# Set the OPTIMIZE_OUTPUT_SLICE tag to YES if your project consists of Slice +# sources only. Doxygen will then generate output that is more tailored for that +# language. For instance, namespaces will be presented as modules, types will be +# separated into more groups, etc. +# The default value is: NO. 
+ +OPTIMIZE_OUTPUT_SLICE = NO + +# Doxygen selects the parser to use depending on the extension of the files it +# parses. With this tag you can assign which parser to use for a given +# extension. Doxygen has a built-in mapping, but you can override or extend it +# using this tag. The format is ext=language, where ext is a file extension, and +# language is one of the parsers supported by doxygen: IDL, Java, JavaScript, +# Csharp (C#), C, C++, D, PHP, md (Markdown), Objective-C, Python, Slice, +# Fortran (fixed format Fortran: FortranFixed, free formatted Fortran: +# FortranFree, unknown formatted Fortran: Fortran. In the later case the parser +# tries to guess whether the code is fixed or free formatted code, this is the +# default for Fortran type files), VHDL, tcl. For instance to make doxygen treat +# .inc files as Fortran files (default is PHP), and .f files as C (default is +# Fortran), use: inc=Fortran f=C. +# +# Note: For files without extension you can use no_extension as a placeholder. +# +# Note that for custom extensions you also need to set FILE_PATTERNS otherwise +# the files are not read by doxygen. + +EXTENSION_MAPPING = + +# If the MARKDOWN_SUPPORT tag is enabled then doxygen pre-processes all comments +# according to the Markdown format, which allows for more readable +# documentation. See https://daringfireball.net/projects/markdown/ for details. +# The output of markdown processing is further processed by doxygen, so you can +# mix doxygen, HTML, and XML commands with Markdown formatting. Disable only in +# case of backward compatibilities issues. +# The default value is: YES. + +MARKDOWN_SUPPORT = YES + +# When the TOC_INCLUDE_HEADINGS tag is set to a non-zero value, all headings up +# to that level are automatically included in the table of contents, even if +# they do not have an id attribute. +# Note: This feature currently applies only to Markdown headings. +# Minimum value: 0, maximum value: 99, default value: 5. 
+# This tag requires that the tag MARKDOWN_SUPPORT is set to YES. + +TOC_INCLUDE_HEADINGS = 5 + +# When enabled doxygen tries to link words that correspond to documented +# classes, or namespaces to their corresponding documentation. Such a link can +# be prevented in individual cases by putting a % sign in front of the word or +# globally by setting AUTOLINK_SUPPORT to NO. +# The default value is: YES. + +AUTOLINK_SUPPORT = YES + +# If you use STL classes (i.e. std::string, std::vector, etc.) but do not want +# to include (a tag file for) the STL sources as input, then you should set this +# tag to YES in order to let doxygen match functions declarations and +# definitions whose arguments contain STL classes (e.g. func(std::string); +# versus func(std::string) {}). This also make the inheritance and collaboration +# diagrams that involve STL classes more complete and accurate. +# The default value is: NO. + +BUILTIN_STL_SUPPORT = NO + +# If you use Microsoft's C++/CLI language, you should set this option to YES to +# enable parsing support. +# The default value is: NO. + +CPP_CLI_SUPPORT = NO + +# Set the SIP_SUPPORT tag to YES if your project consists of sip (see: +# https://www.riverbankcomputing.com/software/sip/intro) sources only. Doxygen +# will parse them like normal C++ but will assume all classes use public instead +# of private inheritance when no explicit protection keyword is present. +# The default value is: NO. + +SIP_SUPPORT = NO + +# For Microsoft's IDL there are propget and propput attributes to indicate +# getter and setter methods for a property. Setting this option to YES will make +# doxygen to replace the get and set methods by a property in the documentation. +# This will only work if the methods are indeed getting or setting a simple +# type. If this is not the case, or you want to show the methods anyway, you +# should set this option to NO. +# The default value is: YES. 
+ +IDL_PROPERTY_SUPPORT = YES + +# If member grouping is used in the documentation and the DISTRIBUTE_GROUP_DOC +# tag is set to YES then doxygen will reuse the documentation of the first +# member in the group (if any) for the other members of the group. By default +# all members of a group must be documented explicitly. +# The default value is: NO. + +DISTRIBUTE_GROUP_DOC = NO + +# If one adds a struct or class to a group and this option is enabled, then also +# any nested class or struct is added to the same group. By default this option +# is disabled and one has to add nested compounds explicitly via \ingroup. +# The default value is: NO. + +GROUP_NESTED_COMPOUNDS = NO + +# Set the SUBGROUPING tag to YES to allow class member groups of the same type +# (for instance a group of public functions) to be put as a subgroup of that +# type (e.g. under the Public Functions section). Set it to NO to prevent +# subgrouping. Alternatively, this can be done per class using the +# \nosubgrouping command. +# The default value is: YES. + +SUBGROUPING = YES + +# When the INLINE_GROUPED_CLASSES tag is set to YES, classes, structs and unions +# are shown inside the group in which they are included (e.g. using \ingroup) +# instead of on a separate page (for HTML and Man pages) or section (for LaTeX +# and RTF). +# +# Note that this feature does not work in combination with +# SEPARATE_MEMBER_PAGES. +# The default value is: NO. + +INLINE_GROUPED_CLASSES = NO + +# When the INLINE_SIMPLE_STRUCTS tag is set to YES, structs, classes, and unions +# with only public data fields or simple typedef fields will be shown inline in +# the documentation of the scope in which they are defined (i.e. file, +# namespace, or group documentation), provided this scope is documented. If set +# to NO, structs, classes, and unions are shown on a separate page (for HTML and +# Man pages) or section (for LaTeX and RTF). +# The default value is: NO. 
+ +INLINE_SIMPLE_STRUCTS = NO + +# When TYPEDEF_HIDES_STRUCT tag is enabled, a typedef of a struct, union, or +# enum is documented as struct, union, or enum with the name of the typedef. So +# typedef struct TypeS {} TypeT, will appear in the documentation as a struct +# with name TypeT. When disabled the typedef will appear as a member of a file, +# namespace, or class. And the struct will be named TypeS. This can typically be +# useful for C code in case the coding convention dictates that all compound +# types are typedef'ed and only the typedef is referenced, never the tag name. +# The default value is: NO. + +TYPEDEF_HIDES_STRUCT = NO + +# The size of the symbol lookup cache can be set using LOOKUP_CACHE_SIZE. This +# cache is used to resolve symbols given their name and scope. Since this can be +# an expensive process and often the same symbol appears multiple times in the +# code, doxygen keeps a cache of pre-resolved symbols. If the cache is too small +# doxygen will become slower. If the cache is too large, memory is wasted. The +# cache size is given by this formula: 2^(16+LOOKUP_CACHE_SIZE). The valid range +# is 0..9, the default is 0, corresponding to a cache size of 2^16=65536 +# symbols. At the end of a run doxygen will report the cache usage and suggest +# the optimal cache size from a speed point of view. +# Minimum value: 0, maximum value: 9, default value: 0. + +LOOKUP_CACHE_SIZE = 0 + +#--------------------------------------------------------------------------- +# Build related configuration options +#--------------------------------------------------------------------------- + +# If the EXTRACT_ALL tag is set to YES, doxygen will assume all entities in +# documentation are documented, even if no documentation was available. Private +# class members and static file members will be hidden unless the +# EXTRACT_PRIVATE respectively EXTRACT_STATIC tags are set to YES. 
+# Note: This will also disable the warnings about undocumented members that are +# normally produced when WARNINGS is set to YES. +# The default value is: NO. + +EXTRACT_ALL = NO + +# If the EXTRACT_PRIVATE tag is set to YES, all private members of a class will +# be included in the documentation. +# The default value is: NO. + +EXTRACT_PRIVATE = NO + +# If the EXTRACT_PRIV_VIRTUAL tag is set to YES, documented private virtual +# methods of a class will be included in the documentation. +# The default value is: NO. + +EXTRACT_PRIV_VIRTUAL = NO + +# If the EXTRACT_PACKAGE tag is set to YES, all members with package or internal +# scope will be included in the documentation. +# The default value is: NO. + +EXTRACT_PACKAGE = NO + +# If the EXTRACT_STATIC tag is set to YES, all static members of a file will be +# included in the documentation. +# The default value is: NO. + +EXTRACT_STATIC = NO + +# If the EXTRACT_LOCAL_CLASSES tag is set to YES, classes (and structs) defined +# locally in source files will be included in the documentation. If set to NO, +# only classes defined in header files are included. Does not have any effect +# for Java sources. +# The default value is: YES. + +EXTRACT_LOCAL_CLASSES = YES + +# This flag is only useful for Objective-C code. If set to YES, local methods, +# which are defined in the implementation section but not in the interface are +# included in the documentation. If set to NO, only methods in the interface are +# included. +# The default value is: NO. + +EXTRACT_LOCAL_METHODS = NO + +# If this flag is set to YES, the members of anonymous namespaces will be +# extracted and appear in the documentation as a namespace called +# 'anonymous_namespace{file}', where file will be replaced with the base name of +# the file that contains the anonymous namespace. By default anonymous namespace +# are hidden. +# The default value is: NO. 
+ +EXTRACT_ANON_NSPACES = NO + +# If the HIDE_UNDOC_MEMBERS tag is set to YES, doxygen will hide all +# undocumented members inside documented classes or files. If set to NO these +# members will be included in the various overviews, but no documentation +# section is generated. This option has no effect if EXTRACT_ALL is enabled. +# The default value is: NO. + +HIDE_UNDOC_MEMBERS = NO + +# If the HIDE_UNDOC_CLASSES tag is set to YES, doxygen will hide all +# undocumented classes that are normally visible in the class hierarchy. If set +# to NO, these classes will be included in the various overviews. This option +# has no effect if EXTRACT_ALL is enabled. +# The default value is: NO. + +HIDE_UNDOC_CLASSES = NO + +# If the HIDE_FRIEND_COMPOUNDS tag is set to YES, doxygen will hide all friend +# declarations. If set to NO, these declarations will be included in the +# documentation. +# The default value is: NO. + +HIDE_FRIEND_COMPOUNDS = NO + +# If the HIDE_IN_BODY_DOCS tag is set to YES, doxygen will hide any +# documentation blocks found inside the body of a function. If set to NO, these +# blocks will be appended to the function's detailed documentation block. +# The default value is: NO. + +HIDE_IN_BODY_DOCS = NO + +# The INTERNAL_DOCS tag determines if documentation that is typed after a +# \internal command is included. If the tag is set to NO then the documentation +# will be excluded. Set it to YES to include the internal documentation. +# The default value is: NO. + +INTERNAL_DOCS = NO + +# If the CASE_SENSE_NAMES tag is set to NO then doxygen will only generate file +# names in lower-case letters. If set to YES, upper-case letters are also +# allowed. This is useful if you have classes or files whose names only differ +# in case and if your file system supports case sensitive file names. Windows +# (including Cygwin) ands Mac users are advised to set this option to NO. +# The default value is: system dependent. 
+ +CASE_SENSE_NAMES = YES + +# If the HIDE_SCOPE_NAMES tag is set to NO then doxygen will show members with +# their full class and namespace scopes in the documentation. If set to YES, the +# scope will be hidden. +# The default value is: NO. + +HIDE_SCOPE_NAMES = NO + +# If the HIDE_COMPOUND_REFERENCE tag is set to NO (default) then doxygen will +# append additional text to a page's title, such as Class Reference. If set to +# YES the compound reference will be hidden. +# The default value is: NO. + +HIDE_COMPOUND_REFERENCE= NO + +# If the SHOW_INCLUDE_FILES tag is set to YES then doxygen will put a list of +# the files that are included by a file in the documentation of that file. +# The default value is: YES. + +SHOW_INCLUDE_FILES = YES + +# If the SHOW_GROUPED_MEMB_INC tag is set to YES then Doxygen will add for each +# grouped member an include statement to the documentation, telling the reader +# which file to include in order to use the member. +# The default value is: NO. + +SHOW_GROUPED_MEMB_INC = NO + +# If the FORCE_LOCAL_INCLUDES tag is set to YES then doxygen will list include +# files with double quotes in the documentation rather than with sharp brackets. +# The default value is: NO. + +FORCE_LOCAL_INCLUDES = NO + +# If the INLINE_INFO tag is set to YES then a tag [inline] is inserted in the +# documentation for inline members. +# The default value is: YES. + +INLINE_INFO = YES + +# If the SORT_MEMBER_DOCS tag is set to YES then doxygen will sort the +# (detailed) documentation of file and class members alphabetically by member +# name. If set to NO, the members will appear in declaration order. +# The default value is: YES. + +SORT_MEMBER_DOCS = YES + +# If the SORT_BRIEF_DOCS tag is set to YES then doxygen will sort the brief +# descriptions of file, namespace and class members alphabetically by member +# name. If set to NO, the members will appear in declaration order. 
Note that +# this will also influence the order of the classes in the class list. +# The default value is: NO. + +SORT_BRIEF_DOCS = NO + +# If the SORT_MEMBERS_CTORS_1ST tag is set to YES then doxygen will sort the +# (brief and detailed) documentation of class members so that constructors and +# destructors are listed first. If set to NO the constructors will appear in the +# respective orders defined by SORT_BRIEF_DOCS and SORT_MEMBER_DOCS. +# Note: If SORT_BRIEF_DOCS is set to NO this option is ignored for sorting brief +# member documentation. +# Note: If SORT_MEMBER_DOCS is set to NO this option is ignored for sorting +# detailed member documentation. +# The default value is: NO. + +SORT_MEMBERS_CTORS_1ST = NO + +# If the SORT_GROUP_NAMES tag is set to YES then doxygen will sort the hierarchy +# of group names into alphabetical order. If set to NO the group names will +# appear in their defined order. +# The default value is: NO. + +SORT_GROUP_NAMES = NO + +# If the SORT_BY_SCOPE_NAME tag is set to YES, the class list will be sorted by +# fully-qualified names, including namespaces. If set to NO, the class list will +# be sorted only by class name, not including the namespace part. +# Note: This option is not very useful if HIDE_SCOPE_NAMES is set to YES. +# Note: This option applies only to the class list, not to the alphabetical +# list. +# The default value is: NO. + +SORT_BY_SCOPE_NAME = NO + +# If the STRICT_PROTO_MATCHING option is enabled and doxygen fails to do proper +# type resolution of all parameters of a function it will reject a match between +# the prototype and the implementation of a member function even if there is +# only one candidate or it is obvious which candidate to choose by doing a +# simple string match. By disabling STRICT_PROTO_MATCHING doxygen will still +# accept a match between prototype and implementation in such cases. +# The default value is: NO. 
+ +STRICT_PROTO_MATCHING = NO + +# The GENERATE_TODOLIST tag can be used to enable (YES) or disable (NO) the todo +# list. This list is created by putting \todo commands in the documentation. +# The default value is: YES. + +GENERATE_TODOLIST = YES + +# The GENERATE_TESTLIST tag can be used to enable (YES) or disable (NO) the test +# list. This list is created by putting \test commands in the documentation. +# The default value is: YES. + +GENERATE_TESTLIST = YES + +# The GENERATE_BUGLIST tag can be used to enable (YES) or disable (NO) the bug +# list. This list is created by putting \bug commands in the documentation. +# The default value is: YES. + +GENERATE_BUGLIST = YES + +# The GENERATE_DEPRECATEDLIST tag can be used to enable (YES) or disable (NO) +# the deprecated list. This list is created by putting \deprecated commands in +# the documentation. +# The default value is: YES. + +GENERATE_DEPRECATEDLIST= YES + +# The ENABLED_SECTIONS tag can be used to enable conditional documentation +# sections, marked by \if <section_label> ... \endif and \cond <section_label> +# ... \endcond blocks. + +ENABLED_SECTIONS = + +# The MAX_INITIALIZER_LINES tag determines the maximum number of lines that the +# initial value of a variable or macro / define can have for it to appear in the +# documentation. If the initializer consists of more lines than specified here +# it will be hidden. Use a value of 0 to hide initializers completely. The +# appearance of the value of individual variables and macros / defines can be +# controlled using \showinitializer or \hideinitializer command in the +# documentation regardless of this setting. +# Minimum value: 0, maximum value: 10000, default value: 30. + +MAX_INITIALIZER_LINES = 30 + +# Set the SHOW_USED_FILES tag to NO to disable the list of files generated at +# the bottom of the documentation of classes and structs. If set to YES, the +# list will mention the files that were used to generate the documentation. +# The default value is: YES.
+ +SHOW_USED_FILES = YES + +# Set the SHOW_FILES tag to NO to disable the generation of the Files page. This +# will remove the Files entry from the Quick Index and from the Folder Tree View +# (if specified). +# The default value is: YES. + +SHOW_FILES = YES + +# Set the SHOW_NAMESPACES tag to NO to disable the generation of the Namespaces +# page. This will remove the Namespaces entry from the Quick Index and from the +# Folder Tree View (if specified). +# The default value is: YES. + +SHOW_NAMESPACES = YES + +# The FILE_VERSION_FILTER tag can be used to specify a program or script that +# doxygen should invoke to get the current version for each file (typically from +# the version control system). Doxygen will invoke the program by executing (via +# popen()) the command command input-file, where command is the value of the +# FILE_VERSION_FILTER tag, and input-file is the name of an input file provided +# by doxygen. Whatever the program writes to standard output is used as the file +# version. For an example see the documentation. + +FILE_VERSION_FILTER = + +# The LAYOUT_FILE tag can be used to specify a layout file which will be parsed +# by doxygen. The layout file controls the global structure of the generated +# output files in an output format independent way. To create the layout file +# that represents doxygen's defaults, run doxygen with the -l option. You can +# optionally specify a file name after the option, if omitted DoxygenLayout.xml +# will be used as the name of the layout file. +# +# Note that if you run doxygen from a directory containing a file called +# DoxygenLayout.xml, doxygen will parse it automatically even if the LAYOUT_FILE +# tag is left empty. + +LAYOUT_FILE = + +# The CITE_BIB_FILES tag can be used to specify one or more bib files containing +# the reference definitions. This must be a list of .bib files. The .bib +# extension is automatically appended if omitted. This requires the bibtex tool +# to be installed. 
See also https://en.wikipedia.org/wiki/BibTeX for more info. +# For LaTeX the style of the bibliography can be controlled using +# LATEX_BIB_STYLE. To use this feature you need bibtex and perl available in the +# search path. See also \cite for info how to create references. + +CITE_BIB_FILES = + +#--------------------------------------------------------------------------- +# Configuration options related to warning and progress messages +#--------------------------------------------------------------------------- + +# The QUIET tag can be used to turn on/off the messages that are generated to +# standard output by doxygen. If QUIET is set to YES this implies that the +# messages are off. +# The default value is: NO. + +QUIET = NO + +# The WARNINGS tag can be used to turn on/off the warning messages that are +# generated to standard error (stderr) by doxygen. If WARNINGS is set to YES +# this implies that the warnings are on. +# +# Tip: Turn warnings on while writing the documentation. +# The default value is: YES. + +WARNINGS = YES + +# If the WARN_IF_UNDOCUMENTED tag is set to YES then doxygen will generate +# warnings for undocumented members. If EXTRACT_ALL is set to YES then this flag +# will automatically be disabled. +# The default value is: YES. + +WARN_IF_UNDOCUMENTED = YES + +# If the WARN_IF_DOC_ERROR tag is set to YES, doxygen will generate warnings for +# potential errors in the documentation, such as not documenting some parameters +# in a documented function, or documenting parameters that don't exist or using +# markup commands wrongly. +# The default value is: YES. + +WARN_IF_DOC_ERROR = YES + +# This WARN_NO_PARAMDOC option can be enabled to get warnings for functions that +# are documented, but have no documentation for their parameters or return +# value. If set to NO, doxygen will only warn about wrong or incomplete +# parameter documentation, but not about the absence of documentation. 
If +# EXTRACT_ALL is set to YES then this flag will automatically be disabled. +# The default value is: NO. + +WARN_NO_PARAMDOC = NO + +# If the WARN_AS_ERROR tag is set to YES then doxygen will immediately stop when +# a warning is encountered. +# The default value is: NO. + +WARN_AS_ERROR = NO + +# The WARN_FORMAT tag determines the format of the warning messages that doxygen +# can produce. The string should contain the $file, $line, and $text tags, which +# will be replaced by the file and line number from which the warning originated +# and the warning text. Optionally the format may contain $version, which will +# be replaced by the version of the file (if it could be obtained via +# FILE_VERSION_FILTER) +# The default value is: $file:$line: $text. + +WARN_FORMAT = "$file:$line: $text" + +# The WARN_LOGFILE tag can be used to specify a file to which warning and error +# messages should be written. If left blank the output is written to standard +# error (stderr). + +WARN_LOGFILE = + +#--------------------------------------------------------------------------- +# Configuration options related to the input files +#--------------------------------------------------------------------------- + +# The INPUT tag is used to specify the files and/or directories that contain +# documented source files. You may enter file names like myfile.cpp or +# directories like /usr/src/myproject. Separate the files or directories with +# spaces. See also FILE_PATTERNS and EXTENSION_MAPPING +# Note: If this tag is empty the current directory is searched. + +INPUT = "./" + +# This tag can be used to specify the character encoding of the source files +# that doxygen parses. Internally doxygen uses the UTF-8 encoding. Doxygen uses +# libiconv (or the iconv built into libc) for the transcoding. See the libiconv +# documentation (see: https://www.gnu.org/software/libiconv/) for the list of +# possible encodings. +# The default value is: UTF-8. 
+ +INPUT_ENCODING = UTF-8 + +# If the value of the INPUT tag contains directories, you can use the +# FILE_PATTERNS tag to specify one or more wildcard patterns (like *.cpp and +# *.h) to filter out the source-files in the directories. +# +# Note that for custom extensions or not directly supported extensions you also +# need to set EXTENSION_MAPPING for the extension otherwise the files are not +# read by doxygen. +# +# If left blank the following patterns are tested:*.c, *.cc, *.cxx, *.cpp, +# *.c++, *.java, *.ii, *.ixx, *.ipp, *.i++, *.inl, *.idl, *.ddl, *.odl, *.h, +# *.hh, *.hxx, *.hpp, *.h++, *.cs, *.d, *.php, *.php4, *.php5, *.phtml, *.inc, +# *.m, *.markdown, *.md, *.mm, *.dox (to be provided as doxygen C comment), +# *.doc (to be provided as doxygen C comment), *.txt (to be provided as doxygen +# C comment), *.py, *.pyw, *.f90, *.f95, *.f03, *.f08, *.f, *.for, *.tcl, *.vhd, +# *.vhdl, *.ucf, *.qsf and *.ice. + +FILE_PATTERNS = *.c \ + *.cc \ + *.cxx \ + *.cpp \ + *.c++ \ + *.java \ + *.ii \ + *.ixx \ + *.ipp \ + *.i++ \ + *.inl \ + *.idl \ + *.ddl \ + *.odl \ + *.h \ + *.hh \ + *.hxx \ + *.hpp \ + *.h++ \ + *.cs \ + *.d \ + *.php \ + *.php4 \ + *.php5 \ + *.phtml \ + *.inc \ + *.m \ + *.markdown \ + *.md \ + *.mm \ + *.dox \ + *.doc \ + # *.txt / + *.py \ + *.pyw \ + *.f90 \ + *.f95 \ + *.f03 \ + *.f08 \ + *.f \ + *.for \ + *.tcl \ + *.vhd \ + *.vhdl \ + *.ucf \ + *.qsf \ + *.ice + +# The RECURSIVE tag can be used to specify whether or not subdirectories should +# be searched for input files as well. +# The default value is: NO. + +RECURSIVE = NO + +# The EXCLUDE tag can be used to specify files and/or directories that should be +# excluded from the INPUT source files. This way you can easily exclude a +# subdirectory from a directory tree whose root is specified with the INPUT tag. +# +# Note that relative paths are relative to the directory from which doxygen is +# run. 
+ +EXCLUDE = + +# The EXCLUDE_SYMLINKS tag can be used to select whether or not files or +# directories that are symbolic links (a Unix file system feature) are excluded +# from the input. +# The default value is: NO. + +EXCLUDE_SYMLINKS = NO + +# If the value of the INPUT tag contains directories, you can use the +# EXCLUDE_PATTERNS tag to specify one or more wildcard patterns to exclude +# certain files from those directories. +# +# Note that the wildcards are matched against the file with absolute path, so to +# exclude all test directories for example use the pattern */test/* + +EXCLUDE_PATTERNS = + +# The EXCLUDE_SYMBOLS tag can be used to specify one or more symbol names +# (namespaces, classes, functions, etc.) that should be excluded from the +# output. The symbol name can be a fully qualified name, a word, or if the +# wildcard * is used, a substring. Examples: ANamespace, AClass, +# ANamespace::AClass, ANamespace::*Test +# +# Note that the wildcards are matched against the file with absolute path, so to +# exclude all test directories use the pattern */test/* + +EXCLUDE_SYMBOLS = + +# The EXAMPLE_PATH tag can be used to specify one or more files or directories +# that contain example code fragments that are included (see the \include +# command). + +EXAMPLE_PATH = + +# If the value of the EXAMPLE_PATH tag contains directories, you can use the +# EXAMPLE_PATTERNS tag to specify one or more wildcard pattern (like *.cpp and +# *.h) to filter out the source-files in the directories. If left blank all +# files are included. + +EXAMPLE_PATTERNS = * + +# If the EXAMPLE_RECURSIVE tag is set to YES then subdirectories will be +# searched for input files to be used with the \include or \dontinclude commands +# irrespective of the value of the RECURSIVE tag. +# The default value is: NO.
+ +EXAMPLE_RECURSIVE = NO + +# The IMAGE_PATH tag can be used to specify one or more files or directories +# that contain images that are to be included in the documentation (see the +# \image command). + +IMAGE_PATH = + +# The INPUT_FILTER tag can be used to specify a program that doxygen should +# invoke to filter for each input file. Doxygen will invoke the filter program +# by executing (via popen()) the command: +# +# <filter> <input-file> +# +# where <filter> is the value of the INPUT_FILTER tag, and <input-file> is the +# name of an input file. Doxygen will then use the output that the filter +# program writes to standard output. If FILTER_PATTERNS is specified, this tag +# will be ignored. +# +# Note that the filter must not add or remove lines; it is applied before the +# code is scanned, but not when the output code is generated. If lines are added +# or removed, the anchors will not be placed correctly. +# +# Note that for custom extensions or not directly supported extensions you also +# need to set EXTENSION_MAPPING for the extension otherwise the files are not +# properly processed by doxygen. + +INPUT_FILTER = + +# The FILTER_PATTERNS tag can be used to specify filters on a per file pattern +# basis. Doxygen will compare the file name with each pattern and apply the +# filter if there is a match. The filters are a list of the form: pattern=filter +# (like *.cpp=my_cpp_filter). See INPUT_FILTER for further information on how +# filters are used. If the FILTER_PATTERNS tag is empty or if none of the +# patterns match the file name, INPUT_FILTER is applied. +# +# Note that for custom extensions or not directly supported extensions you also +# need to set EXTENSION_MAPPING for the extension otherwise the files are not +# properly processed by doxygen. + +FILTER_PATTERNS = + +# If the FILTER_SOURCE_FILES tag is set to YES, the input filter (if set using +# INPUT_FILTER) will also be used to filter the input files that are used for +# producing the source files to browse (i.e.
when SOURCE_BROWSER is set to YES). +# The default value is: NO. + +FILTER_SOURCE_FILES = NO + +# The FILTER_SOURCE_PATTERNS tag can be used to specify source filters per file +# pattern. A pattern will override the setting for FILTER_PATTERN (if any) and +# it is also possible to disable source filtering for a specific pattern using +# *.ext= (so without naming a filter). +# This tag requires that the tag FILTER_SOURCE_FILES is set to YES. + +FILTER_SOURCE_PATTERNS = + +# If the USE_MDFILE_AS_MAINPAGE tag refers to the name of a markdown file that +# is part of the input, its contents will be placed on the main page +# (index.html). This can be useful if you have a project on for instance GitHub +# and want to reuse the introduction page also for the doxygen output. + +USE_MDFILE_AS_MAINPAGE = + +#--------------------------------------------------------------------------- +# Configuration options related to source browsing +#--------------------------------------------------------------------------- + +# If the SOURCE_BROWSER tag is set to YES then a list of source files will be +# generated. Documented entities will be cross-referenced with these sources. +# +# Note: To get rid of all source code in the generated output, make sure that +# also VERBATIM_HEADERS is set to NO. +# The default value is: NO. + +SOURCE_BROWSER = NO + +# Setting the INLINE_SOURCES tag to YES will include the body of functions, +# classes and enums directly into the documentation. +# The default value is: NO. + +INLINE_SOURCES = NO + +# Setting the STRIP_CODE_COMMENTS tag to YES will instruct doxygen to hide any +# special comment blocks from generated source code fragments. Normal C, C++ and +# Fortran comments will always remain visible. +# The default value is: YES. + +STRIP_CODE_COMMENTS = YES + +# If the REFERENCED_BY_RELATION tag is set to YES then for each documented +# entity all documented functions referencing it will be listed. +# The default value is: NO. 
+ +REFERENCED_BY_RELATION = NO + +# If the REFERENCES_RELATION tag is set to YES then for each documented function +# all documented entities called/used by that function will be listed. +# The default value is: NO. + +REFERENCES_RELATION = NO + +# If the REFERENCES_LINK_SOURCE tag is set to YES and SOURCE_BROWSER tag is set +# to YES then the hyperlinks from functions in REFERENCES_RELATION and +# REFERENCED_BY_RELATION lists will link to the source code. Otherwise they will +# link to the documentation. +# The default value is: YES. + +REFERENCES_LINK_SOURCE = YES + +# If SOURCE_TOOLTIPS is enabled (the default) then hovering a hyperlink in the +# source code will show a tooltip with additional information such as prototype, +# brief description and links to the definition and documentation. Since this +# will make the HTML file larger and loading of large files a bit slower, you +# can opt to disable this feature. +# The default value is: YES. +# This tag requires that the tag SOURCE_BROWSER is set to YES. + +SOURCE_TOOLTIPS = YES + +# If the USE_HTAGS tag is set to YES then the references to source code will +# point to the HTML generated by the htags(1) tool instead of doxygen built-in +# source browser. The htags tool is part of GNU's global source tagging system +# (see https://www.gnu.org/software/global/global.html). You will need version +# 4.8.6 or higher. +# +# To use it do the following: +# - Install the latest version of global +# - Enable SOURCE_BROWSER and USE_HTAGS in the configuration file +# - Make sure the INPUT points to the root of the source tree +# - Run doxygen as normal +# +# Doxygen will invoke htags (and that will in turn invoke gtags), so these +# tools must be available from the command line (i.e. in the search path). +# +# The result: instead of the source browser generated by doxygen, the links to +# source code will now point to the output of htags. +# The default value is: NO. 
+# This tag requires that the tag SOURCE_BROWSER is set to YES. + +USE_HTAGS = NO + +# If the VERBATIM_HEADERS tag is set to YES then doxygen will generate a +# verbatim copy of the header file for each class for which an include is +# specified. Set to NO to disable this. +# See also: Section \class. +# The default value is: YES. + +VERBATIM_HEADERS = YES + +# If the CLANG_ASSISTED_PARSING tag is set to YES then doxygen will use the +# clang parser (see: http://clang.llvm.org/) for more accurate parsing at the +# cost of reduced performance. This can be particularly helpful with template +# rich C++ code for which doxygen's built-in parser lacks the necessary type +# information. +# Note: The availability of this option depends on whether or not doxygen was +# generated with the -Duse_libclang=ON option for CMake. +# The default value is: NO. + +CLANG_ASSISTED_PARSING = NO + +# If clang assisted parsing is enabled you can provide the compiler with command +# line options that you would normally use when invoking the compiler. Note that +# the include paths will already be set by doxygen for the files and directories +# specified with INPUT and INCLUDE_PATH. +# This tag requires that the tag CLANG_ASSISTED_PARSING is set to YES. + +CLANG_OPTIONS = + +# If clang assisted parsing is enabled you can provide the clang parser with the +# path to the compilation database (see: +# http://clang.llvm.org/docs/HowToSetupToolingForLLVM.html) used when the files +# were built. This is equivalent to specifying the "-p" option to a clang tool, +# such as clang-check. These options will then be passed to the parser. +# Note: The availability of this option depends on whether or not doxygen was +# generated with the -Duse_libclang=ON option for CMake.
+ +CLANG_DATABASE_PATH = + +#--------------------------------------------------------------------------- +# Configuration options related to the alphabetical class index +#--------------------------------------------------------------------------- + +# If the ALPHABETICAL_INDEX tag is set to YES, an alphabetical index of all +# compounds will be generated. Enable this if the project contains a lot of +# classes, structs, unions or interfaces. +# The default value is: YES. + +ALPHABETICAL_INDEX = YES + +# The COLS_IN_ALPHA_INDEX tag can be used to specify the number of columns in +# which the alphabetical index list will be split. +# Minimum value: 1, maximum value: 20, default value: 5. +# This tag requires that the tag ALPHABETICAL_INDEX is set to YES. + +COLS_IN_ALPHA_INDEX = 5 + +# In case all classes in a project start with a common prefix, all classes will +# be put under the same header in the alphabetical index. The IGNORE_PREFIX tag +# can be used to specify a prefix (or a list of prefixes) that should be ignored +# while generating the index headers. +# This tag requires that the tag ALPHABETICAL_INDEX is set to YES. + +IGNORE_PREFIX = + +#--------------------------------------------------------------------------- +# Configuration options related to the HTML output +#--------------------------------------------------------------------------- + +# If the GENERATE_HTML tag is set to YES, doxygen will generate HTML output +# The default value is: YES. + +GENERATE_HTML = NO + +# The HTML_OUTPUT tag is used to specify where the HTML docs will be put. If a +# relative path is entered the value of OUTPUT_DIRECTORY will be put in front of +# it. +# The default directory is: html. +# This tag requires that the tag GENERATE_HTML is set to YES. + +HTML_OUTPUT = html + +# The HTML_FILE_EXTENSION tag can be used to specify the file extension for each +# generated HTML page (for example: .htm, .php, .asp). +# The default value is: .html. 
+# This tag requires that the tag GENERATE_HTML is set to YES. + +HTML_FILE_EXTENSION = .html + +# The HTML_HEADER tag can be used to specify a user-defined HTML header file for +# each generated HTML page. If the tag is left blank doxygen will generate a +# standard header. +# +# To get valid HTML, the header file must include any scripts and style sheets +# that doxygen needs, which depend on the configuration options used (e.g. +# the setting GENERATE_TREEVIEW). It is highly recommended to start with a +# default header using +# doxygen -w html new_header.html new_footer.html new_stylesheet.css +# YourConfigFile +# and then modify the file new_header.html. See also section "Doxygen usage" +# for information on how to generate the default header that doxygen normally +# uses. +# Note: The header is subject to change so you typically have to regenerate the +# default header when upgrading to a newer version of doxygen. For a description +# of the possible markers and block names see the documentation. +# This tag requires that the tag GENERATE_HTML is set to YES. + +HTML_HEADER = + +# The HTML_FOOTER tag can be used to specify a user-defined HTML footer for each +# generated HTML page. If the tag is left blank doxygen will generate a standard +# footer. See HTML_HEADER for more information on how to generate a default +# footer and what special commands can be used inside the footer. See also +# section "Doxygen usage" for information on how to generate the default footer +# that doxygen normally uses. +# This tag requires that the tag GENERATE_HTML is set to YES. + +HTML_FOOTER = + +# The HTML_STYLESHEET tag can be used to specify a user-defined cascading style +# sheet that is used by each HTML page. It can be used to fine-tune the look of +# the HTML output. If left blank doxygen will generate a default style sheet. +# See also section "Doxygen usage" for information on how to generate the style +# sheet that doxygen normally uses.
+# Note: It is recommended to use HTML_EXTRA_STYLESHEET instead of this tag, as +# it is more robust and this tag (HTML_STYLESHEET) will in the future become +# obsolete. +# This tag requires that the tag GENERATE_HTML is set to YES. + +HTML_STYLESHEET = + +# The HTML_EXTRA_STYLESHEET tag can be used to specify additional user-defined +# cascading style sheets that are included after the standard style sheets +# created by doxygen. Using this option one can overrule certain style aspects. +# This is preferred over using HTML_STYLESHEET since it does not replace the +# standard style sheet and is therefore more robust against future updates. +# Doxygen will copy the style sheet files to the output directory. +# Note: The order of the extra style sheet files is of importance (e.g. the last +# style sheet in the list overrules the setting of the previous ones in the +# list). For an example see the documentation. +# This tag requires that the tag GENERATE_HTML is set to YES. + +HTML_EXTRA_STYLESHEET = + +# The HTML_EXTRA_FILES tag can be used to specify one or more extra images or +# other source files which should be copied to the HTML output directory. Note +# that these files will be copied to the base HTML output directory. Use the +# $relpath^ marker in the HTML_HEADER and/or HTML_FOOTER files to load these +# files. In the HTML_STYLESHEET file, use the file name only. Also note that the +# files will be copied as-is; there are no commands or markers available. +# This tag requires that the tag GENERATE_HTML is set to YES. + +HTML_EXTRA_FILES = + +# The HTML_COLORSTYLE_HUE tag controls the color of the HTML output. Doxygen +# will adjust the colors in the style sheet and background images according to +# this color. Hue is specified as an angle on a colorwheel, see +# https://en.wikipedia.org/wiki/Hue for more information. For instance the value +# 0 represents red, 60 is yellow, 120 is green, 180 is cyan, 240 is blue, 300 +# purple, and 360 is red again. 
+# Minimum value: 0, maximum value: 359, default value: 220. +# This tag requires that the tag GENERATE_HTML is set to YES. + +HTML_COLORSTYLE_HUE = 220 + +# The HTML_COLORSTYLE_SAT tag controls the purity (or saturation) of the colors +# in the HTML output. For a value of 0 the output will use grayscales only. A +# value of 255 will produce the most vivid colors. +# Minimum value: 0, maximum value: 255, default value: 100. +# This tag requires that the tag GENERATE_HTML is set to YES. + +HTML_COLORSTYLE_SAT = 100 + +# The HTML_COLORSTYLE_GAMMA tag controls the gamma correction applied to the +# luminance component of the colors in the HTML output. Values below 100 +# gradually make the output lighter, whereas values above 100 make the output +# darker. The value divided by 100 is the actual gamma applied, so 80 represents +# a gamma of 0.8, the value 220 represents a gamma of 2.2, and 100 does not +# change the gamma. +# Minimum value: 40, maximum value: 240, default value: 80. +# This tag requires that the tag GENERATE_HTML is set to YES. + +HTML_COLORSTYLE_GAMMA = 80 + +# If the HTML_TIMESTAMP tag is set to YES then the footer of each generated HTML +# page will contain the date and time when the page was generated. Setting this +# to YES can help to show when doxygen was last run and thus if the +# documentation is up to date. +# The default value is: NO. +# This tag requires that the tag GENERATE_HTML is set to YES. + +HTML_TIMESTAMP = NO + +# If the HTML_DYNAMIC_MENUS tag is set to YES then the generated HTML +# documentation will contain a main index with vertical navigation menus that +# are dynamically created via JavaScript. If disabled, the navigation index will +# consist of multiple levels of tabs that are statically embedded in every HTML +# page. Disable this option to support browsers that do not have JavaScript, +# like the Qt help browser. +# The default value is: YES. +# This tag requires that the tag GENERATE_HTML is set to YES.
+ +HTML_DYNAMIC_MENUS = YES + +# If the HTML_DYNAMIC_SECTIONS tag is set to YES then the generated HTML +# documentation will contain sections that can be hidden and shown after the +# page has loaded. +# The default value is: NO. +# This tag requires that the tag GENERATE_HTML is set to YES. + +HTML_DYNAMIC_SECTIONS = NO + +# With HTML_INDEX_NUM_ENTRIES one can control the preferred number of entries +# shown in the various tree structured indices initially; the user can expand +# and collapse entries dynamically later on. Doxygen will expand the tree to +# such a level that at most the specified number of entries are visible (unless +# a fully collapsed tree already exceeds this amount). So setting the number of +# entries to 1 will produce a fully collapsed tree by default. 0 is a special value +# representing an infinite number of entries and will result in a fully expanded +# tree by default. +# Minimum value: 0, maximum value: 9999, default value: 100. +# This tag requires that the tag GENERATE_HTML is set to YES. + +HTML_INDEX_NUM_ENTRIES = 100 + +# If the GENERATE_DOCSET tag is set to YES, additional index files will be +# generated that can be used as input for Apple's Xcode 3 integrated development +# environment (see: https://developer.apple.com/xcode/), introduced with OSX +# 10.5 (Leopard). To create a documentation set, doxygen will generate a +# Makefile in the HTML output directory. Running make will produce the docset in +# that directory and running make install will install the docset in +# ~/Library/Developer/Shared/Documentation/DocSets so that Xcode will find it at +# startup. See https://developer.apple.com/library/archive/featuredarticles/Doxy +# genXcode/_index.html for more information. +# The default value is: NO. +# This tag requires that the tag GENERATE_HTML is set to YES. + +GENERATE_DOCSET = NO + +# This tag determines the name of the docset feed.
A documentation feed provides +# an umbrella under which multiple documentation sets from a single provider +# (such as a company or product suite) can be grouped. +# The default value is: Doxygen generated docs. +# This tag requires that the tag GENERATE_DOCSET is set to YES. + +DOCSET_FEEDNAME = "Doxygen generated docs" + +# This tag specifies a string that should uniquely identify the documentation +# set bundle. This should be a reverse domain-name style string, e.g. +# com.mycompany.MyDocSet. Doxygen will append .docset to the name. +# The default value is: org.doxygen.Project. +# This tag requires that the tag GENERATE_DOCSET is set to YES. + +DOCSET_BUNDLE_ID = org.doxygen.Project + +# The DOCSET_PUBLISHER_ID tag specifies a string that should uniquely identify +# the documentation publisher. This should be a reverse domain-name style +# string, e.g. com.mycompany.MyDocSet.documentation. +# The default value is: org.doxygen.Publisher. +# This tag requires that the tag GENERATE_DOCSET is set to YES. + +DOCSET_PUBLISHER_ID = org.doxygen.Publisher + +# The DOCSET_PUBLISHER_NAME tag identifies the documentation publisher. +# The default value is: Publisher. +# This tag requires that the tag GENERATE_DOCSET is set to YES. + +DOCSET_PUBLISHER_NAME = Publisher + +# If the GENERATE_HTMLHELP tag is set to YES then doxygen generates three +# additional HTML index files: index.hhp, index.hhc, and index.hhk. The +# index.hhp is a project file that can be read by Microsoft's HTML Help Workshop +# (see: https://www.microsoft.com/en-us/download/details.aspx?id=21138) on +# Windows. +# +# The HTML Help Workshop contains a compiler that can convert all HTML output +# generated by doxygen into a single compiled HTML file (.chm). Compiled HTML +# files are now used as the Windows 98 help format, and will replace the old +# Windows help format (.hlp) on all Windows platforms in the future. 
Compressed +# HTML files also contain an index, a table of contents, and you can search for +# words in the documentation. The HTML workshop also contains a viewer for +# compressed HTML files. +# The default value is: NO. +# This tag requires that the tag GENERATE_HTML is set to YES. + +GENERATE_HTMLHELP = NO + +# The CHM_FILE tag can be used to specify the file name of the resulting .chm +# file. You can add a path in front of the file if the result should not be +# written to the html output directory. +# This tag requires that the tag GENERATE_HTMLHELP is set to YES. + +CHM_FILE = + +# The HHC_LOCATION tag can be used to specify the location (absolute path +# including file name) of the HTML help compiler (hhc.exe). If non-empty, +# doxygen will try to run the HTML help compiler on the generated index.hhp. +# The file has to be specified with full path. +# This tag requires that the tag GENERATE_HTMLHELP is set to YES. + +HHC_LOCATION = + +# The GENERATE_CHI flag controls if a separate .chi index file is generated +# (YES) or that it should be included in the master .chm file (NO). +# The default value is: NO. +# This tag requires that the tag GENERATE_HTMLHELP is set to YES. + +GENERATE_CHI = NO + +# The CHM_INDEX_ENCODING is used to encode HtmlHelp index (hhk), content (hhc) +# and project file content. +# This tag requires that the tag GENERATE_HTMLHELP is set to YES. + +CHM_INDEX_ENCODING = + +# The BINARY_TOC flag controls whether a binary table of contents is generated +# (YES) or a normal table of contents (NO) in the .chm file. Furthermore it +# enables the Previous and Next buttons. +# The default value is: NO. +# This tag requires that the tag GENERATE_HTMLHELP is set to YES. + +BINARY_TOC = NO + +# The TOC_EXPAND flag can be set to YES to add extra items for group members to +# the table of contents of the HTML help documentation and to the tree view. +# The default value is: NO. +# This tag requires that the tag GENERATE_HTMLHELP is set to YES. 
+ +TOC_EXPAND = NO + +# If the GENERATE_QHP tag is set to YES and both QHP_NAMESPACE and +# QHP_VIRTUAL_FOLDER are set, an additional index file will be generated that +# can be used as input for Qt's qhelpgenerator to generate a Qt Compressed Help +# (.qch) of the generated HTML documentation. +# The default value is: NO. +# This tag requires that the tag GENERATE_HTML is set to YES. + +GENERATE_QHP = NO + +# If the QHG_LOCATION tag is specified, the QCH_FILE tag can be used to specify +# the file name of the resulting .qch file. The path specified is relative to +# the HTML output folder. +# This tag requires that the tag GENERATE_QHP is set to YES. + +QCH_FILE = + +# The QHP_NAMESPACE tag specifies the namespace to use when generating Qt Help +# Project output. For more information please see Qt Help Project / Namespace +# (see: https://doc.qt.io/archives/qt-4.8/qthelpproject.html#namespace). +# The default value is: org.doxygen.Project. +# This tag requires that the tag GENERATE_QHP is set to YES. + +QHP_NAMESPACE = org.doxygen.Project + +# The QHP_VIRTUAL_FOLDER tag specifies the namespace to use when generating Qt +# Help Project output. For more information please see Qt Help Project / Virtual +# Folders (see: https://doc.qt.io/archives/qt-4.8/qthelpproject.html#virtual- +# folders). +# The default value is: doc. +# This tag requires that the tag GENERATE_QHP is set to YES. + +QHP_VIRTUAL_FOLDER = doc + +# If the QHP_CUST_FILTER_NAME tag is set, it specifies the name of a custom +# filter to add. For more information please see Qt Help Project / Custom +# Filters (see: https://doc.qt.io/archives/qt-4.8/qthelpproject.html#custom- +# filters). +# This tag requires that the tag GENERATE_QHP is set to YES. + +QHP_CUST_FILTER_NAME = + +# The QHP_CUST_FILTER_ATTRS tag specifies the list of the attributes of the +# custom filter to add. 
For more information please see Qt Help Project / Custom +# Filters (see: https://doc.qt.io/archives/qt-4.8/qthelpproject.html#custom- +# filters). +# This tag requires that the tag GENERATE_QHP is set to YES. + +QHP_CUST_FILTER_ATTRS = + +# The QHP_SECT_FILTER_ATTRS tag specifies the list of the attributes this +# project's filter section matches. For more information please see Qt Help +# Project / Filter Attributes (see: +# https://doc.qt.io/archives/qt-4.8/qthelpproject.html#filter-attributes). +# This tag requires that the tag GENERATE_QHP is set to YES. + +QHP_SECT_FILTER_ATTRS = + +# The QHG_LOCATION tag can be used to specify the location of Qt's +# qhelpgenerator. If non-empty, doxygen will try to run qhelpgenerator on the +# generated .qhp file. +# This tag requires that the tag GENERATE_QHP is set to YES. + +QHG_LOCATION = + +# If the GENERATE_ECLIPSEHELP tag is set to YES, additional index files will be +# generated; together with the HTML files, they form an Eclipse help plugin. To +# install this plugin and make it available under the help contents menu in +# Eclipse, the contents of the directory containing the HTML and XML files need +# to be copied into the plugins directory of Eclipse. The name of the directory +# within the plugins directory should be the same as the ECLIPSE_DOC_ID value. +# After copying, Eclipse needs to be restarted before the help appears. +# The default value is: NO. +# This tag requires that the tag GENERATE_HTML is set to YES. + +GENERATE_ECLIPSEHELP = NO + +# A unique identifier for the Eclipse help plugin. When installing the plugin +# the directory name containing the HTML and XML files should also have this +# name. Each documentation set should have its own identifier. +# The default value is: org.doxygen.Project. +# This tag requires that the tag GENERATE_ECLIPSEHELP is set to YES.
+ +ECLIPSE_DOC_ID = org.doxygen.Project + +# If you want full control over the layout of the generated HTML pages it might +# be necessary to disable the index and replace it with your own. The +# DISABLE_INDEX tag can be used to turn on/off the condensed index (tabs) at top +# of each HTML page. A value of NO enables the index and the value YES disables +# it. Since the tabs in the index contain the same information as the navigation +# tree, you can set this option to YES if you also set GENERATE_TREEVIEW to YES. +# The default value is: NO. +# This tag requires that the tag GENERATE_HTML is set to YES. + +DISABLE_INDEX = NO + +# The GENERATE_TREEVIEW tag is used to specify whether a tree-like index +# structure should be generated to display hierarchical information. If the tag +# value is set to YES, a side panel will be generated containing a tree-like +# index structure (just like the one that is generated for HTML Help). For this +# to work a browser that supports JavaScript, DHTML, CSS and frames is required +# (i.e. any modern browser). Windows users are probably better off using the +# HTML help feature. Via custom style sheets (see HTML_EXTRA_STYLESHEET) one can +# further fine-tune the look of the index. As an example, the default style +# sheet generated by doxygen has an example that shows how to put an image at +# the root of the tree instead of the PROJECT_NAME. Since the tree basically has +# the same information as the tab index, you could consider setting +# DISABLE_INDEX to YES when enabling this option. +# The default value is: NO. +# This tag requires that the tag GENERATE_HTML is set to YES. + +GENERATE_TREEVIEW = NO + +# The ENUM_VALUES_PER_LINE tag can be used to set the number of enum values that +# doxygen will group on one line in the generated HTML documentation. +# +# Note that a value of 0 will completely suppress the enum values from appearing +# in the overview section. +# Minimum value: 0, maximum value: 20, default value: 4. 
+# This tag requires that the tag GENERATE_HTML is set to YES. + +ENUM_VALUES_PER_LINE = 4 + +# If the treeview is enabled (see GENERATE_TREEVIEW) then this tag can be used +# to set the initial width (in pixels) of the frame in which the tree is shown. +# Minimum value: 0, maximum value: 1500, default value: 250. +# This tag requires that the tag GENERATE_HTML is set to YES. + +TREEVIEW_WIDTH = 250 + +# If the EXT_LINKS_IN_WINDOW option is set to YES, doxygen will open links to +# external symbols imported via tag files in a separate window. +# The default value is: NO. +# This tag requires that the tag GENERATE_HTML is set to YES. + +EXT_LINKS_IN_WINDOW = NO + +# Use this tag to change the font size of LaTeX formulas included as images in +# the HTML documentation. When you change the font size after a successful +# doxygen run you need to manually remove any form_*.png images from the HTML +# output directory to force them to be regenerated. +# Minimum value: 8, maximum value: 50, default value: 10. +# This tag requires that the tag GENERATE_HTML is set to YES. + +FORMULA_FONTSIZE = 10 + +# Use the FORMULA_TRANSPARENT tag to determine whether or not the images +# generated for formulas are transparent PNGs. Transparent PNGs are not +# supported properly for IE 6.0, but are supported on all modern browsers. +# +# Note that when changing this option you need to delete any form_*.png files in +# the HTML output directory before the changes have effect. +# The default value is: YES. +# This tag requires that the tag GENERATE_HTML is set to YES. + +FORMULA_TRANSPARENT = YES + +# The FORMULA_MACROFILE can contain LaTeX \newcommand and \renewcommand commands +# to create new LaTeX commands to be used in formulas as building blocks. See +# the section "Including formulas" for details. 
+ +FORMULA_MACROFILE = + +# Enable the USE_MATHJAX option to render LaTeX formulas using MathJax (see +# https://www.mathjax.org) which uses client side JavaScript for the rendering +# instead of using pre-rendered bitmaps. Use this if you do not have LaTeX +# installed or if you want formulas to look prettier in the HTML output. When +# enabled you may also need to install MathJax separately and configure the path +# to it using the MATHJAX_RELPATH option. +# The default value is: NO. +# This tag requires that the tag GENERATE_HTML is set to YES. + +USE_MATHJAX = NO + +# When MathJax is enabled you can set the default output format to be used for +# the MathJax output. See the MathJax site (see: +# http://docs.mathjax.org/en/latest/output.html) for more details. +# Possible values are: HTML-CSS (which is slower, but has the best +# compatibility), NativeMML (i.e. MathML) and SVG. +# The default value is: HTML-CSS. +# This tag requires that the tag USE_MATHJAX is set to YES. + +MATHJAX_FORMAT = HTML-CSS + +# When MathJax is enabled you need to specify the location relative to the HTML +# output directory using the MATHJAX_RELPATH option. The destination directory +# should contain the MathJax.js script. For instance, if the mathjax directory +# is located at the same level as the HTML output directory, then +# MATHJAX_RELPATH should be ../mathjax. The default value points to the MathJax +# Content Delivery Network so you can quickly see the result without installing +# MathJax. However, it is strongly recommended to install a local copy of +# MathJax from https://www.mathjax.org before deployment. +# The default value is: https://cdnjs.cloudflare.com/ajax/libs/mathjax/2.7.5/. +# This tag requires that the tag USE_MATHJAX is set to YES. + +MATHJAX_RELPATH = https://cdnjs.cloudflare.com/ajax/libs/mathjax/2.7.5/ + +# The MATHJAX_EXTENSIONS tag can be used to specify one or more MathJax +# extension names that should be enabled during MathJax rendering.
For example +# MATHJAX_EXTENSIONS = TeX/AMSmath TeX/AMSsymbols +# This tag requires that the tag USE_MATHJAX is set to YES. + +MATHJAX_EXTENSIONS = + +# The MATHJAX_CODEFILE tag can be used to specify a file with javascript pieces +# of code that will be used on startup of the MathJax code. See the MathJax site +# (see: http://docs.mathjax.org/en/latest/output.html) for more details. For an +# example see the documentation. +# This tag requires that the tag USE_MATHJAX is set to YES. + +MATHJAX_CODEFILE = + +# When the SEARCHENGINE tag is enabled doxygen will generate a search box for +# the HTML output. The underlying search engine uses javascript and DHTML and +# should work on any modern browser. Note that when using HTML help +# (GENERATE_HTMLHELP), Qt help (GENERATE_QHP), or docsets (GENERATE_DOCSET) +# there is already a search function so this one should typically be disabled. +# For large projects the javascript based search engine can be slow, then +# enabling SERVER_BASED_SEARCH may provide a better solution. It is possible to +# search using the keyboard; to jump to the search box use <access key> + S +# (what the <access key> is depends on the OS and browser, but it is typically +# <CTRL>, <ALT>/<option>