#ifndef CUDATOOLS_H #define CUDATOOLS_H #include "Macros.h" #include #include #include #include #include #include namespace CudaTools { struct Event; /** * Simple wrapper for the name of a stream. Its purposes is to allow for * 'streams' to be passed on host code, and allowing for simple syntax * for waiting. */ struct StreamID { public: std::string mId; StreamID() : mId(""){}; /** * The constructor for a StreamID. */ StreamID(const std::string& id_) : mId(id_){}; StreamID(const char* id_) : mId(id_){}; void wait() const; /**< Makes host wait for this stream. */ /** * Makes this stream wait for this event. Does not block the host. */ void wait(const Event& event) const; }; static const StreamID DEF_MEM_STREAM = StreamID{"defaultMemory"}; static const StreamID DEF_CUBLAS_STREAM = StreamID{"defaultCublas"}; static const StreamID DEF_KERNEL_STREAM = StreamID{"defaultKernel"}; /** * Allocates memory on the device. */ void* malloc(const size_t size); /** * Pins memory on the host. */ void pin(void* const pHost, const size_t size); /** * Pushes memory from the device to the host. */ StreamID push(void* const pHost, void* const pDevice, const size_t size, const StreamID& stream = DEF_MEM_STREAM); /** * Pulls memory from the device back to the host. */ StreamID pull(void* const pHost, void* const pDevice, const size_t size, const StreamID& stream = DEF_MEM_STREAM); /** * Copies memory on the device to another location on the device. */ StreamID deviceCopy(void* const pSrc, void* const pDest, const size_t size, const StreamID& stream = DEF_MEM_STREAM); /** * Frees memory on the device. */ void free(void* const pDevice); #ifdef CUDACC cudaDeviceProp getDeviceProp(); static cudaDeviceProp DeviceProperties = getDeviceProp(); const char* cublasGetErrorString(cublasStatus_t status); #endif /** * A class that manages various CUDA Runtime components, such as * streams, events, and handles. */ class Manager { private: static Manager mManagerInstance; Manager(const std::vector& names); ~Manager(); #ifdef CUDACC std::unordered_map mStreams; cublasHandle_t mCublas; #endif public: /** * Used to get the global CudaTools::Manager instance. */ static Manager* get() { return &mManagerInstance; }; void waitFor(const StreamID& stream) const; /**< Waits for the stream provided. */ void sync() const; /**< Waits until all device code has finished. */ void addStream(const std::string& name); /**< Creates a stream with the given name. */ #ifdef CUDACC cudaStream_t stream(const StreamID& stream) const; cublasHandle_t cublasHandle() const; #endif }; namespace Kernel { /** * A struct that contains the kernel launch parameters. */ struct Settings { public: #ifdef CUDACC dim3 blockGrid; dim3 threadBlock; size_t sharedMemoryBytes = 0; #else size_t threads; #endif StreamID stream; Settings() = default; void setGridDim(const size_t x); /**< Sets the Grid dimensions. */ void setGridDim(const size_t x, const size_t y); /**< Sets the Grid dimensions. */ void setGridDim(const size_t x, const size_t y, const size_t z); /**< Sets the Grid dimensions. */ void setBlockDim(const size_t x); /**< Sets the Thread Block dimensions. */ void setBlockDim(const size_t x, const size_t y); /**< Sets the Thread Block dimensions. */ void setBlockDim(const size_t x, const size_t y, const size_t z); /**< Sets the Thread Block dimensions. */ void setSharedMemSize(const size_t bytes); /**< Sets the static shared memory size. */ void setStream(const StreamID& stream); /**< Sets the stream. */ }; /** * Returns a kernel launch parameters based on the number of threads, and optionally * a stream. Should only be used for 'embarassingly parallel' situations, or where * each thread corresponds some sort of index. */ Settings basic(const size_t threads, const StreamID& stream = DEF_KERNEL_STREAM); /** * Launches a kernel with the provided function, settings and its arguments. */ template StreamID launch(F func, const Kernel::Settings& sett, Args... args) { #ifdef CUDA func<<stream(sett.stream.mId)>>>(args...); #else func(args...); #endif return sett.stream; } }; // namespace Kernel template class Array; /** * A class that holds information about an Array. */ class Shape { private: template friend class Array; uint32_t mAxes; uint32_t mItems; uint32_t mAxisDim[CUDATOOLS_ARRAY_MAX_AXES] = {0}; uint32_t mStride[CUDATOOLS_ARRAY_MAX_AXES] = {0}; public: HD Shape() : mAxes(0), mItems(1){}; /** * The constructor for a Shape. * \param dims an initializer list of the dimensions. */ HD Shape(const std::initializer_list dims); HD uint32_t axes() const; /**< Gets the number of axes. */ HD uint32_t items() const; /**< Gets the total number of items. */ HD uint32_t length() const; /**< For 1D shapes, gets the length. In general, gets the dimension of the last axis. */ HD uint32_t rows() const; /**< For 2D shapes, gets the number of rows. In general, gets the dimension of the second to last axis. */ HD uint32_t cols() const; /**< For 2D shapes, gets the number of columns. In general, gets the dimension of the second to last axis. */ HD uint32_t dim(const uint32_t axis) const; /**< Gets the dimension size of the specified axis. */ HD uint32_t stride(const uint32_t axis) const; /**< Gets the stride of the specified axis. */ /** * Gets the shape at a specific axis of this shape. * \param axis the axis of where the new shape starts. */ HD Shape subshape(const uint32_t axis) const; HD bool operator==(const Shape& s) const; /**< Equals operator. */ HD bool operator!=(const Shape& s) const; /**< Not equals operator. */ }; std::ostream& operator<<(std::ostream& out, const Shape& s); /** * A simple class that manages a CUDA Event. */ struct Event { #ifdef CUDACC cudaEvent_t mEvent; #endif Event(); ~Event(); void record(const StreamID& stream); /**< Records a event from a stream. */ }; template struct FuncHolder { F mFunc; std::tuple mArgs; FuncHolder() = delete; FuncHolder(F func, Args... args) : mFunc(func), mArgs(std::make_tuple(args...)){}; static void run(void* data) { FuncHolder* fh = (FuncHolder*)(data); std::apply([fh](auto&&... args) { fh->mFunc(args...); }, fh->mArgs); }; }; /** * Accessory struct to deal with host callbacks for CUDA Graphs in a nice fashion. */ struct GraphTools { std::vector mHostData; std::vector mEvents; ~GraphTools(); /** * Within a function that is being stream captured, launch a host function that can * be captured into the graph. */ template void launchHostFunction(const StreamID& stream, F func, Args&&... args) { #ifdef CUDACC FuncHolder* fh = new FuncHolder(func, args...); mHostData.push_back((void*)fh); cudaHostFn_t run_func = fh->run; CUDA_CHECK(cudaLaunchHostFunc(Manager::get()->stream(stream), run_func, fh)); #else func(args...); #endif } /** * Makes a new branch in the graph to be run in parallel by a new stream. * \param orig_stream the original stream to branch from. * \param branch_stream the stream of the new branch. */ void makeBranch(const StreamID& orig_stream, const StreamID& branch_stream); /** * Joins a existing branch in the graph to collapse a parallel block. * \param orig_stream the original stream to join the branch to. * \param branch_stream the stream of the branch to join. */ void joinBranch(const StreamID& orig_stream, const StreamID& branch_stream); }; /** * A class that manages CUDA Graphs. */ template class Graph { private: #ifdef CUDACC cudaGraph_t mGraph; cudaGraphExec_t mInstance; #endif FuncHolder mFuncHolder; StreamID mStream; public: Graph() = delete; /** * The constructor for a Graph, which captures the function. * \param func the function to capture. * \param stream the origin stream to use. * \param args the arguments of the function. */ Graph(const StreamID& stream, F func, Args... args) : mFuncHolder(func, args...), mStream(stream) { #ifdef CUDACC CUDA_CHECK( cudaStreamBeginCapture(Manager::get()->stream(mStream), cudaStreamCaptureModeGlobal)); mFuncHolder.run((void*)&mFuncHolder); CUDA_CHECK(cudaStreamEndCapture(Manager::get()->stream(mStream), &mGraph)); CUDA_CHECK(cudaGraphInstantiate(&mInstance, mGraph, NULL, NULL, 0)); #endif }; ~Graph() { #ifdef CUDACC CUDA_CHECK(cudaGraphDestroy(mGraph)); CUDA_CHECK(cudaGraphExecDestroy(mInstance)); #endif }; /** * Executes the instantiated graph, or simply runs the function with provided * arguments if compiling for CPU. */ StreamID execute() const { #ifdef CUDACC cudaGraphLaunch(mInstance, Manager::get()->stream(mStream)); #else mFuncHolder.run((void*)&mFuncHolder); #endif return mStream; } }; }; // namespace CudaTools #ifdef CUDATOOLS_IMPLEMENTATION namespace CudaTools { ////////////////////// // StreamID Methods // ////////////////////// void StreamID::wait() const { Manager::get()->waitFor(mId); } void StreamID::wait(const Event& event) const { #ifdef CUDACC CUDA_CHECK(cudaStreamWaitEvent(Manager::get()->stream(mId), event.mEvent, 0)); #endif } //////////////////// // Memory Methods // //////////////////// void* malloc(const size_t size) { #ifdef CUDACC void* pDevice; CUDA_CHECK(cudaMalloc(&pDevice, size)); return pDevice; #else return nullptr; #endif } void free(void* const pDevice) { #ifdef CUDACC if (pDevice != nullptr) CUDA_CHECK(cudaFree(pDevice)); #endif } StreamID push(void* const pHost, void* const pDevice, const size_t size, const StreamID& stream) { #ifdef CUDACC CUDA_CHECK(cudaMemcpyAsync(pDevice, pHost, size, cudaMemcpyHostToDevice, Manager::get()->stream(stream))); #endif return stream; } StreamID pull(void* const pHost, void* const pDevice, const size_t size, const StreamID& stream) { #ifdef CUDACC CUDA_CHECK(cudaMemcpyAsync(pHost, pDevice, size, cudaMemcpyDeviceToHost, Manager::get()->stream(stream))); #endif return stream; } StreamID deviceCopy(void* const pSrc, void* const pDest, const size_t size, const StreamID& stream) { #ifdef CUDACC CUDA_CHECK(cudaMemcpyAsync(pDest, pSrc, size, cudaMemcpyDeviceToDevice, Manager::get()->stream(stream))); #endif return stream; } void pin(void* const pHost, const size_t size) { #ifdef CUDACC CUDA_CHECK(cudaHostRegister(pHost, size, cudaHostRegisterDefault)); #endif } #ifdef CUDACC cudaDeviceProp getDeviceProp() { cudaSetDevice(0); cudaDeviceProp deviceProp; cudaGetDeviceProperties(&deviceProp, 0); return deviceProp; } #endif ///////////////////// // Manager Methods // ///////////////////// Manager::Manager(const std::vector& names) { #ifdef CUDACC for (auto name : names) { addStream(name); } CUBLAS_CHECK(cublasCreate(&mCublas)); #endif } Manager::~Manager() { #ifdef CUDACC for (auto& it : mStreams) { CUDA_CHECK(cudaStreamDestroy(it.second)); } CUBLAS_CHECK(cublasDestroy(mCublas)); #endif } void Manager::waitFor(const StreamID& stream) const { #ifdef CUDACC auto it = mStreams.find(stream.mId); if (it != mStreams.end()) { CUDA_CHECK(cudaStreamSynchronize(it->second)); } else { CT_ERROR(true, ("Invalid stream " + stream.mId).c_str()); } #endif } void Manager::sync() const { #ifdef CUDACC CUDA_CHECK(cudaDeviceSynchronize()); #endif } void Manager::addStream(const std::string& name) { #ifdef CUDACC cudaStream_t s; CUDA_CHECK(cudaStreamCreate(&s)); mStreams[name] = s; #endif } #ifdef CUDACC cudaStream_t Manager::stream(const StreamID& stream) const { auto it = mStreams.find(stream.mId); if (it != mStreams.end()) { return it->second; } else { CT_ERROR(true, ("Invalid stream " + stream.mId).c_str()); } } cublasHandle_t Manager::cublasHandle() const { return mCublas; }; Manager Manager::mManagerInstance = Manager({"defaultMemory", "defaultCublas", "defaultKernel"}); #else Manager Manager::mManagerInstance = Manager({""}); #endif //////////////////// // Kernel Methods // //////////////////// namespace Kernel { void Settings::setGridDim(const size_t x) { #ifdef CUDACC CT_ERROR_IF(x, >, DeviceProperties.maxGridSize[0], "Total grid size too large") blockGrid.x = x; blockGrid.y = 1; blockGrid.z = 1; #endif } void Settings::setGridDim(const size_t x, const size_t y) { #ifdef CUDACC CT_ERROR_IF(x * y, >, DeviceProperties.maxGridSize[0], "Total grid size too large."); CT_ERROR_IF(x, >, DeviceProperties.maxGridSize[0], "Grid dimension 'x' too large."); CT_ERROR_IF(y, >, DeviceProperties.maxGridSize[1], "Grid dimension 'y' too large."); blockGrid.x = x; blockGrid.y = y; blockGrid.z = 1; #endif } void Settings::setGridDim(const size_t x, const size_t y, const size_t z) { #ifdef CUDACC CT_ERROR_IF(x * y * z, >, DeviceProperties.maxGridSize[0], "Total grid size too large."); CT_ERROR_IF(x, >, DeviceProperties.maxGridSize[0], "Grid dimension 'x' too large."); CT_ERROR_IF(y, >, DeviceProperties.maxGridSize[1], "Grid dimension 'y' too large."); CT_ERROR_IF(z, >, DeviceProperties.maxGridSize[2], "Grid dimension 'z' too large."); blockGrid.x = x; blockGrid.y = y; blockGrid.z = z; #endif } void Settings::setBlockDim(const size_t x) { #ifdef CUDACC CT_ERROR_IF(x, >, DeviceProperties.maxThreadsDim[0], "Total block size too large."); threadBlock.x = x; threadBlock.y = 1; threadBlock.z = 1; #endif } void Settings::setBlockDim(const size_t x, const size_t y) { #ifdef CUDACC CT_ERROR_IF(x * y, >, DeviceProperties.maxThreadsDim[0], "Total block size too large."); CT_ERROR_IF(x, >, DeviceProperties.maxThreadsDim[0], "Block dimension 'x' too large."); CT_ERROR_IF(y, >, DeviceProperties.maxThreadsDim[1], "Block dimension 'y' too large."); threadBlock.x = x; threadBlock.y = y; threadBlock.z = 1; #endif } void Settings::setBlockDim(const size_t x, const size_t y, const size_t z) { #ifdef CUDACC CT_ERROR_IF(x * y * z, >, DeviceProperties.maxThreadsDim[0], "Total block size too large."); CT_ERROR_IF(x, >, DeviceProperties.maxThreadsDim[0], "Block dimension 'x' too large."); CT_ERROR_IF(y, >, DeviceProperties.maxThreadsDim[1], "Block dimension 'y' too large."); CT_ERROR_IF(z, >, DeviceProperties.maxThreadsDim[2], "Block dimension 'z' too large."); threadBlock.x = x; threadBlock.y = y; threadBlock.z = z; #endif } void Settings::setSharedMemSize(const size_t bytes) { #ifdef CUDACC sharedMemoryBytes = bytes; #endif } void Settings::setStream(const StreamID& stream_) { #ifdef CUDACC stream = stream_; #endif } Settings basic(const size_t threads, const StreamID& stream) { Settings sett; #ifdef CUDACC auto max_threads = DeviceProperties.maxThreadsPerBlock; size_t grid_blocks = (threads + max_threads - 1) / max_threads; // ceil(threads / max_threads) size_t block_threads = (threads + grid_blocks - 1) / grid_blocks; // ceil(threads / grid_blocks) sett.setGridDim(grid_blocks); sett.setBlockDim(block_threads); sett.setStream(stream); #else sett.threads = threads; #endif return sett; } }; // namespace Kernel ///////////////////// // Shape Functions // ///////////////////// HD Shape::Shape(const std::initializer_list dims) : mAxes(dims.size()), mItems(1) { CT_ERROR_IF(dims.size(), >, CUDATOOLS_ARRAY_MAX_AXES, "Number of axes exceeds max axes"); mAxes = dims.size(); if (mAxes == 0) return; auto it = dims.end() - 1; mItems = 1; for (uint32_t iAxis = mAxes - 1; iAxis < mAxes; --iAxis) { uint32_t dim = *it; CT_ERROR_IF(dim, ==, 0, "Axis dimension cannot be 0"); mAxisDim[iAxis] = dim; mStride[iAxis] = mItems; mItems *= dim; --it; } if (mAxes == 1) return; // Swap last two, for column major storage. mStride[mAxes - 2] = 1; mStride[mAxes - 1] = mAxisDim[mAxes - 2]; } HD uint32_t Shape::axes() const { return mAxes; }; HD uint32_t Shape::items() const { return mItems; }; HD uint32_t Shape::length() const { return mAxisDim[mAxes - 1]; } HD uint32_t Shape::rows() const { return mAxisDim[mAxes - 2]; } HD uint32_t Shape::cols() const { return mAxisDim[mAxes - 1]; } HD uint32_t Shape::dim(const uint32_t axis) const { return mAxisDim[axis]; } HD uint32_t Shape::stride(const uint32_t axis) const { return mStride[axis]; } HD bool Shape::operator==(const Shape& s) const { if (mAxes != s.mAxes) { return false; } for (uint32_t iAxis = 0; iAxis < mAxes; ++iAxis) { if (mAxisDim[iAxis] != s.mAxisDim[iAxis]) { return false; } } return true; } HD bool Shape::operator!=(const Shape& s) const { return not(*this == s); } HD Shape Shape::subshape(const uint32_t axis) const { CT_ERROR_IF(axis, >, mAxes, "Axis number exceeds number of axes."); if (axis == mAxes) return Shape({1}); Shape new_shape({}); new_shape.mAxes = mAxes - axis; new_shape.mItems = mItems; for (uint32_t iAxis = 0; iAxis < axis; iAxis++) { new_shape.mItems /= mAxisDim[iAxis]; } for (uint32_t iAxis = axis; iAxis < mAxes; iAxis++) { new_shape.mAxisDim[iAxis - axis] = mAxisDim[iAxis]; new_shape.mStride[iAxis - axis] = mStride[iAxis]; } return new_shape; } std::ostream& operator<<(std::ostream& out, const Shape& s) { out << "("; if (s.axes() == 0) return out << ")"; for (uint32_t iAxis = 0; iAxis < s.axes() - 1; ++iAxis) { out << s.dim(iAxis) << ", "; } return out << s.dim(s.axes() - 1) << ")"; } /////////////////// // Event Methods // /////////////////// Event::Event() { #ifdef CUDACC CUDA_CHECK(cudaEventCreate(&mEvent)); #endif } Event::~Event() { #ifdef CUDACC CUDA_CHECK(cudaEventDestroy(mEvent)); #endif } void Event::record(const StreamID& stream) { #ifdef CUDACC CUDA_CHECK(cudaEventRecord(mEvent, Manager::get()->stream(stream))); #endif } //////////////////////// // GraphTools Methods // //////////////////////// GraphTools::~GraphTools() { #ifdef CUDACC for (void* func : mHostData) { delete func; } for (Event* event : mEvents) { delete event; } #endif } void GraphTools::makeBranch(const StreamID& orig_stream, const StreamID& branch_stream) { Event* event = new Event(); event->record(orig_stream); mEvents.push_back(event); branch_stream.wait(*event); } void GraphTools::joinBranch(const StreamID& orig_stream, const StreamID& branch_stream) { Event* event = new Event(); event->record(branch_stream); mEvents.push_back(event); orig_stream.wait(*event); } #ifdef CUDACC const char* cublasGetErrorString(cublasStatus_t error) { switch (error) { case CUBLAS_STATUS_SUCCESS: return "CUBLAS_STATUS_SUCCESS"; case CUBLAS_STATUS_NOT_INITIALIZED: return "CUBLAS_STATUS_NOT_INITIALIZED"; case CUBLAS_STATUS_ALLOC_FAILED: return "CUBLAS_STATUS_ALLOC_FAILED"; case CUBLAS_STATUS_INVALID_VALUE: return "CUBLAS_STATUS_INVALID_VALUE"; case CUBLAS_STATUS_ARCH_MISMATCH: return "CUBLAS_STATUS_ARCH_MISMATCH"; case CUBLAS_STATUS_MAPPING_ERROR: return "CUBLAS_STATUS_MAPPING_ERROR"; case CUBLAS_STATUS_EXECUTION_FAILED: return "CUBLAS_STATUS_EXECUTION_FAILED"; case CUBLAS_STATUS_INTERNAL_ERROR: return "CUBLAS_STATUS_INTERNAL_ERROR"; } return ""; } #endif }; // namespace CudaTools #endif // CUDATOOLS_IMPLEMENTATION #endif // CUDATOOLS_H