// A library and framework for developing CPU-CUDA compatible applications under one unified code.
#ifndef CUDATOOLS_H
#define CUDATOOLS_H
#include "Macros.h"
#include <functional>
#include <iostream>
#include <string>
#include <tuple>
#include <unordered_map>
#include <vector>
namespace CudaTools {
struct Event;
/**
 * Simple wrapper for the name of a stream. Its purpose is to allow
 * 'streams' to be passed in host code, and to provide simple syntax
 * for waiting.
 */
struct StreamID {
  public:
    std::string mId; // Name of the stream; used as the key into Manager's stream map.

    StreamID() : mId(""){};
    /**
     * The constructor for a StreamID.
     * \param id_ the name of the stream.
     */
    StreamID(const std::string& id_) : mId(id_){};
    StreamID(const char* id_) : mId(id_){};

    void wait() const; /**< Makes host wait for this stream. */
    /**
     * Makes this stream wait for the given event. Does not block the host.
     */
    void wait(const Event& event) const;
};
// Default streams, created by the global Manager instance at startup.
static const StreamID DEF_MEM_STREAM = StreamID{"defaultMemory"};    /**< Default stream for memory operations. */
static const StreamID DEF_CUBLAS_STREAM = StreamID{"defaultCublas"}; /**< Default stream for cuBLAS calls. */
static const StreamID DEF_KERNEL_STREAM = StreamID{"defaultKernel"}; /**< Default stream for kernel launches. */
/**
 * Allocates memory on the device.
 * \param size the number of bytes to allocate.
 * \return a device pointer (nullptr in CPU-only builds).
 */
void* malloc(const size_t size);
/**
 * Frees memory on the device. Safe to call with nullptr.
 */
void free(void* const pDevice);
/**
 * Copies memory from the source pointer to the dest pointer, asynchronously
 * on the given stream.
 * \return the stream used for the copy.
 */
StreamID copy(void* const source, void* const dest, const size_t size,
              const StreamID& stream = DEF_MEM_STREAM);
/**
 * Initializes or sets device memory to a value (byte-wise), asynchronously
 * on the given stream.
 * \return the stream used.
 */
StreamID memset(void* const pDevice, int value, const size_t size,
                const StreamID& stream = DEF_MEM_STREAM);
/**
 * Pins (page-locks) memory on the host, enabling asynchronous transfers.
 */
void pin(void* const pHost, const size_t size);

#ifdef CUDACC
cudaDeviceProp getDeviceProp(); /**< Queries the properties of device 0. */
// Cached device properties, used to validate kernel launch dimensions.
static cudaDeviceProp DeviceProperties = getDeviceProp();
const char* cublasGetErrorString(cublasStatus_t status); /**< String for a cuBLAS status code. */
#endif
/**
 * A class that manages various CUDA Runtime components, such as
 * streams, events, and handles. A single global instance is created during
 * static initialization; access it through Manager::get().
 */
class Manager {
  private:
    static Manager mManagerInstance;                // the global singleton
    Manager(const std::vector<std::string>& names); // creates one stream per name
    ~Manager();
#ifdef CUDACC
    std::unordered_map<std::string, cudaStream_t> mStreams; // streams keyed by name
    cublasHandle_t mCublas;                                 // shared cuBLAS handle
    // cusparseHandle_t mCusparse;
#endif
  public:
    /**
     * Used to get the global CudaTools::Manager instance.
     */
    static Manager* get() { return &mManagerInstance; };

    void waitFor(const StreamID& stream) const; /**< Waits for the stream provided. */
    void sync() const;                          /**< Waits until all device code has finished. */
    void addStream(const std::string& name);    /**< Creates a stream with the given name. */
#ifdef CUDACC
    cudaStream_t stream(const StreamID& stream) const; /**< Gets the stream with the given name. */
    cublasHandle_t cublasHandle() const;               /**< Gets the shared cuBLAS handle. */
    // cusparseHandle_t cusparseHandle() const;
#endif
};
namespace Kernel {
/**
 * A struct that contains the kernel launch parameters.
 */
struct Settings {
  public:
#ifdef CUDACC
    dim3 blockGrid;               // grid dimensions, in blocks
    dim3 threadBlock;             // block dimensions, in threads
    size_t sharedMemoryBytes = 0; // dynamic shared memory per block (third launch parameter)
#else
    size_t threads; // total thread count for the CPU fallback
#endif
    StreamID stream; // stream the kernel will be launched on

    Settings() = default;

    void setGridDim(const size_t x);                 /**< Sets the Grid dimensions. */
    void setGridDim(const size_t x, const size_t y); /**< Sets the Grid dimensions. */
    void setGridDim(const size_t x, const size_t y,
                    const size_t z);                  /**< Sets the Grid dimensions. */
    void setBlockDim(const size_t x);                 /**< Sets the Thread Block dimensions. */
    void setBlockDim(const size_t x, const size_t y); /**< Sets the Thread Block dimensions. */
    void setBlockDim(const size_t x, const size_t y,
                     const size_t z); /**< Sets the Thread Block dimensions. */
    void setSharedMemSize(const size_t bytes); /**< Sets the dynamic shared memory size. */
    void setStream(const StreamID& stream);    /**< Sets the stream. */
};
/**
 * Returns kernel launch parameters based on the number of threads, and optionally
 * a stream. Should only be used for 'embarrassingly parallel' situations, or where
 * each thread corresponds to some sort of index.
 */
Settings basic(const size_t threads, const StreamID& stream = DEF_KERNEL_STREAM);

/**
 * Launches a kernel with the provided function, settings and its arguments.
 * In CPU-only builds the function is simply called on the host instead.
 * \return the stream the kernel was launched on (so callers can wait on it).
 * NOTE(review): the launch is not followed by a cudaGetLastError() check, so
 * bad launch configurations surface only at a later synchronizing call.
 */
template <typename F, typename... Args>
StreamID launch(F func, const Kernel::Settings& sett, Args... args) {
#ifdef CUDACC
    func<<<sett.blockGrid, sett.threadBlock, sett.sharedMemoryBytes,
           Manager::get()->stream(sett.stream.mId)>>>(args...);
#else
    func(args...);
#endif
    return sett.stream;
}
}; // namespace Kernel
template <typename T> class Array; // forward declaration; Array is a friend of Shape

/**
 * A class that holds information about an Array.
 */
class Shape {
  private:
    template <typename T> friend class Array;
    uint32_t mAxes;  // number of axes
    uint32_t mItems; // total number of items across all axes
    uint32_t mAxisDim[CUDATOOLS_ARRAY_MAX_AXES] = {0}; // dimension of each axis
    uint32_t mStride[CUDATOOLS_ARRAY_MAX_AXES] = {0};  // stride of each axis, in items

  public:
    HD Shape() : mAxes(0), mItems(1){};
    /**
     * The constructor for a Shape.
     * \param dims an initializer list of the dimensions.
     */
    HD Shape(const std::initializer_list<uint32_t> dims);

    HD uint32_t axes() const;   /**< Gets the number of axes. */
    HD uint32_t items() const;  /**< Gets the total number of items. */
    HD uint32_t length() const; /**< For 1D shapes, gets the length. In general, gets the
                                     dimension of the last axis. */
    HD uint32_t rows() const; /**< For 2D shapes, gets the number of rows. In general, gets the
                                   dimension of the second to last axis. */
    HD uint32_t cols() const; /**< For 2D shapes, gets the number of columns. In general, gets
                                   the dimension of the last axis. */
    HD uint32_t
    dim(const uint32_t axis) const; /**< Gets the dimension size of the specified axis. */
    HD uint32_t stride(const uint32_t axis) const; /**< Gets the stride of the specified axis. */
    /**
     * Gets the shape at a specific axis of this shape.
     * \param axis the axis of where the new shape starts.
     */
    HD Shape subshape(const uint32_t axis) const;

    HD bool operator==(const Shape& s) const; /**< Equals operator (compares axes and dims). */
    HD bool operator!=(const Shape& s) const; /**< Not equals operator. */
};

std::ostream& operator<<(std::ostream& out, const Shape& s); /**< Prints a Shape, e.g. "(2, 3)". */
/**
 * A simple class that manages a CUDA Event.
 */
struct Event {
#ifdef CUDACC
    cudaEvent_t mEvent; // the underlying CUDA event
#endif
    Event();  /**< Creates the CUDA event (no-op in CPU-only builds). */
    ~Event(); /**< Destroys the CUDA event. */
    void record(const StreamID& stream); /**< Records a event from a stream. */
};
/**
 * Bundles a callable with a copy of its arguments so the pair can be invoked
 * later through a type-erased (void*) entry point, e.g. as a host callback.
 */
template <typename F, typename... Args> struct FuncHolder {
    F mFunc;                   // the callable to invoke
    std::tuple<Args...> mArgs; // the captured arguments

    FuncHolder() = delete; // a holder without a function is meaningless

    /** Stores the callable and copies each argument for later invocation. */
    FuncHolder(F func, Args... args) : mFunc(func), mArgs(args...){};

    /**
     * Type-erased trampoline: reinterprets data as a FuncHolder of this
     * instantiation and invokes the stored callable with the stored arguments.
     */
    static void run(void* data) {
        auto* self = static_cast<FuncHolder<F, Args...>*>(data);
        std::apply(self->mFunc, self->mArgs);
    };
};
/**
 * A class that manages CUDA Graphs. On construction the provided function is
 * run once under stream capture to build and instantiate the graph; in
 * CPU-only builds nothing is captured and execute() simply calls the function.
 */
template <typename F, typename... Args> class Graph {
  private:
#ifdef CUDACC
    cudaGraph_t mGraph;        // the captured graph
    cudaGraphExec_t mInstance; // the instantiated, executable graph
#endif
    FuncHolder<F, Args...> mFuncHolder; // the function and arguments to capture
    StreamID mStream;                   // origin stream used for capture and launch

  public:
    Graph() = delete;
    /**
     * The constructor for a Graph, which captures the function.
     * \param func the function to capture.
     * \param stream the origin stream to use.
     * \param args the arguments of the function.
     */
    Graph(const StreamID& stream, F func, Args... args)
        : mFuncHolder(func, args...), mStream(stream) {
#ifdef CUDACC
        // Run the function once between begin/end capture; the work it enqueues
        // on the stream is recorded into mGraph rather than executed.
        CUDA_CHECK(
            cudaStreamBeginCapture(Manager::get()->stream(mStream), cudaStreamCaptureModeGlobal));
        mFuncHolder.run((void*)&mFuncHolder);
        CUDA_CHECK(cudaStreamEndCapture(Manager::get()->stream(mStream), &mGraph));
        CUDA_CHECK(cudaGraphInstantiate(&mInstance, mGraph, NULL, NULL, 0));
#endif
    };
    ~Graph() {
#ifdef CUDACC
        // NOTE(review): destroy results are unchecked.
        cudaGraphDestroy(mGraph);
        cudaGraphExecDestroy(mInstance);
#endif
    };
    /**
     * Executes the instantiated graph, or simply runs the function with provided
     * arguments if compiling for CPU.
     * \return the stream the graph was launched on.
     * NOTE(review): the cudaGraphLaunch result is not checked.
     */
    StreamID execute() const {
#ifdef CUDACC
        cudaGraphLaunch(mInstance, Manager::get()->stream(mStream));
#else
        mFuncHolder.run((void*)&mFuncHolder);
#endif
        return mStream;
    }
};
/**
 * A struct to facilitate other CUDA Graphs functionality like creating branches
 * and host callbacks within a stream-captured region.
 */
struct GraphManager {
    std::vector<void*> mHostData; // type-erased FuncHolders allocated by launchHostFunction
    std::vector<Event*> mEvents;  // events allocated by makeBranch/joinBranch

    ~GraphManager();

    /**
     * Within a function that is being stream captured, launch a host function that can
     * be captured into the graph. In CPU-only builds the function is called directly.
     */
    template <typename F, typename... Args>
    void launchHostFunction(const StreamID& stream, F func, Args&&... args) {
#ifdef CUDACC
        // Heap-allocate the holder so it outlives this call; the pointer is
        // retained in mHostData.
        FuncHolder<F, Args...>* fh = new FuncHolder<F, Args...>(func, args...);
        mHostData.push_back((void*)fh);
        cudaHostFn_t run_func = fh->run;
        CUDA_CHECK(cudaLaunchHostFunc(Manager::get()->stream(stream), run_func, fh));
#else
        func(args...);
#endif
    }

    /**
     * Makes a new branch in the graph to be run in parallel by a new stream.
     * \param orig_stream the original stream to branch from.
     * \param branch_stream the stream of the new branch.
     */
    void makeBranch(const StreamID& orig_stream, const StreamID& branch_stream);

    /**
     * Joins an existing branch in the graph to collapse a parallel block.
     * \param orig_stream the original stream to join the branch to.
     * \param branch_stream the stream of the branch to join.
     */
    void joinBranch(const StreamID& orig_stream, const StreamID& branch_stream);
};
}; // namespace CudaTools
#endif // CUDATOOLS_H
#ifdef CUDATOOLS_IMPLEMENTATION
#ifndef __CUDATOOLS_IMPLEMENTED__
#define __CUDATOOLS_IMPLEMENTED__
namespace CudaTools {
//////////////////////
// StreamID Methods //
//////////////////////

/** Blocks the host until all work queued on this stream has completed. */
void StreamID::wait() const { Manager::get()->waitFor(mId); }

/**
 * Makes this stream wait for the given event without blocking the host.
 * No-op in CPU-only builds.
 */
void StreamID::wait(const Event& event) const {
#ifdef CUDACC
    CUDA_CHECK(cudaStreamWaitEvent(Manager::get()->stream(mId), event.mEvent, 0));
#endif
}
////////////////////
// Memory Methods //
////////////////////

/**
 * Allocates memory on the device. In CPU-only builds this is a no-op that
 * returns nullptr.
 */
void* malloc(const size_t size) {
#ifdef CUDACC
    void* pDevice;
    CUDA_CHECK(cudaMalloc(&pDevice, size));
    return pDevice;
#else
    return nullptr;
#endif
}
/**
 * Frees memory on the device. Safe to call with nullptr; no-op in
 * CPU-only builds.
 */
void free(void* const pDevice) {
#ifdef CUDACC
    if (pDevice != nullptr) CUDA_CHECK(cudaFree(pDevice));
#endif
}
/**
 * Copies memory from the source pointer to the dest pointer, asynchronously
 * on the given stream (cudaMemcpyDefault infers the direction).
 * NOTE(review): in CPU-only builds this is a no-op — presumably 'device'
 * pointers alias host memory in that mode; confirm against Array/Macros.
 * \return the stream used for the copy.
 */
StreamID copy(void* const source, void* const dest, const size_t size, const StreamID& stream) {
#ifdef CUDACC
    CUDA_CHECK(
        cudaMemcpyAsync(dest, source, size, cudaMemcpyDefault, Manager::get()->stream(stream)));
#endif
    return stream;
}
/**
 * Initializes or sets device memory to a value (byte-wise, like std::memset),
 * asynchronously on the given stream. No-op in CPU-only builds.
 * \return the stream used.
 */
StreamID memset(void* const pDevice, const int value, const size_t size, const StreamID& stream) {
#ifdef CUDACC
    CUDA_CHECK(cudaMemsetAsync(pDevice, value, size, Manager::get()->stream(stream)));
#endif
    return stream;
}
/**
 * Pins (page-locks) host memory via cudaHostRegister so asynchronous
 * transfers can use it. No-op in CPU-only builds.
 */
void pin(void* const pHost, const size_t size) {
#ifdef CUDACC
    CUDA_CHECK(cudaHostRegister(pHost, size, cudaHostRegisterDefault));
#endif
}
#ifdef CUDACC
/**
 * Queries the properties of device 0. Also selects device 0 as the active
 * device as a side effect.
 * NOTE(review): the cudaSetDevice/cudaGetDeviceProperties results are unchecked.
 */
cudaDeviceProp getDeviceProp() {
    cudaSetDevice(0);
    cudaDeviceProp deviceProp;
    cudaGetDeviceProperties(&deviceProp, 0);
    return deviceProp;
}
#endif
/////////////////////
// Manager Methods //
/////////////////////

/**
 * Creates one CUDA stream per provided name and the shared cuBLAS handle.
 * Does nothing in CPU-only builds.
 */
Manager::Manager(const std::vector<std::string>& names) {
#ifdef CUDACC
    for (auto name : names) {
        addStream(name);
    }
    CUBLAS_CHECK(cublasCreate(&mCublas));
    // CUSPARSE_CHECK(cusparseCreate(&mCusparse));
#endif
}
/**
 * Destroys all created streams and the cuBLAS handle. Destroy results are
 * unchecked; this runs at static teardown for the global instance.
 */
Manager::~Manager() {
#ifdef CUDACC
    for (auto& it : mStreams) {
        cudaStreamDestroy(it.second);
    }
    cublasDestroy(mCublas);
    // CUSPARSE_CHECK(cusparseDestroy(mCusparse));
#endif
}
/**
 * Blocks the host until all work on the named stream completes; errors out
 * if the stream name is unknown. No-op in CPU-only builds.
 */
void Manager::waitFor(const StreamID& stream) const {
#ifdef CUDACC
    auto it = mStreams.find(stream.mId);
    if (it != mStreams.end()) {
        CUDA_CHECK(cudaStreamSynchronize(it->second));
    } else {
        CT_ERROR(true, ("Invalid stream " + stream.mId).c_str());
    }
#endif
}
/** Blocks the host until all device work (on every stream) has finished. */
void Manager::sync() const {
#ifdef CUDACC
    CUDA_CHECK(cudaDeviceSynchronize());
#endif
}
/**
 * Creates a stream with the given name. No-op in CPU-only builds.
 * NOTE(review): if the name already exists, the previous cudaStream_t handle
 * is overwritten without being destroyed — confirm names are unique.
 */
void Manager::addStream(const std::string& name) {
#ifdef CUDACC
    cudaStream_t s;
    CUDA_CHECK(cudaStreamCreate(&s));
    mStreams[name] = s;
#endif
}
#ifdef CUDACC
/**
 * Gets the cudaStream_t associated with the given StreamID; errors out if
 * the name is unknown.
 * NOTE(review): if CT_ERROR does not terminate, control falls off the end of
 * a non-void function — confirm CT_ERROR aborts/throws.
 */
cudaStream_t Manager::stream(const StreamID& stream) const {
    auto it = mStreams.find(stream.mId);
    if (it != mStreams.end()) {
        return it->second;
    } else {
        CT_ERROR(true, ("Invalid stream " + stream.mId).c_str());
    }
}

cublasHandle_t Manager::cublasHandle() const { return mCublas; };
// cusparseHandle_t Manager::cusparseHandle() const { return mCusparse; };

// The global Manager, created with the default streams before main() runs.
Manager Manager::mManagerInstance = Manager({"defaultMemory", "defaultCublas", "defaultKernel"});
#else
Manager Manager::mManagerInstance = Manager({""});
#endif
////////////////////
// Kernel Methods //
////////////////////
namespace Kernel {
/**
 * Sets a 1D grid, validating the size against the device's maximum grid
 * dimension. No-op in CPU-only builds.
 */
void Settings::setGridDim(const size_t x) {
#ifdef CUDACC
    CT_ERROR_IF(x, >, DeviceProperties.maxGridSize[0], "Total grid size too large.");
    blockGrid.x = x;
    blockGrid.y = 1; // collapse the unused dimensions
    blockGrid.z = 1;
#endif
}
/**
 * Sets a 2D grid, validating each dimension and the total size.
 * NOTE(review): the total is checked against maxGridSize[0] — confirm that is
 * the intended limit for the product of dimensions.
 */
void Settings::setGridDim(const size_t x, const size_t y) {
#ifdef CUDACC
    CT_ERROR_IF(x * y, >, DeviceProperties.maxGridSize[0], "Total grid size too large.");
    CT_ERROR_IF(x, >, DeviceProperties.maxGridSize[0], "Grid dimension 'x' too large.");
    CT_ERROR_IF(y, >, DeviceProperties.maxGridSize[1], "Grid dimension 'y' too large.");
    blockGrid.x = x;
    blockGrid.y = y;
    blockGrid.z = 1;
#endif
}
/**
 * Sets a 3D grid, validating each dimension and the total size.
 * NOTE(review): the total is checked against maxGridSize[0] — confirm that is
 * the intended limit for the product of dimensions.
 */
void Settings::setGridDim(const size_t x, const size_t y, const size_t z) {
#ifdef CUDACC
    CT_ERROR_IF(x * y * z, >, DeviceProperties.maxGridSize[0], "Total grid size too large.");
    CT_ERROR_IF(x, >, DeviceProperties.maxGridSize[0], "Grid dimension 'x' too large.");
    CT_ERROR_IF(y, >, DeviceProperties.maxGridSize[1], "Grid dimension 'y' too large.");
    CT_ERROR_IF(z, >, DeviceProperties.maxGridSize[2], "Grid dimension 'z' too large.");
    blockGrid.x = x;
    blockGrid.y = y;
    blockGrid.z = z;
#endif
}
/**
 * Sets a 1D thread block, validating the size. No-op in CPU-only builds.
 * NOTE(review): validated against maxThreadsDim[0] rather than
 * maxThreadsPerBlock — on most devices both are 1024, but confirm.
 */
void Settings::setBlockDim(const size_t x) {
#ifdef CUDACC
    CT_ERROR_IF(x, >, DeviceProperties.maxThreadsDim[0], "Total block size too large.");
    threadBlock.x = x;
    threadBlock.y = 1; // collapse the unused dimensions
    threadBlock.z = 1;
#endif
}
/**
 * Sets a 2D thread block, validating each dimension and the total size.
 * NOTE(review): the total is validated against maxThreadsDim[0] rather than
 * maxThreadsPerBlock — on most devices both are 1024, but confirm.
 */
void Settings::setBlockDim(const size_t x, const size_t y) {
#ifdef CUDACC
    CT_ERROR_IF(x * y, >, DeviceProperties.maxThreadsDim[0], "Total block size too large.");
    CT_ERROR_IF(x, >, DeviceProperties.maxThreadsDim[0], "Block dimension 'x' too large.");
    CT_ERROR_IF(y, >, DeviceProperties.maxThreadsDim[1], "Block dimension 'y' too large.");
    threadBlock.x = x;
    threadBlock.y = y;
    threadBlock.z = 1;
#endif
}
/**
 * Sets a 3D thread block, validating each dimension and the total size.
 * NOTE(review): the total is validated against maxThreadsDim[0] rather than
 * maxThreadsPerBlock — on most devices both are 1024, but confirm.
 */
void Settings::setBlockDim(const size_t x, const size_t y, const size_t z) {
#ifdef CUDACC
    CT_ERROR_IF(x * y * z, >, DeviceProperties.maxThreadsDim[0], "Total block size too large.");
    CT_ERROR_IF(x, >, DeviceProperties.maxThreadsDim[0], "Block dimension 'x' too large.");
    CT_ERROR_IF(y, >, DeviceProperties.maxThreadsDim[1], "Block dimension 'y' too large.");
    CT_ERROR_IF(z, >, DeviceProperties.maxThreadsDim[2], "Block dimension 'z' too large.");
    threadBlock.x = x;
    threadBlock.y = y;
    threadBlock.z = z;
#endif
}
/**
 * Sets the dynamic shared memory size passed at kernel launch. No-op in
 * CPU-only builds (the member only exists under CUDACC).
 */
void Settings::setSharedMemSize(const size_t bytes) {
#ifdef CUDACC
    sharedMemoryBytes = bytes;
#endif
}
void Settings::setStream(const StreamID& stream_) {
#ifdef CUDACC
stream = stream_;
#endif
}
/**
 * Returns kernel launch parameters based on the number of threads, and
 * optionally a stream. Uses the minimum number of blocks and distributes
 * threads as evenly as possible across them.
 * \param threads total number of threads required; must be nonzero
 *        (previously threads == 0 caused a division by zero below).
 * \param stream the stream the kernel will launch on.
 */
Settings basic(const size_t threads, const StreamID& stream) {
    Settings sett;
#ifdef CUDACC
    CT_ERROR_IF(threads, ==, 0, "Thread count must be nonzero.");
    auto max_threads = DeviceProperties.maxThreadsPerBlock;
    size_t grid_blocks = (threads + max_threads - 1) / max_threads;   // ceil(threads / max_threads)
    size_t block_threads = (threads + grid_blocks - 1) / grid_blocks; // ceil(threads / grid_blocks)
    sett.setGridDim(grid_blocks);
    sett.setBlockDim(block_threads);
    sett.setStream(stream);
#else
    sett.threads = threads;
#endif
    return sett;
}
}; // namespace Kernel
/////////////////////
// Shape Functions //
/////////////////////

/**
 * Constructs a Shape from dimensions listed slowest-varying axis first.
 * Strides are computed right-to-left, then the strides of the last two axes
 * are swapped so 2D data is stored column major.
 * \param dims an initializer list of the dimensions; no entry may be zero.
 */
HD Shape::Shape(const std::initializer_list<uint32_t> dims) : mAxes(dims.size()), mItems(1) {
    CT_ERROR_IF(dims.size(), >, CUDATOOLS_ARRAY_MAX_AXES, "Number of axes exceeds max axes");
    // (mAxes and mItems are set in the initializer list; the previous
    // redundant re-assignments here were removed.)
    if (mAxes == 0) return;
    auto it = dims.end() - 1;
    // Walk the dimensions from last to first; iAxis is unsigned, so the loop
    // terminates when the decrement wraps around past zero.
    for (uint32_t iAxis = mAxes - 1; iAxis < mAxes; --iAxis) {
        uint32_t dim = *it;
        CT_ERROR_IF(dim, ==, 0, "Axis dimension cannot be 0");
        mAxisDim[iAxis] = dim;
        mStride[iAxis] = mItems; // stride = product of faster-varying dims
        mItems *= dim;
        --it;
    }
    if (mAxes == 1) return;
    // Swap last two, for column major storage.
    mStride[mAxes - 2] = 1;
    mStride[mAxes - 1] = mAxisDim[mAxes - 2];
}
HD uint32_t Shape::axes() const { return mAxes; };   /**< Gets the number of axes. */
HD uint32_t Shape::items() const { return mItems; }; /**< Gets the total number of items. */
// NOTE(review): length/rows/cols assume mAxes >= 1 (rows: >= 2); for smaller
// shapes the unsigned index wraps and reads out of range — confirm callers guard.
HD uint32_t Shape::length() const { return mAxisDim[mAxes - 1]; } /**< Dimension of the last axis. */
HD uint32_t Shape::rows() const { return mAxisDim[mAxes - 2]; }   /**< Dimension of the 2nd-to-last axis. */
HD uint32_t Shape::cols() const { return mAxisDim[mAxes - 1]; }   /**< Dimension of the last axis. */
HD uint32_t Shape::dim(const uint32_t axis) const { return mAxisDim[axis]; } /**< Dimension of an axis. */
HD uint32_t Shape::stride(const uint32_t axis) const { return mStride[axis]; } /**< Stride of an axis. */
/**
 * Equality: two Shapes match when they have the same number of axes and the
 * same dimension along every axis. Strides are not compared.
 */
HD bool Shape::operator==(const Shape& s) const {
    if (mAxes != s.mAxes) return false;
    for (uint32_t iAxis = 0; iAxis < mAxes; ++iAxis)
        if (mAxisDim[iAxis] != s.mAxisDim[iAxis]) return false;
    return true;
}

/** Inequality: the negation of operator==. */
HD bool Shape::operator!=(const Shape& s) const { return !(*this == s); }
/**
 * Gets the shape at a specific axis of this shape.
 * \param axis the axis of where the new shape starts; axis == mAxes yields
 *        the scalar shape Shape({1}).
 */
HD Shape Shape::subshape(const uint32_t axis) const {
    CT_ERROR_IF(axis, >, mAxes, "Axis number exceeds number of axes.");
    if (axis == mAxes) return Shape({1});
    Shape new_shape({});
    new_shape.mAxes = mAxes - axis;
    new_shape.mItems = mItems;
    // Divide out the items contributed by the dropped leading axes.
    for (uint32_t iAxis = 0; iAxis < axis; iAxis++) {
        new_shape.mItems /= mAxisDim[iAxis];
    }
    // Copy the remaining dimensions and strides, shifted to the front.
    for (uint32_t iAxis = axis; iAxis < mAxes; iAxis++) {
        new_shape.mAxisDim[iAxis - axis] = mAxisDim[iAxis];
        new_shape.mStride[iAxis - axis] = mStride[iAxis];
    }
    return new_shape;
}
/**
 * Prints a Shape as a parenthesized, comma-separated dimension list,
 * e.g. "(2, 3)"; a 0-axis shape prints as "()".
 */
std::ostream& operator<<(std::ostream& out, const Shape& s) {
    out << "(";
    const uint32_t nAxes = s.axes();
    for (uint32_t iAxis = 0; iAxis < nAxes; ++iAxis) {
        out << s.dim(iAxis);
        if (iAxis + 1 < nAxes) out << ", ";
    }
    return out << ")";
}
///////////////////
// Event Methods //
///////////////////

/** Creates the underlying CUDA event (no-op in CPU-only builds). */
Event::Event() {
#ifdef CUDACC
    CUDA_CHECK(cudaEventCreate(&mEvent));
#endif
}
/** Destroys the underlying CUDA event; the result is unchecked. */
Event::~Event() {
#ifdef CUDACC
    cudaEventDestroy(mEvent);
#endif
}
/** Records this event on the given stream (no-op in CPU-only builds). */
void Event::record(const StreamID& stream) {
#ifdef CUDACC
    CUDA_CHECK(cudaEventRecord(mEvent, Manager::get()->stream(stream)));
#endif
}
//////////////////////////
// GraphManager Methods //
//////////////////////////

/**
 * Frees the events allocated by makeBranch/joinBranch. Those events are
 * allocated unconditionally (in both CPU and CUDA builds), so they must be
 * freed unconditionally as well — previously this loop was compiled only
 * under CUDACC, leaking every Event in CPU-only builds.
 * NOTE(review): the FuncHolder allocations tracked in mHostData are
 * type-erased to void* and are never freed here; they cannot be deleted
 * safely through a void pointer without a stored deleter — confirm the
 * intended lifetime.
 */
GraphManager::~GraphManager() {
    for (Event* event : mEvents) {
        delete event;
    }
}
void GraphManager::makeBranch(const StreamID& orig_stream, const StreamID& branch_stream) {
Event* event = new Event();
event->record(orig_stream);
mEvents.push_back(event);
branch_stream.wait(*event);
}
void GraphManager::joinBranch(const StreamID& orig_stream, const StreamID& branch_stream) {
Event* event = new Event();
event->record(branch_stream);
mEvents.push_back(event);
orig_stream.wait(*event);
}
}; // namespace CudaTools
#endif
#endif // CUDATOOLS_IMPLEMENTATION