You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
544 lines
16 KiB
544 lines
16 KiB
#ifndef CUDATOOLS_H
|
|
#define CUDATOOLS_H
|
|
|
|
#include "Macros.h"
|
|
#include <iostream>
|
|
#include <string>
|
|
#include <unordered_map>
|
|
#include <vector>
|
|
|
|
namespace CudaTools {
|
|
|
|
/**
|
|
* Simple wrapper for the name of a stream. Its purposes is to allow for
|
|
* 'streams' to be passed on host code, and allowing for simple syntax
|
|
* for waiting.
|
|
*/
|
|
struct StreamID {
|
|
public:
|
|
std::string id;
|
|
StreamID() : id(""){};
|
|
/**
|
|
* The constructor for a StreamID.
|
|
*/
|
|
StreamID(const std::string& id_) : id(id_){};
|
|
StreamID(const char* id_) : id(id_){};
|
|
|
|
/**
|
|
* Waits for the stream with this stream ID.
|
|
*/
|
|
void wait() const;
|
|
};
|
|
|
|
static const StreamID DEF_MEM_STREAM = StreamID{"defaultMemory"};
|
|
static const StreamID DEF_CUBLAS_STREAM = StreamID{"defaultCublas"};
|
|
static const StreamID DEF_KERNEL_STREAM = StreamID{"defaultKernel"};
|
|
|
|
/**
|
|
* Allocates memory on the device.
|
|
*/
|
|
void* malloc(const size_t size);
|
|
|
|
/**
|
|
* Pins memory on the host.
|
|
*/
|
|
void pin(void* const pHost, const size_t size);
|
|
|
|
/**
|
|
* Pushes memory from the device to the host.
|
|
*/
|
|
StreamID push(void* const pHost, void* const pDevice, const size_t size,
|
|
const StreamID& stream = DEF_MEM_STREAM);
|
|
/**
|
|
* Pulls memory from the device back to the host.
|
|
*/
|
|
StreamID pull(void* const pHost, void* const pDevice, const size_t size,
|
|
const StreamID& stream = DEF_MEM_STREAM);
|
|
/**
|
|
* Copies memory on the device to another location on the device.
|
|
*/
|
|
StreamID deviceCopy(void* const pSrc, void* const pDest, const size_t size,
|
|
const StreamID& stream = DEF_MEM_STREAM);
|
|
|
|
/**
|
|
* Frees memory on the device.
|
|
*/
|
|
void free(void* const pDevice);
|
|
|
|
#ifdef CUDACC
|
|
cudaDeviceProp getDeviceProp();
|
|
static cudaDeviceProp DeviceProperties = getDeviceProp();
|
|
const char* cublasGetErrorString(cublasStatus_t status);
|
|
#endif
|
|
|
|
/**
|
|
* A class that manages various CUDA Runtime components, such as
|
|
* streams, events, and handles.
|
|
*/
|
|
class Manager {
|
|
private:
|
|
static Manager mManagerInstance;
|
|
Manager(const std::vector<std::string>& names);
|
|
~Manager();
|
|
#ifdef CUDACC
|
|
std::unordered_map<std::string, cudaStream_t> mStreams;
|
|
cublasHandle_t mCublas;
|
|
#endif
|
|
public:
|
|
/**
|
|
* Used to get the global CudaTools::Manager instance.
|
|
*/
|
|
static Manager* get() { return &mManagerInstance; };
|
|
|
|
void waitFor(const StreamID& stream) const; /**< Waits for the stream provided. */
|
|
void sync() const; /**< Waits until all device code has finished. */
|
|
void addStream(const std::string& name); /**< Creates a stream with the given name. */
|
|
#ifdef CUDACC
|
|
cudaStream_t stream(const StreamID& stream) const;
|
|
cublasHandle_t cublasHandle() const;
|
|
#endif
|
|
};
|
|
|
|
namespace Kernel {
|
|
|
|
/**
|
|
* A struct that contains the kernel launch parameters.
|
|
*/
|
|
struct Settings {
|
|
public:
|
|
#ifdef CUDACC
|
|
dim3 blockGrid;
|
|
dim3 threadBlock;
|
|
size_t sharedMemoryBytes = 0;
|
|
#else
|
|
size_t threads;
|
|
#endif
|
|
StreamID stream;
|
|
|
|
Settings() = default;
|
|
|
|
void setGridDim(const size_t x); /**< Sets the Grid dimensions. */
|
|
void setGridDim(const size_t x, const size_t y); /**< Sets the Grid dimensions. */
|
|
void setGridDim(const size_t x, const size_t y,
|
|
const size_t z); /**< Sets the Grid dimensions. */
|
|
void setBlockDim(const size_t x); /**< Sets the Thread Block dimensions. */
|
|
void setBlockDim(const size_t x, const size_t y); /**< Sets the Thread Block dimensions. */
|
|
void setBlockDim(const size_t x, const size_t y,
|
|
const size_t z); /**< Sets the Thread Block dimensions. */
|
|
|
|
void setSharedMemSize(const size_t bytes); /**< Sets the static shared memory size. */
|
|
void setStream(const StreamID& stream); /**< Sets the stream. */
|
|
};
|
|
|
|
/**
|
|
* Returns a kernel launch parameters based on the number of threads, and optionally
|
|
* a stream. Should only be used for 'embarassingly parallel' situations, or where
|
|
* each thread corresponds some sort of index.
|
|
*/
|
|
Settings basic(const size_t threads, const StreamID& stream = DEF_KERNEL_STREAM);
|
|
|
|
}; // namespace Kernel
|
|
|
|
template <typename T> class Array;
|
|
|
|
/**
|
|
* A class that holds information about an Array.
|
|
*/
|
|
class Shape {
|
|
private:
|
|
template <typename T> friend class Array;
|
|
uint32_t mAxes;
|
|
uint32_t mItems;
|
|
uint32_t mAxisDim[CUDATOOLS_ARRAY_MAX_AXES] = {0};
|
|
uint32_t mStride[CUDATOOLS_ARRAY_MAX_AXES] = {0};
|
|
|
|
public:
|
|
HD Shape() : mAxes(0), mItems(1){};
|
|
/**
|
|
* The constructor for a Shape.
|
|
* \param dims an initializer list of the dimensions.
|
|
*/
|
|
HD Shape(const std::initializer_list<uint32_t> dims);
|
|
|
|
HD uint32_t axes() const; /**< Gets the number of axes. */
|
|
HD uint32_t items() const; /**< Gets the total number of items. */
|
|
|
|
HD uint32_t length() const; /**< For 1D shapes, gets the length. In general, gets the dimension
|
|
of the last axis. */
|
|
HD uint32_t rows() const; /**< For 2D shapes, gets the number of rows. In general, gets the
|
|
dimension of the second to last axis. */
|
|
HD uint32_t cols() const; /**< For 2D shapes, gets the number of columns. In general, gets the
|
|
dimension of the second to last axis. */
|
|
|
|
HD uint32_t
|
|
dim(const uint32_t axis) const; /**< Gets the dimension size of the specified axis. */
|
|
HD uint32_t stride(const uint32_t axis) const; /**< Gets the stride of the specified axis. */
|
|
|
|
/**
|
|
* Gets the shape at a specific axis of this shape.
|
|
* \param axis the axis of where the new shape starts.
|
|
*/
|
|
HD Shape subshape(const uint32_t axis) const;
|
|
|
|
HD bool operator==(const Shape& s) const; /**< Equals operator. */
|
|
HD bool operator!=(const Shape& s) const; /**< Not equals operator. */
|
|
};
|
|
|
|
std::ostream& operator<<(std::ostream& out, const Shape& s);
|
|
|
|
}; // namespace CudaTools
|
|
|
|
#ifdef CUDATOOLS_IMPLEMENTATION
|
|
|
|
namespace CudaTools {
|
|
|
|
template <typename T, typename... Args>
|
|
StreamID runKernel(T func, const Kernel::Settings& sett, Args... args) {
|
|
#ifdef CUDA
|
|
func<<<sett.blockGrid, sett.threadBlock, sett.sharedMemoryBytes,
|
|
Manager::get()->stream(sett.stream.id)>>>(args...);
|
|
#else
|
|
func(args...);
|
|
#endif
|
|
return sett.stream;
|
|
}
|
|
|
|
////////////////////
|
|
// Memory Methods //
|
|
////////////////////
|
|
|
|
void StreamID::wait() const { Manager::get()->waitFor(id); }
|
|
|
|
void* malloc(const size_t size) {
|
|
#ifdef CUDACC
|
|
void* pDevice;
|
|
CUDA_CHECK(cudaMalloc(&pDevice, size));
|
|
return pDevice;
|
|
#else
|
|
return nullptr;
|
|
#endif
|
|
}
|
|
|
|
void free(void* const pDevice) {
|
|
#ifdef CUDACC
|
|
if (pDevice != nullptr) CUDA_CHECK(cudaFree(pDevice));
|
|
#endif
|
|
}
|
|
|
|
StreamID push(void* const pHost, void* const pDevice, const size_t size, const StreamID& stream) {
|
|
#ifdef CUDACC
|
|
CUDA_CHECK(cudaMemcpyAsync(pDevice, pHost, size, cudaMemcpyHostToDevice,
|
|
Manager::get()->stream(stream.id)));
|
|
#endif
|
|
return stream;
|
|
}
|
|
|
|
StreamID pull(void* const pHost, void* const pDevice, const size_t size, const StreamID& stream) {
|
|
#ifdef CUDACC
|
|
CUDA_CHECK(cudaMemcpyAsync(pHost, pDevice, size, cudaMemcpyDeviceToHost,
|
|
Manager::get()->stream(stream.id)));
|
|
#endif
|
|
return stream;
|
|
}
|
|
|
|
StreamID deviceCopy(void* const pSrc, void* const pDest, const size_t size,
|
|
const StreamID& stream) {
|
|
#ifdef CUDACC
|
|
CUDA_CHECK(cudaMemcpyAsync(pDest, pSrc, size, cudaMemcpyDeviceToDevice,
|
|
Manager::get()->stream(stream.id)));
|
|
#endif
|
|
return stream;
|
|
}
|
|
|
|
void pin(void* const pHost, const size_t size) {
|
|
#ifdef CUDACC
|
|
CUDA_CHECK(cudaHostRegister(pHost, size, cudaHostRegisterDefault));
|
|
#endif
|
|
}
|
|
|
|
#ifdef CUDACC
|
|
cudaDeviceProp getDeviceProp() {
|
|
cudaSetDevice(0);
|
|
cudaDeviceProp deviceProp;
|
|
cudaGetDeviceProperties(&deviceProp, 0);
|
|
return deviceProp;
|
|
}
|
|
#endif
|
|
|
|
/////////////////////
|
|
// Manager Methods //
|
|
/////////////////////
|
|
|
|
Manager::Manager(const std::vector<std::string>& names) {
|
|
#ifdef CUDACC
|
|
for (auto name : names) {
|
|
addStream(name);
|
|
}
|
|
CUBLAS_CHECK(cublasCreate(&mCublas));
|
|
#endif
|
|
}
|
|
|
|
Manager::~Manager() {
|
|
#ifdef CUDACC
|
|
for (auto& it : mStreams) {
|
|
CUDA_CHECK(cudaStreamDestroy(it.second));
|
|
}
|
|
CUBLAS_CHECK(cublasDestroy(mCublas));
|
|
#endif
|
|
}
|
|
|
|
void Manager::waitFor(const StreamID& stream) const {
|
|
#ifdef CUDACC
|
|
auto it = mStreams.find(stream.id);
|
|
if (it != mStreams.end()) {
|
|
CUDA_CHECK(cudaStreamSynchronize(it->second));
|
|
} else {
|
|
CT_ERROR(true, ("Invalid stream " + stream.id).c_str());
|
|
}
|
|
#endif
|
|
}
|
|
|
|
void Manager::sync() const {
|
|
#ifdef CUDACC
|
|
CUDA_CHECK(cudaDeviceSynchronize());
|
|
#endif
|
|
}
|
|
|
|
void Manager::addStream(const std::string& name) {
|
|
#ifdef CUDACC
|
|
cudaStream_t s;
|
|
CUDA_CHECK(cudaStreamCreate(&s));
|
|
mStreams[name] = s;
|
|
#endif
|
|
}
|
|
|
|
#ifdef CUDACC
|
|
cudaStream_t Manager::stream(const StreamID& stream) const {
|
|
auto it = mStreams.find(stream.id);
|
|
if (it != mStreams.end()) {
|
|
return it->second;
|
|
} else {
|
|
CT_ERROR(true, ("Invalid stream " + stream.id).c_str());
|
|
}
|
|
}
|
|
|
|
cublasHandle_t Manager::cublasHandle() const { return mCublas; };
|
|
|
|
Manager Manager::mManagerInstance = Manager({"defaultMemory", "defaultCublas", "defaultKernel"});
|
|
#else
|
|
Manager Manager::mManagerInstance = Manager({""});
|
|
#endif
|
|
|
|
////////////////////
|
|
// Kernel Methods //
|
|
////////////////////
|
|
|
|
namespace Kernel {
|
|
|
|
void Settings::setGridDim(const size_t x) {
|
|
#ifdef CUDACC
|
|
CT_ERROR_IF(x, >, DeviceProperties.maxGridSize[0], "Total grid size too large")
|
|
blockGrid.x = x;
|
|
blockGrid.y = 1;
|
|
blockGrid.z = 1;
|
|
#endif
|
|
}
|
|
|
|
void Settings::setGridDim(const size_t x, const size_t y) {
|
|
#ifdef CUDACC
|
|
CT_ERROR_IF(x * y, >, DeviceProperties.maxGridSize[0], "Total grid size too large.");
|
|
CT_ERROR_IF(x, >, DeviceProperties.maxGridSize[0], "Grid dimension 'x' too large.");
|
|
CT_ERROR_IF(y, >, DeviceProperties.maxGridSize[1], "Grid dimension 'y' too large.");
|
|
blockGrid.x = x;
|
|
blockGrid.y = y;
|
|
blockGrid.z = 1;
|
|
#endif
|
|
}
|
|
|
|
void Settings::setGridDim(const size_t x, const size_t y, const size_t z) {
|
|
#ifdef CUDACC
|
|
CT_ERROR_IF(x * y * z, >, DeviceProperties.maxGridSize[0], "Total grid size too large.");
|
|
CT_ERROR_IF(x, >, DeviceProperties.maxGridSize[0], "Grid dimension 'x' too large.");
|
|
CT_ERROR_IF(y, >, DeviceProperties.maxGridSize[1], "Grid dimension 'y' too large.");
|
|
CT_ERROR_IF(z, >, DeviceProperties.maxGridSize[2], "Grid dimension 'z' too large.");
|
|
blockGrid.x = x;
|
|
blockGrid.y = y;
|
|
blockGrid.z = z;
|
|
#endif
|
|
}
|
|
|
|
void Settings::setBlockDim(const size_t x) {
|
|
#ifdef CUDACC
|
|
CT_ERROR_IF(x, >, DeviceProperties.maxThreadsDim[0], "Total block size too large.");
|
|
threadBlock.x = x;
|
|
threadBlock.y = 1;
|
|
threadBlock.z = 1;
|
|
#endif
|
|
}
|
|
|
|
void Settings::setBlockDim(const size_t x, const size_t y) {
|
|
#ifdef CUDACC
|
|
CT_ERROR_IF(x * y, >, DeviceProperties.maxThreadsDim[0], "Total block size too large.");
|
|
CT_ERROR_IF(x, >, DeviceProperties.maxThreadsDim[0], "Block dimension 'x' too large.");
|
|
CT_ERROR_IF(y, >, DeviceProperties.maxThreadsDim[1], "Block dimension 'y' too large.");
|
|
threadBlock.x = x;
|
|
threadBlock.y = y;
|
|
threadBlock.z = 1;
|
|
#endif
|
|
}
|
|
|
|
void Settings::setBlockDim(const size_t x, const size_t y, const size_t z) {
|
|
#ifdef CUDACC
|
|
CT_ERROR_IF(x * y * z, >, DeviceProperties.maxThreadsDim[0], "Total block size too large.");
|
|
CT_ERROR_IF(x, >, DeviceProperties.maxThreadsDim[0], "Block dimension 'x' too large.");
|
|
CT_ERROR_IF(y, >, DeviceProperties.maxThreadsDim[1], "Block dimension 'y' too large.");
|
|
CT_ERROR_IF(z, >, DeviceProperties.maxThreadsDim[2], "Block dimension 'z' too large.");
|
|
threadBlock.x = x;
|
|
threadBlock.y = y;
|
|
threadBlock.z = z;
|
|
#endif
|
|
}
|
|
|
|
void Settings::setSharedMemSize(const size_t bytes) {
|
|
#ifdef CUDACC
|
|
sharedMemoryBytes = bytes;
|
|
#endif
|
|
}
|
|
|
|
void Settings::setStream(const StreamID& stream_) {
|
|
#ifdef CUDACC
|
|
stream.id = stream_.id;
|
|
#endif
|
|
}
|
|
|
|
Settings basic(const size_t threads, const StreamID& stream) {
|
|
Settings sett;
|
|
#ifdef CUDACC
|
|
auto max_threads = DeviceProperties.maxThreadsPerBlock;
|
|
size_t grid_blocks = (threads + max_threads - 1) / max_threads; // ceil(threads / max_threads)
|
|
size_t block_threads = (threads + grid_blocks - 1) / grid_blocks; // ceil(threads / grid_blocks)
|
|
sett.setGridDim(grid_blocks);
|
|
sett.setBlockDim(block_threads);
|
|
sett.setStream(stream);
|
|
#else
|
|
sett.threads = threads;
|
|
#endif
|
|
return sett;
|
|
}
|
|
} // namespace Kernel
|
|
|
|
/////////////////////
|
|
// Shape Functions //
|
|
/////////////////////
|
|
|
|
HD Shape::Shape(const std::initializer_list<uint32_t> dims) : mAxes(dims.size()), mItems(1) {
|
|
CT_ERROR_IF(dims.size(), >, CUDATOOLS_ARRAY_MAX_AXES, "Number of axes exceeds max axes");
|
|
mAxes = dims.size();
|
|
if (mAxes == 0) return;
|
|
|
|
auto it = dims.end() - 1;
|
|
mItems = 1;
|
|
for (uint32_t iAxis = mAxes - 1; iAxis < mAxes; --iAxis) {
|
|
uint32_t dim = *it;
|
|
CT_ERROR_IF(dim, ==, 0, "Axis dimension cannot be 0");
|
|
|
|
mAxisDim[iAxis] = dim;
|
|
mStride[iAxis] = mItems;
|
|
mItems *= dim;
|
|
--it;
|
|
}
|
|
|
|
if (mAxes == 1) return;
|
|
// Swap last two, for column major storage.
|
|
mStride[mAxes - 2] = 1;
|
|
mStride[mAxes - 1] = mAxisDim[mAxes - 2];
|
|
}
|
|
|
|
HD uint32_t Shape::axes() const { return mAxes; };
|
|
HD uint32_t Shape::items() const { return mItems; };
|
|
HD uint32_t Shape::length() const { return mAxisDim[mAxes - 1]; }
|
|
|
|
HD uint32_t Shape::rows() const { return mAxisDim[mAxes - 2]; }
|
|
|
|
HD uint32_t Shape::cols() const { return mAxisDim[mAxes - 1]; }
|
|
|
|
HD uint32_t Shape::dim(const uint32_t axis) const { return mAxisDim[axis]; }
|
|
HD uint32_t Shape::stride(const uint32_t axis) const { return mStride[axis]; }
|
|
|
|
HD bool Shape::operator==(const Shape& s) const {
|
|
if (mAxes != s.mAxes) {
|
|
return false;
|
|
}
|
|
for (uint32_t iAxis = 0; iAxis < mAxes; ++iAxis) {
|
|
if (mAxisDim[iAxis] != s.mAxisDim[iAxis]) {
|
|
return false;
|
|
}
|
|
}
|
|
return true;
|
|
}
|
|
|
|
HD bool Shape::operator!=(const Shape& s) const { return not(*this == s); }
|
|
|
|
HD Shape Shape::subshape(const uint32_t axis) const {
|
|
CT_ERROR_IF(axis, >, mAxes, "Axis number exceeds number of axes.");
|
|
if (axis == mAxes) return Shape({1});
|
|
|
|
Shape new_shape({});
|
|
new_shape.mAxes = mAxes - axis;
|
|
new_shape.mItems = mItems;
|
|
|
|
for (uint32_t iAxis = 0; iAxis < axis; iAxis++) {
|
|
new_shape.mItems /= mAxisDim[iAxis];
|
|
}
|
|
for (uint32_t iAxis = axis; iAxis < mAxes; iAxis++) {
|
|
new_shape.mAxisDim[iAxis - axis] = mAxisDim[iAxis];
|
|
new_shape.mStride[iAxis - axis] = mStride[iAxis];
|
|
}
|
|
return new_shape;
|
|
}
|
|
|
|
std::ostream& operator<<(std::ostream& out, const Shape& s) {
|
|
out << "(";
|
|
if (s.axes() == 0) return out << ")";
|
|
for (uint32_t iAxis = 0; iAxis < s.axes() - 1; ++iAxis) {
|
|
out << s.dim(iAxis) << ", ";
|
|
}
|
|
return out << s.dim(s.axes() - 1) << ")";
|
|
}
|
|
|
|
#ifdef CUDACC
|
|
const char* cublasGetErrorString(cublasStatus_t error) {
|
|
switch (error) {
|
|
case CUBLAS_STATUS_SUCCESS:
|
|
return "CUBLAS_STATUS_SUCCESS";
|
|
|
|
case CUBLAS_STATUS_NOT_INITIALIZED:
|
|
return "CUBLAS_STATUS_NOT_INITIALIZED";
|
|
|
|
case CUBLAS_STATUS_ALLOC_FAILED:
|
|
return "CUBLAS_STATUS_ALLOC_FAILED";
|
|
|
|
case CUBLAS_STATUS_INVALID_VALUE:
|
|
return "CUBLAS_STATUS_INVALID_VALUE";
|
|
|
|
case CUBLAS_STATUS_ARCH_MISMATCH:
|
|
return "CUBLAS_STATUS_ARCH_MISMATCH";
|
|
|
|
case CUBLAS_STATUS_MAPPING_ERROR:
|
|
return "CUBLAS_STATUS_MAPPING_ERROR";
|
|
|
|
case CUBLAS_STATUS_EXECUTION_FAILED:
|
|
return "CUBLAS_STATUS_EXECUTION_FAILED";
|
|
|
|
case CUBLAS_STATUS_INTERNAL_ERROR:
|
|
return "CUBLAS_STATUS_INTERNAL_ERROR";
|
|
}
|
|
|
|
return "<unknown>";
|
|
}
|
|
#endif
|
|
|
|
}; // namespace CudaTools
|
|
#endif // CUDATOOLS_IMPLEMENTATION
|
|
|
|
#endif // CUDATOOLS_H
|
|
|