#ifndef CUDATOOLS_H
#define CUDATOOLS_H

#include "Macros.h"
#include <functional>
#include <iostream>
#include <string>
#include <tuple>
#include <unordered_map>
#include <vector>

namespace CudaTools {

struct Event;

/**
 * Simple wrapper for the name of a stream. Its purpose is to allow 'streams'
 * to be passed in host code, and to provide simple syntax for waiting.
 */
struct StreamID {
  public:
    std::string mId;
    StreamID() : mId(""){};
    /**
     * The constructor for a StreamID.
     */
    StreamID(const std::string& id_) : mId(id_){};
    StreamID(const char* id_) : mId(id_){};

    void wait() const; /**< Makes the host wait for this stream. */
    /**
     * Makes this stream wait for the given event. Does not block the host.
     */
    void wait(const Event& event) const;
};

static const StreamID DEF_MEM_STREAM = StreamID{"defaultMemory"};
static const StreamID DEF_CUBLAS_STREAM = StreamID{"defaultCublas"};
static const StreamID DEF_KERNEL_STREAM = StreamID{"defaultKernel"};
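
// Usage sketch (illustrative, not part of the API surface): the default stream
// names above are registered by the global Manager, so a StreamID can be used
// directly to order work and to block the host. For example, after launching a
// kernel on DEF_KERNEL_STREAM:
//
//     CudaTools::DEF_KERNEL_STREAM.wait(); // host blocks until the stream is done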

/**
 * Allocates memory on the device.
 */
void* malloc(const size_t size);

/**
 * Frees memory on the device.
 */
void free(void* const pDevice);

/**
 * Copies memory from the source pointer to the dest pointer.
 */
StreamID copy(void* const source, void* const dest, const size_t size,
              const StreamID& stream = DEF_MEM_STREAM);
/**
 * Initializes or sets device memory to a value.
 */
StreamID memset(void* const pDevice, int value, const size_t size,
                const StreamID& stream = DEF_MEM_STREAM);
/**
 * Pins memory on the host.
 */
void pin(void* const pHost, const size_t size);
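
// A minimal sketch of the intended allocate/upload/free cycle, assuming a host
// buffer 'data' of 'bytes' bytes already exists (both names are illustrative):
//
//     void* pDev = CudaTools::malloc(bytes);     // device allocation (nullptr on CPU-only builds)
//     CudaTools::pin(data, bytes);               // optional: pin host memory for async copies
//     CudaTools::memset(pDev, 0, bytes).wait();  // clear the device buffer, then block the host
//     CudaTools::copy(data, pDev, bytes).wait(); // host -> device on DEF_MEM_STREAM, then block
//     CudaTools::free(pDev);                     // release device memory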

#ifdef CUDACC
cudaDeviceProp getDeviceProp();
static cudaDeviceProp DeviceProperties = getDeviceProp();
const char* cublasGetErrorString(cublasStatus_t status);
#endif

/**
 * A class that manages various CUDA Runtime components, such as
 * streams, events, and handles.
 */
class Manager {
  private:
    static Manager mManagerInstance;
    Manager(const std::vector<std::string>& names);
    ~Manager();
#ifdef CUDACC
    std::unordered_map<std::string, cudaStream_t> mStreams;
    cublasHandle_t mCublas;
    // cusparseHandle_t mCusparse;
#endif
  public:
    /**
     * Used to get the global CudaTools::Manager instance.
     */
    static Manager* get() { return &mManagerInstance; };

    void waitFor(const StreamID& stream) const; /**< Waits for the stream provided. */
    void sync() const;                          /**< Waits until all device code has finished. */
    void addStream(const std::string& name);    /**< Creates a stream with the given name. */
#ifdef CUDACC
    cudaStream_t stream(const StreamID& stream) const;
    cublasHandle_t cublasHandle() const;
    // cusparseHandle_t cusparseHandle() const;
#endif
};
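
// Usage sketch, relying on the global instance defined in the implementation
// section below (the stream name "myStream" is illustrative):
//
//     CudaTools::Manager::get()->addStream("myStream"); // register an extra stream
//     CudaTools::Manager::get()->waitFor("myStream");   // block the host on that stream
//     CudaTools::Manager::get()->sync();                // block the host on the whole device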

namespace Kernel {

/**
 * A struct that contains the kernel launch parameters.
 */
struct Settings {
  public:
#ifdef CUDACC
    dim3 blockGrid;
    dim3 threadBlock;
    size_t sharedMemoryBytes = 0;
#else
    size_t threads;
#endif
    StreamID stream;

    Settings() = default;

    void setGridDim(const size_t x);                 /**< Sets the Grid dimensions. */
    void setGridDim(const size_t x, const size_t y); /**< Sets the Grid dimensions. */
    void setGridDim(const size_t x, const size_t y,
                    const size_t z);                  /**< Sets the Grid dimensions. */
    void setBlockDim(const size_t x);                 /**< Sets the Thread Block dimensions. */
    void setBlockDim(const size_t x, const size_t y); /**< Sets the Thread Block dimensions. */
    void setBlockDim(const size_t x, const size_t y,
                     const size_t z); /**< Sets the Thread Block dimensions. */

    void setSharedMemSize(const size_t bytes); /**< Sets the static shared memory size. */
    void setStream(const StreamID& stream);    /**< Sets the stream. */
};
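
// A hand-configured Settings sketch (the values are illustrative; on CPU-only
// builds only the 'threads' and 'stream' members exist and the setters are no-ops):
//
//     CudaTools::Kernel::Settings sett;
//     sett.setGridDim(64);            // 64 blocks
//     sett.setBlockDim(256);          // 256 threads per block
//     sett.setSharedMemSize(1 << 10); // 1 KiB of static shared memory
//     sett.setStream(CudaTools::DEF_KERNEL_STREAM);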

/**
 * Returns kernel launch parameters based on the number of threads, and optionally
 * a stream. Should only be used for 'embarrassingly parallel' situations, or where
 * each thread corresponds to some sort of index.
 */
Settings basic(const size_t threads, const StreamID& stream = DEF_KERNEL_STREAM);

/**
 * Launches a kernel with the provided function, settings and its arguments.
 */
template <typename F, typename... Args>
StreamID launch(F func, const Kernel::Settings& sett, Args... args) {
#ifdef CUDACC
    func<<<sett.blockGrid, sett.threadBlock, sett.sharedMemoryBytes,
           Manager::get()->stream(sett.stream.mId)>>>(args...);
#else
    func(args...);
#endif
    return sett.stream;
}
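
// A minimal launch sketch. 'myKernel' is a hypothetical __global__ kernel on GPU
// builds (a plain callable on CPU-only builds) taking (float*, size_t):
//
//     auto sett = CudaTools::Kernel::basic(n);                   // roughly one thread per element
//     CudaTools::Kernel::launch(myKernel, sett, pDev, n).wait(); // launch, then block the host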

}; // namespace Kernel

template <typename T> class Array;

/**
 * A class that holds information about an Array.
 */
class Shape {
  private:
    template <typename T> friend class Array;
    uint32_t mAxes;
    uint32_t mItems;
    uint32_t mAxisDim[CUDATOOLS_ARRAY_MAX_AXES] = {0};
    uint32_t mStride[CUDATOOLS_ARRAY_MAX_AXES] = {0};

  public:
    HD Shape() : mAxes(0), mItems(1){};
    /**
     * The constructor for a Shape.
     * \param dims an initializer list of the dimensions.
     */
    HD Shape(const std::initializer_list<uint32_t> dims);

    HD uint32_t axes() const;  /**< Gets the number of axes. */
    HD uint32_t items() const; /**< Gets the total number of items. */

    HD uint32_t length() const; /**< For 1D shapes, gets the length. In general, gets the
                                   dimension of the last axis. */
    HD uint32_t rows() const;   /**< For 2D shapes, gets the number of rows. In general, gets the
                                   dimension of the second to last axis. */
    HD uint32_t cols() const;   /**< For 2D shapes, gets the number of columns. In general, gets
                                   the dimension of the last axis. */

    HD uint32_t
    dim(const uint32_t axis) const; /**< Gets the dimension size of the specified axis. */
    HD uint32_t stride(const uint32_t axis) const; /**< Gets the stride of the specified axis. */

    /**
     * Gets the shape at a specific axis of this shape.
     * \param axis the axis where the new shape starts.
     */
    HD Shape subshape(const uint32_t axis) const;

    HD bool operator==(const Shape& s) const; /**< Equals operator. */
    HD bool operator!=(const Shape& s) const; /**< Not equals operator. */
};
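
// Example, assuming CUDATOOLS_ARRAY_MAX_AXES >= 3: Shape({2, 3, 4}) describes a
// 2x3x4 array, so
//
//     CudaTools::Shape s({2, 3, 4});
//     s.axes();  // 3
//     s.items(); // 24
//     s.rows();  // 3 (second to last axis)
//     s.cols();  // 4 (last axis)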

std::ostream& operator<<(std::ostream& out, const Shape& s);

/**
 * A simple class that manages a CUDA Event.
 */
struct Event {
#ifdef CUDACC
    cudaEvent_t mEvent;
#endif
    Event();
    ~Event();
    void record(const StreamID& stream); /**< Records an event on the given stream. */
};
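
// Usage sketch for ordering two streams without blocking the host (the stream
// names are illustrative and must have been registered with the Manager):
//
//     CudaTools::Event ev;
//     ev.record("streamA");                    // mark a point on streamA
//     CudaTools::StreamID("streamB").wait(ev); // streamB waits for that point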

template <typename F, typename... Args> struct FuncHolder {
    F mFunc;
    std::tuple<Args...> mArgs;
    FuncHolder() = delete;
    FuncHolder(F func, Args... args) : mFunc(func), mArgs(std::make_tuple(args...)){};
    static void run(void* data) {
        FuncHolder<F, Args...>* fh = (FuncHolder<F, Args...>*)(data);
        std::apply([fh](auto&&... args) { fh->mFunc(args...); }, fh->mArgs);
    };
};

/**
 * A class that manages CUDA Graphs.
 */
template <typename F, typename... Args> class Graph {
  private:
#ifdef CUDACC
    cudaGraph_t mGraph;
    cudaGraphExec_t mInstance;
#endif
    FuncHolder<F, Args...> mFuncHolder;
    StreamID mStream;

  public:
    Graph() = delete;
    /**
     * The constructor for a Graph, which captures the function.
     * \param func the function to capture.
     * \param stream the origin stream to use.
     * \param args the arguments of the function.
     */
    Graph(const StreamID& stream, F func, Args... args)
        : mFuncHolder(func, args...), mStream(stream) {
#ifdef CUDACC
        CUDA_CHECK(
            cudaStreamBeginCapture(Manager::get()->stream(mStream), cudaStreamCaptureModeGlobal));
        mFuncHolder.run((void*)&mFuncHolder);
        CUDA_CHECK(cudaStreamEndCapture(Manager::get()->stream(mStream), &mGraph));
        CUDA_CHECK(cudaGraphInstantiate(&mInstance, mGraph, NULL, NULL, 0));
#endif
    };

    ~Graph() {
#ifdef CUDACC
        cudaGraphDestroy(mGraph);
        cudaGraphExecDestroy(mInstance);
#endif
    };

    /**
     * Executes the instantiated graph, or simply runs the function with provided
     * arguments if compiling for CPU.
     */
    StreamID execute() const {
#ifdef CUDACC
        cudaGraphLaunch(mInstance, Manager::get()->stream(mStream));
#else
        mFuncHolder.run((void*)&mFuncHolder);
#endif
        return mStream;
    }
};
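
// A minimal capture sketch, assuming C++17 class template argument deduction.
// 'work' is a hypothetical callable that only issues stream-capturable calls
// (kernel launches, async copies) on the given stream:
//
//     CudaTools::Graph graph(CudaTools::DEF_KERNEL_STREAM, work, pDev, n); // capture once
//     graph.execute().wait();                                              // replay, block host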

/**
 * A struct to facilitate other CUDA Graph functionality, such as creating branches
 * and host callbacks.
 */
struct GraphManager {
    std::vector<void*> mHostData;
    std::vector<Event*> mEvents;

    ~GraphManager();

    /**
     * Within a function that is being stream captured, launches a host function that
     * can be captured into the graph.
     */
    template <typename F, typename... Args>
    void launchHostFunction(const StreamID& stream, F func, Args&&... args) {
#ifdef CUDACC
        FuncHolder<F, Args...>* fh = new FuncHolder<F, Args...>(func, args...);
        mHostData.push_back((void*)fh);
        cudaHostFn_t run_func = fh->run;
        CUDA_CHECK(cudaLaunchHostFunc(Manager::get()->stream(stream), run_func, fh));
#else
        func(args...);
#endif
    }

    /**
     * Makes a new branch in the graph to be run in parallel by a new stream.
     * \param orig_stream the original stream to branch from.
     * \param branch_stream the stream of the new branch.
     */
    void makeBranch(const StreamID& orig_stream, const StreamID& branch_stream);
    /**
     * Joins an existing branch in the graph to collapse a parallel block.
     * \param orig_stream the original stream to join the branch to.
     * \param branch_stream the stream of the branch to join.
     */
    void joinBranch(const StreamID& orig_stream, const StreamID& branch_stream);
};
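
// A branching sketch inside a function that is being stream captured ('gm' is a
// GraphManager that outlives the capture; the stream names are illustrative and
// must be registered with the Manager):
//
//     gm.makeBranch("mainStream", "branchStream"); // fork: branchStream waits on mainStream
//     // ... enqueue independent work on both streams ...
//     gm.joinBranch("mainStream", "branchStream"); // join: mainStream waits on branchStream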

}; // namespace CudaTools
#endif // CUDATOOLS_H

#ifdef CUDATOOLS_IMPLEMENTATION
#ifndef __CUDATOOLS_IMPLEMENTED__
#define __CUDATOOLS_IMPLEMENTED__
namespace CudaTools {

//////////////////////
// StreamID Methods //
//////////////////////

void StreamID::wait() const { Manager::get()->waitFor(mId); }

void StreamID::wait(const Event& event) const {
#ifdef CUDACC
    CUDA_CHECK(cudaStreamWaitEvent(Manager::get()->stream(mId), event.mEvent, 0));
#endif
}

////////////////////
// Memory Methods //
////////////////////

void* malloc(const size_t size) {
#ifdef CUDACC
    void* pDevice;
    CUDA_CHECK(cudaMalloc(&pDevice, size));
    return pDevice;
#else
    return nullptr;
#endif
}

void free(void* const pDevice) {
#ifdef CUDACC
    if (pDevice != nullptr) CUDA_CHECK(cudaFree(pDevice));
#endif
}

StreamID copy(void* const source, void* const dest, const size_t size, const StreamID& stream) {
#ifdef CUDACC
    CUDA_CHECK(
        cudaMemcpyAsync(dest, source, size, cudaMemcpyDefault, Manager::get()->stream(stream)));
#endif
    return stream;
}

StreamID memset(void* const pDevice, const int value, const size_t size, const StreamID& stream) {
#ifdef CUDACC
    CUDA_CHECK(cudaMemsetAsync(pDevice, value, size, Manager::get()->stream(stream)));
#endif
    return stream;
}

void pin(void* const pHost, const size_t size) {
#ifdef CUDACC
    CUDA_CHECK(cudaHostRegister(pHost, size, cudaHostRegisterDefault));
#endif
}

#ifdef CUDACC
cudaDeviceProp getDeviceProp() {
    cudaSetDevice(0);
    cudaDeviceProp deviceProp;
    cudaGetDeviceProperties(&deviceProp, 0);
    return deviceProp;
}
#endif

/////////////////////
// Manager Methods //
/////////////////////

Manager::Manager(const std::vector<std::string>& names) {
#ifdef CUDACC
    for (auto name : names) {
        addStream(name);
    }
    CUBLAS_CHECK(cublasCreate(&mCublas));
    // CUSPARSE_CHECK(cusparseCreate(&mCusparse));
#endif
}

Manager::~Manager() {
#ifdef CUDACC
    for (auto& it : mStreams) {
        cudaStreamDestroy(it.second);
    }
    cublasDestroy(mCublas);
    // CUSPARSE_CHECK(cusparseDestroy(mCusparse));
#endif
}

void Manager::waitFor(const StreamID& stream) const {
#ifdef CUDACC
    auto it = mStreams.find(stream.mId);
    if (it != mStreams.end()) {
        CUDA_CHECK(cudaStreamSynchronize(it->second));
    } else {
        CT_ERROR(true, ("Invalid stream " + stream.mId).c_str());
    }
#endif
}

void Manager::sync() const {
#ifdef CUDACC
    CUDA_CHECK(cudaDeviceSynchronize());
#endif
}

void Manager::addStream(const std::string& name) {
#ifdef CUDACC
    cudaStream_t s;
    CUDA_CHECK(cudaStreamCreate(&s));
    mStreams[name] = s;
#endif
}

#ifdef CUDACC
cudaStream_t Manager::stream(const StreamID& stream) const {
    auto it = mStreams.find(stream.mId);
    if (it != mStreams.end()) {
        return it->second;
    } else {
        CT_ERROR(true, ("Invalid stream " + stream.mId).c_str());
    }
}

cublasHandle_t Manager::cublasHandle() const { return mCublas; };
// cusparseHandle_t Manager::cusparseHandle() const { return mCusparse; };

Manager Manager::mManagerInstance = Manager({"defaultMemory", "defaultCublas", "defaultKernel"});
#else
Manager Manager::mManagerInstance = Manager({""});
#endif

////////////////////
// Kernel Methods //
////////////////////

namespace Kernel {

void Settings::setGridDim(const size_t x) {
#ifdef CUDACC
    CT_ERROR_IF(x, >, DeviceProperties.maxGridSize[0], "Total grid size too large.");
    blockGrid.x = x;
    blockGrid.y = 1;
    blockGrid.z = 1;
#endif
}

void Settings::setGridDim(const size_t x, const size_t y) {
#ifdef CUDACC
    CT_ERROR_IF(x * y, >, DeviceProperties.maxGridSize[0], "Total grid size too large.");
    CT_ERROR_IF(x, >, DeviceProperties.maxGridSize[0], "Grid dimension 'x' too large.");
    CT_ERROR_IF(y, >, DeviceProperties.maxGridSize[1], "Grid dimension 'y' too large.");
    blockGrid.x = x;
    blockGrid.y = y;
    blockGrid.z = 1;
#endif
}

void Settings::setGridDim(const size_t x, const size_t y, const size_t z) {
#ifdef CUDACC
    CT_ERROR_IF(x * y * z, >, DeviceProperties.maxGridSize[0], "Total grid size too large.");
    CT_ERROR_IF(x, >, DeviceProperties.maxGridSize[0], "Grid dimension 'x' too large.");
    CT_ERROR_IF(y, >, DeviceProperties.maxGridSize[1], "Grid dimension 'y' too large.");
    CT_ERROR_IF(z, >, DeviceProperties.maxGridSize[2], "Grid dimension 'z' too large.");
    blockGrid.x = x;
    blockGrid.y = y;
    blockGrid.z = z;
#endif
}

void Settings::setBlockDim(const size_t x) {
#ifdef CUDACC
    CT_ERROR_IF(x, >, DeviceProperties.maxThreadsDim[0], "Total block size too large.");
    threadBlock.x = x;
    threadBlock.y = 1;
    threadBlock.z = 1;
#endif
}

void Settings::setBlockDim(const size_t x, const size_t y) {
#ifdef CUDACC
    CT_ERROR_IF(x * y, >, DeviceProperties.maxThreadsDim[0], "Total block size too large.");
    CT_ERROR_IF(x, >, DeviceProperties.maxThreadsDim[0], "Block dimension 'x' too large.");
    CT_ERROR_IF(y, >, DeviceProperties.maxThreadsDim[1], "Block dimension 'y' too large.");
    threadBlock.x = x;
    threadBlock.y = y;
    threadBlock.z = 1;
#endif
}

void Settings::setBlockDim(const size_t x, const size_t y, const size_t z) {
#ifdef CUDACC
    CT_ERROR_IF(x * y * z, >, DeviceProperties.maxThreadsDim[0], "Total block size too large.");
    CT_ERROR_IF(x, >, DeviceProperties.maxThreadsDim[0], "Block dimension 'x' too large.");
    CT_ERROR_IF(y, >, DeviceProperties.maxThreadsDim[1], "Block dimension 'y' too large.");
    CT_ERROR_IF(z, >, DeviceProperties.maxThreadsDim[2], "Block dimension 'z' too large.");
    threadBlock.x = x;
    threadBlock.y = y;
    threadBlock.z = z;
#endif
}

void Settings::setSharedMemSize(const size_t bytes) {
#ifdef CUDACC
    sharedMemoryBytes = bytes;
#endif
}

void Settings::setStream(const StreamID& stream_) {
#ifdef CUDACC
    stream = stream_;
#endif
}

Settings basic(const size_t threads, const StreamID& stream) {
    Settings sett;
#ifdef CUDACC
    auto max_threads = DeviceProperties.maxThreadsPerBlock;
    size_t grid_blocks = (threads + max_threads - 1) / max_threads;   // ceil(threads / max_threads)
    size_t block_threads = (threads + grid_blocks - 1) / grid_blocks; // ceil(threads / grid_blocks)
    sett.setGridDim(grid_blocks);
    sett.setBlockDim(block_threads);
    sett.setStream(stream);
#else
    sett.threads = threads;
#endif
    return sett;
}
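
// Worked example of the split above, assuming maxThreadsPerBlock == 1024: for
// threads == 3000, grid_blocks = ceil(3000 / 1024) = 3 and block_threads =
// ceil(3000 / 3) = 1000, giving 3 x 1000 = 3000 threads in total. The launch
// never provides fewer threads than requested.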

}; // namespace Kernel

/////////////////////
// Shape Functions //
/////////////////////

HD Shape::Shape(const std::initializer_list<uint32_t> dims) : mAxes(dims.size()), mItems(1) {
    CT_ERROR_IF(dims.size(), >, CUDATOOLS_ARRAY_MAX_AXES, "Number of axes exceeds max axes.");
    mAxes = dims.size();
    if (mAxes == 0) return;

    // Walk the dimensions from last to first, accumulating row-major strides.
    // The loop ends once iAxis wraps around after decrementing past zero.
    auto it = dims.end() - 1;
    mItems = 1;
    for (uint32_t iAxis = mAxes - 1; iAxis < mAxes; --iAxis) {
        uint32_t dim = *it;
        CT_ERROR_IF(dim, ==, 0, "Axis dimension cannot be 0.");

        mAxisDim[iAxis] = dim;
        mStride[iAxis] = mItems;
        mItems *= dim;
        --it;
    }

    if (mAxes == 1) return;
    // Swap the last two strides, for column major storage.
    mStride[mAxes - 2] = 1;
    mStride[mAxes - 1] = mAxisDim[mAxes - 2];
}
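
// Worked example of the stride layout above: Shape({2, 3}) first builds row-major
// strides {3, 1}; the final swap yields column-major strides {1, 2}, so element
// (i, j) lives at index i + j * 2.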

HD uint32_t Shape::axes() const { return mAxes; };
HD uint32_t Shape::items() const { return mItems; };
HD uint32_t Shape::length() const { return mAxisDim[mAxes - 1]; }

HD uint32_t Shape::rows() const { return mAxisDim[mAxes - 2]; }

HD uint32_t Shape::cols() const { return mAxisDim[mAxes - 1]; }

HD uint32_t Shape::dim(const uint32_t axis) const { return mAxisDim[axis]; }
HD uint32_t Shape::stride(const uint32_t axis) const { return mStride[axis]; }

HD bool Shape::operator==(const Shape& s) const {
    if (mAxes != s.mAxes) {
        return false;
    }
    for (uint32_t iAxis = 0; iAxis < mAxes; ++iAxis) {
        if (mAxisDim[iAxis] != s.mAxisDim[iAxis]) {
            return false;
        }
    }
    return true;
}

HD bool Shape::operator!=(const Shape& s) const { return not(*this == s); }

HD Shape Shape::subshape(const uint32_t axis) const {
    CT_ERROR_IF(axis, >, mAxes, "Axis number exceeds number of axes.");
    if (axis == mAxes) return Shape({1});

    Shape new_shape({});
    new_shape.mAxes = mAxes - axis;
    new_shape.mItems = mItems;

    for (uint32_t iAxis = 0; iAxis < axis; iAxis++) {
        new_shape.mItems /= mAxisDim[iAxis];
    }
    for (uint32_t iAxis = axis; iAxis < mAxes; iAxis++) {
        new_shape.mAxisDim[iAxis - axis] = mAxisDim[iAxis];
        new_shape.mStride[iAxis - axis] = mStride[iAxis];
    }
    return new_shape;
}
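
// Example: for Shape({2, 3, 4}), subshape(1) drops the leading axis and returns a
// Shape describing a 3x4 block (12 items) that keeps the original strides of the
// remaining axes; subshape(3) returns Shape({1}).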

std::ostream& operator<<(std::ostream& out, const Shape& s) {
    out << "(";
    if (s.axes() == 0) return out << ")";
    for (uint32_t iAxis = 0; iAxis < s.axes() - 1; ++iAxis) {
        out << s.dim(iAxis) << ", ";
    }
    return out << s.dim(s.axes() - 1) << ")";
}

///////////////////
// Event Methods //
///////////////////

Event::Event() {
#ifdef CUDACC
    CUDA_CHECK(cudaEventCreate(&mEvent));
#endif
}

Event::~Event() {
#ifdef CUDACC
    cudaEventDestroy(mEvent);
#endif
}

void Event::record(const StreamID& stream) {
#ifdef CUDACC
    CUDA_CHECK(cudaEventRecord(mEvent, Manager::get()->stream(stream)));
#endif
}

//////////////////////////
// GraphManager Methods //
//////////////////////////

GraphManager::~GraphManager() {
#ifdef CUDACC
    for (Event* event : mEvents) {
        delete event;
    }
#endif
}

void GraphManager::makeBranch(const StreamID& orig_stream, const StreamID& branch_stream) {
    Event* event = new Event();
    event->record(orig_stream);
    mEvents.push_back(event);
    branch_stream.wait(*event);
}

void GraphManager::joinBranch(const StreamID& orig_stream, const StreamID& branch_stream) {
    Event* event = new Event();
    event->record(branch_stream);
    mEvents.push_back(event);
    orig_stream.wait(*event);
}

}; // namespace CudaTools
#endif
#endif // CUDATOOLS_IMPLEMENTATION