You can not select more than 25 topics
			Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
		
		
		
		
		
			
		
			
				
					
					
						
							683 lines
						
					
					
						
							19 KiB
						
					
					
				
			
		
		
	
	
							683 lines
						
					
					
						
							19 KiB
						
					
					
				| #ifndef CUDATOOLS_H
 | |
| #define CUDATOOLS_H
 | |
| 
 | |
| #include "Macros.h"
 | |
| #include <functional>
 | |
| #include <iostream>
 | |
| #include <string>
 | |
| #include <tuple>
 | |
| #include <unordered_map>
 | |
| #include <vector>
 | |
| 
 | |
| namespace CudaTools {
 | |
| 
 | |
| struct Event;
 | |
| /**
 | |
|  * Simple wrapper for the name of a stream. Its purposes is to allow for
 | |
|  * 'streams' to be passed on host code, and allowing for simple syntax
 | |
|  * for waiting.
 | |
|  */
 | |
| struct StreamID {
 | |
|   public:
 | |
|     std::string mId;
 | |
|     StreamID() : mId(""){};
 | |
|     /**
 | |
|      * The constructor for a StreamID.
 | |
|      */
 | |
|     StreamID(const std::string& id_) : mId(id_){};
 | |
|     StreamID(const char* id_) : mId(id_){};
 | |
| 
 | |
|     void wait() const; /**< Makes host wait for this stream. */
 | |
|     /**
 | |
|      * Makes this stream wait for this event. Does not block the host.
 | |
|      */
 | |
|     void wait(const Event& event) const;
 | |
| };
 | |
| 
 | |
| static const StreamID DEF_MEM_STREAM = StreamID{"defaultMemory"};
 | |
| static const StreamID DEF_CUBLAS_STREAM = StreamID{"defaultCublas"};
 | |
| static const StreamID DEF_KERNEL_STREAM = StreamID{"defaultKernel"};
 | |
| 
 | |
| /**
 | |
|  * Allocates memory on the device.
 | |
|  */
 | |
| void* malloc(const size_t size);
 | |
| 
 | |
| /**
 | |
|  * Frees memory on the device.
 | |
|  */
 | |
| void free(void* const pDevice);
 | |
| 
 | |
| /**
 | |
|  * Copies memory from the source pointer to the dest pointer.
 | |
|  */
 | |
| StreamID copy(void* const source, void* const dest, const size_t size,
 | |
|               const StreamID& stream = DEF_MEM_STREAM);
 | |
| /**
 | |
|  * Initializes or sets device memory to a value.
 | |
|  */
 | |
| StreamID memset(void* const pDevice, int value, const size_t size,
 | |
|                 const StreamID& stream = DEF_MEM_STREAM);
 | |
| /**
 | |
|  * Pins memory on the host.
 | |
|  */
 | |
| void pin(void* const pHost, const size_t size);
 | |
| 
 | |
| #ifdef CUDACC
 | |
| cudaDeviceProp getDeviceProp();
 | |
| static cudaDeviceProp DeviceProperties = getDeviceProp();
 | |
| const char* cublasGetErrorString(cublasStatus_t status);
 | |
| #endif
 | |
| 
 | |
| /**
 | |
|  * A class that manages various CUDA Runtime components, such as
 | |
|  * streams, events, and handles.
 | |
|  */
 | |
| class Manager {
 | |
|   private:
 | |
|     static Manager mManagerInstance;
 | |
|     Manager(const std::vector<std::string>& names);
 | |
|     ~Manager();
 | |
| #ifdef CUDACC
 | |
|     std::unordered_map<std::string, cudaStream_t> mStreams;
 | |
|     cublasHandle_t mCublas;
 | |
|     // cusparseHandle_t mCusparse;
 | |
| #endif
 | |
|   public:
 | |
|     /**
 | |
|      * Used to get the global CudaTools::Manager instance.
 | |
|      */
 | |
|     static Manager* get() { return &mManagerInstance; };
 | |
| 
 | |
|     void waitFor(const StreamID& stream) const; /**< Waits for the stream provided. */
 | |
|     void sync() const;                          /**< Waits until all device code has finished. */
 | |
|     void addStream(const std::string& name);    /**< Creates a stream with the given name. */
 | |
| #ifdef CUDACC
 | |
|     cudaStream_t stream(const StreamID& stream) const;
 | |
|     cublasHandle_t cublasHandle() const;
 | |
|     // cusparseHandle_t cusparseHandle() const;
 | |
| #endif
 | |
| };
 | |
| 
 | |
| namespace Kernel {
 | |
| 
 | |
| /**
 | |
|  * A struct that contains the kernel launch parameters.
 | |
|  */
 | |
| struct Settings {
 | |
|   public:
 | |
| #ifdef CUDACC
 | |
|     dim3 blockGrid;
 | |
|     dim3 threadBlock;
 | |
|     size_t sharedMemoryBytes = 0;
 | |
| #else
 | |
|     size_t threads;
 | |
| #endif
 | |
|     StreamID stream;
 | |
| 
 | |
|     Settings() = default;
 | |
| 
 | |
|     void setGridDim(const size_t x);                 /**< Sets the Grid dimensions. */
 | |
|     void setGridDim(const size_t x, const size_t y); /**< Sets the Grid dimensions. */
 | |
|     void setGridDim(const size_t x, const size_t y,
 | |
|                     const size_t z);                  /**< Sets the Grid dimensions. */
 | |
|     void setBlockDim(const size_t x);                 /**< Sets the Thread Block dimensions. */
 | |
|     void setBlockDim(const size_t x, const size_t y); /**< Sets the Thread Block dimensions. */
 | |
|     void setBlockDim(const size_t x, const size_t y,
 | |
|                      const size_t z); /**< Sets the Thread Block dimensions. */
 | |
| 
 | |
|     void setSharedMemSize(const size_t bytes); /**< Sets the static shared memory size. */
 | |
|     void setStream(const StreamID& stream);    /**< Sets the stream. */
 | |
| };
 | |
| 
 | |
| /**
 | |
|  * Returns a kernel launch parameters based on the number of threads, and optionally
 | |
|  * a stream. Should only be used for 'embarassingly parallel' situations, or where
 | |
|  * each thread corresponds some sort of index.
 | |
|  */
 | |
| Settings basic(const size_t threads, const StreamID& stream = DEF_KERNEL_STREAM);
 | |
| 
 | |
| /**
 | |
|  * Launches a kernel with the provided function, settings and its arguments.
 | |
|  */
 | |
| 
 | |
| template <typename F, typename... Args>
 | |
| StreamID launch(F func, const Kernel::Settings& sett, Args... args) {
 | |
| #ifdef CUDACC
 | |
|     func<<<sett.blockGrid, sett.threadBlock, sett.sharedMemoryBytes,
 | |
|            Manager::get()->stream(sett.stream.mId)>>>(args...);
 | |
| #else
 | |
|     func(args...);
 | |
| #endif
 | |
|     return sett.stream;
 | |
| }
 | |
| }; // namespace Kernel
 | |
| 
 | |
| template <typename T> class Array;
 | |
| 
 | |
| /**
 | |
|  * A class that holds information about an Array.
 | |
|  */
 | |
| class Shape {
 | |
|   private:
 | |
|     template <typename T> friend class Array;
 | |
|     uint32_t mAxes;
 | |
|     uint32_t mItems;
 | |
|     uint32_t mAxisDim[CUDATOOLS_ARRAY_MAX_AXES] = {0};
 | |
|     uint32_t mStride[CUDATOOLS_ARRAY_MAX_AXES] = {0};
 | |
| 
 | |
|   public:
 | |
|     HD Shape() : mAxes(0), mItems(1){};
 | |
|     /**
 | |
|      * The constructor for a Shape.
 | |
|      * \param dims an initializer list of the dimensions.
 | |
|      */
 | |
|     HD Shape(const std::initializer_list<uint32_t> dims);
 | |
| 
 | |
|     HD uint32_t axes() const;  /**< Gets the number of axes. */
 | |
|     HD uint32_t items() const; /**< Gets the total number of items. */
 | |
| 
 | |
|     HD uint32_t length() const; /**< For 1D shapes, gets the length. In general, gets the
 | |
|                                    dimension of the last axis. */
 | |
|     HD uint32_t rows() const;   /**< For 2D shapes, gets the number of rows. In general, gets the
 | |
|                                    dimension of the second to last axis. */
 | |
|     HD uint32_t cols() const;   /**< For 2D shapes, gets the number of columns. In general, gets
 | |
|                                    the dimension of the second to last axis. */
 | |
| 
 | |
|     HD uint32_t
 | |
|     dim(const uint32_t axis) const; /**< Gets the dimension size of the specified axis. */
 | |
|     HD uint32_t stride(const uint32_t axis) const; /**< Gets the stride of the specified axis. */
 | |
| 
 | |
|     /**
 | |
|      * Gets the shape at a specific axis of this shape.
 | |
|      * \param axis the axis of where the new shape starts.
 | |
|      */
 | |
|     HD Shape subshape(const uint32_t axis) const;
 | |
| 
 | |
|     HD bool operator==(const Shape& s) const; /**< Equals operator. */
 | |
|     HD bool operator!=(const Shape& s) const; /**< Not equals operator. */
 | |
| };
 | |
| 
 | |
| std::ostream& operator<<(std::ostream& out, const Shape& s);
 | |
| 
 | |
| /**
 | |
|  * A simple class that manages a CUDA Event.
 | |
|  */
 | |
| struct Event {
 | |
| #ifdef CUDACC
 | |
|     cudaEvent_t mEvent;
 | |
| #endif
 | |
|     Event();
 | |
|     ~Event();
 | |
|     void record(const StreamID& stream); /**< Records a event from a stream. */
 | |
| };
 | |
| 
 | |
| template <typename F, typename... Args> struct FuncHolder {
 | |
|     F mFunc;
 | |
|     std::tuple<Args...> mArgs;
 | |
|     FuncHolder() = delete;
 | |
|     FuncHolder(F func, Args... args) : mFunc(func), mArgs(std::make_tuple(args...)){};
 | |
|     static void run(void* data) {
 | |
|         FuncHolder<F, Args...>* fh = (FuncHolder<F, Args...>*)(data);
 | |
|         std::apply([fh](auto&&... args) { fh->mFunc(args...); }, fh->mArgs);
 | |
|     };
 | |
| };
 | |
| 
 | |
| /**
 | |
|  * A class that manages CUDA Graphs.
 | |
|  */
 | |
| template <typename F, typename... Args> class Graph {
 | |
|   private:
 | |
| #ifdef CUDACC
 | |
|     cudaGraph_t mGraph;
 | |
|     cudaGraphExec_t mInstance;
 | |
| #endif
 | |
|     FuncHolder<F, Args...> mFuncHolder;
 | |
|     StreamID mStream;
 | |
| 
 | |
|   public:
 | |
|     Graph() = delete;
 | |
|     /**
 | |
|      * The constructor for a Graph, which captures the function.
 | |
|      * \param func the function to capture.
 | |
|      * \param stream the origin stream to use.
 | |
|      * \param args the arguments of the function.
 | |
|      */
 | |
|     Graph(const StreamID& stream, F func, Args... args)
 | |
|         : mFuncHolder(func, args...), mStream(stream) {
 | |
| #ifdef CUDACC
 | |
|         CUDA_CHECK(
 | |
|             cudaStreamBeginCapture(Manager::get()->stream(mStream), cudaStreamCaptureModeGlobal));
 | |
|         mFuncHolder.run((void*)&mFuncHolder);
 | |
|         CUDA_CHECK(cudaStreamEndCapture(Manager::get()->stream(mStream), &mGraph));
 | |
|         CUDA_CHECK(cudaGraphInstantiate(&mInstance, mGraph, NULL, NULL, 0));
 | |
| #endif
 | |
|     };
 | |
| 
 | |
|     ~Graph() {
 | |
| #ifdef CUDACC
 | |
|         cudaGraphDestroy(mGraph);
 | |
|         cudaGraphExecDestroy(mInstance);
 | |
| #endif
 | |
|     };
 | |
| 
 | |
|     /**
 | |
|      * Executes the instantiated graph, or simply runs the function with provided
 | |
|      * arguments if compiling for CPU.
 | |
|      */
 | |
|     StreamID execute() const {
 | |
| #ifdef CUDACC
 | |
|         cudaGraphLaunch(mInstance, Manager::get()->stream(mStream));
 | |
| #else
 | |
|         mFuncHolder.run((void*)&mFuncHolder);
 | |
| #endif
 | |
|         return mStream;
 | |
|     }
 | |
| };
 | |
| 
 | |
| /**
 | |
|  * A struct to facilitate other CUDA Graphs functionality like creating branches and host callbacks.
 | |
|  */
 | |
| struct GraphManager {
 | |
|     std::vector<void*> mHostData;
 | |
|     std::vector<Event*> mEvents;
 | |
| 
 | |
|     ~GraphManager();
 | |
| 
 | |
|     /**
 | |
|      * Within a function that is being stream captured, launch a host function that can
 | |
|      * be captured into the graph.
 | |
|      */
 | |
| 
 | |
|     template <typename F, typename... Args>
 | |
|     void launchHostFunction(const StreamID& stream, F func, Args&&... args) {
 | |
| #ifdef CUDACC
 | |
|         FuncHolder<F, Args...>* fh = new FuncHolder<F, Args...>(func, args...);
 | |
|         mHostData.push_back((void*)fh);
 | |
|         cudaHostFn_t run_func = fh->run;
 | |
|         CUDA_CHECK(cudaLaunchHostFunc(Manager::get()->stream(stream), run_func, fh));
 | |
| #else
 | |
|         func(args...);
 | |
| #endif
 | |
|     }
 | |
| 
 | |
|     /**
 | |
|      * Makes a new branch in the graph to be run in parallel by a new stream.
 | |
|      * \param orig_stream the original stream to branch from.
 | |
|      * \param branch_stream the stream of the new branch.
 | |
|      */
 | |
|     void makeBranch(const StreamID& orig_stream, const StreamID& branch_stream);
 | |
|     /**
 | |
|      * Joins a existing branch in the graph to collapse a parallel block.
 | |
|      * \param orig_stream the original stream to join the branch to.
 | |
|      * \param branch_stream the stream of the branch to join.
 | |
|      */
 | |
|     void joinBranch(const StreamID& orig_stream, const StreamID& branch_stream);
 | |
| };
 | |
| 
 | |
| }; // namespace CudaTools
 | |
| #endif // CUDATOOLS_H
 | |
| 
 | |
| #ifdef CUDATOOLS_IMPLEMENTATION
 | |
| #ifndef __CUDATOOLS_IMPLEMENTED__
 | |
| #define __CUDATOOLS_IMPLEMENTED__
 | |
| namespace CudaTools {
 | |
| 
 | |
| //////////////////////
 | |
| // StreamID Methods //
 | |
| //////////////////////
 | |
| 
 | |
| void StreamID::wait() const { Manager::get()->waitFor(mId); }
 | |
| 
 | |
| void StreamID::wait(const Event& event) const {
 | |
| #ifdef CUDACC
 | |
|     CUDA_CHECK(cudaStreamWaitEvent(Manager::get()->stream(mId), event.mEvent, 0));
 | |
| #endif
 | |
| }
 | |
| 
 | |
| ////////////////////
 | |
| // Memory Methods //
 | |
| ////////////////////
 | |
| 
 | |
| void* malloc(const size_t size) {
 | |
| #ifdef CUDACC
 | |
|     void* pDevice;
 | |
|     CUDA_CHECK(cudaMalloc(&pDevice, size));
 | |
|     return pDevice;
 | |
| #else
 | |
|     return nullptr;
 | |
| #endif
 | |
| }
 | |
| 
 | |
| void free(void* const pDevice) {
 | |
| #ifdef CUDACC
 | |
|     if (pDevice != nullptr) CUDA_CHECK(cudaFree(pDevice));
 | |
| #endif
 | |
| }
 | |
| 
 | |
| StreamID copy(void* const source, void* const dest, const size_t size, const StreamID& stream) {
 | |
| #ifdef CUDACC
 | |
|     CUDA_CHECK(
 | |
|         cudaMemcpyAsync(dest, source, size, cudaMemcpyDefault, Manager::get()->stream(stream)));
 | |
| #endif
 | |
|     return stream;
 | |
| }
 | |
| 
 | |
| StreamID memset(void* const pDevice, const int value, const size_t size, const StreamID& stream) {
 | |
| #ifdef CUDACC
 | |
|     CUDA_CHECK(cudaMemsetAsync(pDevice, value, size, Manager::get()->stream(stream)));
 | |
| #endif
 | |
|     return stream;
 | |
| }
 | |
| 
 | |
| void pin(void* const pHost, const size_t size) {
 | |
| #ifdef CUDACC
 | |
|     CUDA_CHECK(cudaHostRegister(pHost, size, cudaHostRegisterDefault));
 | |
| #endif
 | |
| }
 | |
| 
 | |
| #ifdef CUDACC
 | |
| cudaDeviceProp getDeviceProp() {
 | |
|     cudaSetDevice(0);
 | |
|     cudaDeviceProp deviceProp;
 | |
|     cudaGetDeviceProperties(&deviceProp, 0);
 | |
|     return deviceProp;
 | |
| }
 | |
| #endif
 | |
| 
 | |
| /////////////////////
 | |
| // Manager Methods //
 | |
| /////////////////////
 | |
| 
 | |
| Manager::Manager(const std::vector<std::string>& names) {
 | |
| #ifdef CUDACC
 | |
|     for (auto name : names) {
 | |
|         addStream(name);
 | |
|     }
 | |
|     CUBLAS_CHECK(cublasCreate(&mCublas));
 | |
|     // CUSPARSE_CHECK(cusparseCreate(&mCusparse));
 | |
| #endif
 | |
| }
 | |
| 
 | |
| Manager::~Manager() {
 | |
| #ifdef CUDACC
 | |
|     for (auto& it : mStreams) {
 | |
|         cudaStreamDestroy(it.second);
 | |
|     }
 | |
|     cublasDestroy(mCublas);
 | |
|     // CUSPARSE_CHECK(cusparseDestroy(mCusparse));
 | |
| #endif
 | |
| }
 | |
| 
 | |
| void Manager::waitFor(const StreamID& stream) const {
 | |
| #ifdef CUDACC
 | |
|     auto it = mStreams.find(stream.mId);
 | |
|     if (it != mStreams.end()) {
 | |
|         CUDA_CHECK(cudaStreamSynchronize(it->second));
 | |
|     } else {
 | |
|         CT_ERROR(true, ("Invalid stream " + stream.mId).c_str());
 | |
|     }
 | |
| #endif
 | |
| }
 | |
| 
 | |
| void Manager::sync() const {
 | |
| #ifdef CUDACC
 | |
|     CUDA_CHECK(cudaDeviceSynchronize());
 | |
| #endif
 | |
| }
 | |
| 
 | |
| void Manager::addStream(const std::string& name) {
 | |
| #ifdef CUDACC
 | |
|     cudaStream_t s;
 | |
|     CUDA_CHECK(cudaStreamCreate(&s));
 | |
|     mStreams[name] = s;
 | |
| #endif
 | |
| }
 | |
| 
 | |
| #ifdef CUDACC
 | |
| cudaStream_t Manager::stream(const StreamID& stream) const {
 | |
|     auto it = mStreams.find(stream.mId);
 | |
|     if (it != mStreams.end()) {
 | |
|         return it->second;
 | |
|     } else {
 | |
|         CT_ERROR(true, ("Invalid stream " + stream.mId).c_str());
 | |
|     }
 | |
| }
 | |
| 
 | |
| cublasHandle_t Manager::cublasHandle() const { return mCublas; };
 | |
| // cusparseHandle_t Manager::cusparseHandle() const { return mCusparse; };
 | |
| 
 | |
| Manager Manager::mManagerInstance = Manager({"defaultMemory", "defaultCublas", "defaultKernel"});
 | |
| #else
 | |
| Manager Manager::mManagerInstance = Manager({""});
 | |
| #endif
 | |
| 
 | |
| ////////////////////
 | |
| // Kernel Methods //
 | |
| ////////////////////
 | |
| 
 | |
| namespace Kernel {
 | |
| 
 | |
| void Settings::setGridDim(const size_t x) {
 | |
| #ifdef CUDACC
 | |
|     CT_ERROR_IF(x, >, DeviceProperties.maxGridSize[0], "Total grid size too large")
 | |
|     blockGrid.x = x;
 | |
|     blockGrid.y = 1;
 | |
|     blockGrid.z = 1;
 | |
| #endif
 | |
| }
 | |
| 
 | |
| void Settings::setGridDim(const size_t x, const size_t y) {
 | |
| #ifdef CUDACC
 | |
|     CT_ERROR_IF(x * y, >, DeviceProperties.maxGridSize[0], "Total grid size too large.");
 | |
|     CT_ERROR_IF(x, >, DeviceProperties.maxGridSize[0], "Grid dimension 'x' too large.");
 | |
|     CT_ERROR_IF(y, >, DeviceProperties.maxGridSize[1], "Grid dimension 'y' too large.");
 | |
|     blockGrid.x = x;
 | |
|     blockGrid.y = y;
 | |
|     blockGrid.z = 1;
 | |
| #endif
 | |
| }
 | |
| 
 | |
| void Settings::setGridDim(const size_t x, const size_t y, const size_t z) {
 | |
| #ifdef CUDACC
 | |
|     CT_ERROR_IF(x * y * z, >, DeviceProperties.maxGridSize[0], "Total grid size too large.");
 | |
|     CT_ERROR_IF(x, >, DeviceProperties.maxGridSize[0], "Grid dimension 'x' too large.");
 | |
|     CT_ERROR_IF(y, >, DeviceProperties.maxGridSize[1], "Grid dimension 'y' too large.");
 | |
|     CT_ERROR_IF(z, >, DeviceProperties.maxGridSize[2], "Grid dimension 'z' too large.");
 | |
|     blockGrid.x = x;
 | |
|     blockGrid.y = y;
 | |
|     blockGrid.z = z;
 | |
| #endif
 | |
| }
 | |
| 
 | |
| void Settings::setBlockDim(const size_t x) {
 | |
| #ifdef CUDACC
 | |
|     CT_ERROR_IF(x, >, DeviceProperties.maxThreadsDim[0], "Total block size too large.");
 | |
|     threadBlock.x = x;
 | |
|     threadBlock.y = 1;
 | |
|     threadBlock.z = 1;
 | |
| #endif
 | |
| }
 | |
| 
 | |
| void Settings::setBlockDim(const size_t x, const size_t y) {
 | |
| #ifdef CUDACC
 | |
|     CT_ERROR_IF(x * y, >, DeviceProperties.maxThreadsDim[0], "Total block size too large.");
 | |
|     CT_ERROR_IF(x, >, DeviceProperties.maxThreadsDim[0], "Block dimension 'x' too large.");
 | |
|     CT_ERROR_IF(y, >, DeviceProperties.maxThreadsDim[1], "Block dimension 'y' too large.");
 | |
|     threadBlock.x = x;
 | |
|     threadBlock.y = y;
 | |
|     threadBlock.z = 1;
 | |
| #endif
 | |
| }
 | |
| 
 | |
| void Settings::setBlockDim(const size_t x, const size_t y, const size_t z) {
 | |
| #ifdef CUDACC
 | |
|     CT_ERROR_IF(x * y * z, >, DeviceProperties.maxThreadsDim[0], "Total block size too large.");
 | |
|     CT_ERROR_IF(x, >, DeviceProperties.maxThreadsDim[0], "Block dimension 'x' too large.");
 | |
|     CT_ERROR_IF(y, >, DeviceProperties.maxThreadsDim[1], "Block dimension 'y' too large.");
 | |
|     CT_ERROR_IF(z, >, DeviceProperties.maxThreadsDim[2], "Block dimension 'z' too large.");
 | |
|     threadBlock.x = x;
 | |
|     threadBlock.y = y;
 | |
|     threadBlock.z = z;
 | |
| #endif
 | |
| }
 | |
| 
 | |
| void Settings::setSharedMemSize(const size_t bytes) {
 | |
| #ifdef CUDACC
 | |
|     sharedMemoryBytes = bytes;
 | |
| #endif
 | |
| }
 | |
| 
 | |
| void Settings::setStream(const StreamID& stream_) {
 | |
| #ifdef CUDACC
 | |
|     stream = stream_;
 | |
| #endif
 | |
| }
 | |
| 
 | |
| Settings basic(const size_t threads, const StreamID& stream) {
 | |
|     Settings sett;
 | |
| #ifdef CUDACC
 | |
|     auto max_threads = DeviceProperties.maxThreadsPerBlock;
 | |
|     size_t grid_blocks = (threads + max_threads - 1) / max_threads;   // ceil(threads / max_threads)
 | |
|     size_t block_threads = (threads + grid_blocks - 1) / grid_blocks; // ceil(threads / grid_blocks)
 | |
|     sett.setGridDim(grid_blocks);
 | |
|     sett.setBlockDim(block_threads);
 | |
|     sett.setStream(stream);
 | |
| #else
 | |
|     sett.threads = threads;
 | |
| #endif
 | |
|     return sett;
 | |
| }
 | |
| 
 | |
| }; // namespace Kernel
 | |
| 
 | |
| /////////////////////
 | |
| // Shape Functions //
 | |
| /////////////////////
 | |
| 
 | |
| HD Shape::Shape(const std::initializer_list<uint32_t> dims) : mAxes(dims.size()), mItems(1) {
 | |
|     CT_ERROR_IF(dims.size(), >, CUDATOOLS_ARRAY_MAX_AXES, "Number of axes exceeds max axes");
 | |
|     mAxes = dims.size();
 | |
|     if (mAxes == 0) return;
 | |
| 
 | |
|     auto it = dims.end() - 1;
 | |
|     mItems = 1;
 | |
|     for (uint32_t iAxis = mAxes - 1; iAxis < mAxes; --iAxis) {
 | |
|         uint32_t dim = *it;
 | |
|         CT_ERROR_IF(dim, ==, 0, "Axis dimension cannot be 0");
 | |
| 
 | |
|         mAxisDim[iAxis] = dim;
 | |
|         mStride[iAxis] = mItems;
 | |
|         mItems *= dim;
 | |
|         --it;
 | |
|     }
 | |
| 
 | |
|     if (mAxes == 1) return;
 | |
|     // Swap last two, for column major storage.
 | |
|     mStride[mAxes - 2] = 1;
 | |
|     mStride[mAxes - 1] = mAxisDim[mAxes - 2];
 | |
| }
 | |
| 
 | |
| HD uint32_t Shape::axes() const { return mAxes; };
 | |
| HD uint32_t Shape::items() const { return mItems; };
 | |
| HD uint32_t Shape::length() const { return mAxisDim[mAxes - 1]; }
 | |
| 
 | |
| HD uint32_t Shape::rows() const { return mAxisDim[mAxes - 2]; }
 | |
| 
 | |
| HD uint32_t Shape::cols() const { return mAxisDim[mAxes - 1]; }
 | |
| 
 | |
| HD uint32_t Shape::dim(const uint32_t axis) const { return mAxisDim[axis]; }
 | |
| HD uint32_t Shape::stride(const uint32_t axis) const { return mStride[axis]; }
 | |
| 
 | |
| HD bool Shape::operator==(const Shape& s) const {
 | |
|     if (mAxes != s.mAxes) {
 | |
|         return false;
 | |
|     }
 | |
|     for (uint32_t iAxis = 0; iAxis < mAxes; ++iAxis) {
 | |
|         if (mAxisDim[iAxis] != s.mAxisDim[iAxis]) {
 | |
|             return false;
 | |
|         }
 | |
|     }
 | |
|     return true;
 | |
| }
 | |
| 
 | |
| HD bool Shape::operator!=(const Shape& s) const { return not(*this == s); }
 | |
| 
 | |
| HD Shape Shape::subshape(const uint32_t axis) const {
 | |
|     CT_ERROR_IF(axis, >, mAxes, "Axis number exceeds number of axes.");
 | |
|     if (axis == mAxes) return Shape({1});
 | |
| 
 | |
|     Shape new_shape({});
 | |
|     new_shape.mAxes = mAxes - axis;
 | |
|     new_shape.mItems = mItems;
 | |
| 
 | |
|     for (uint32_t iAxis = 0; iAxis < axis; iAxis++) {
 | |
|         new_shape.mItems /= mAxisDim[iAxis];
 | |
|     }
 | |
|     for (uint32_t iAxis = axis; iAxis < mAxes; iAxis++) {
 | |
|         new_shape.mAxisDim[iAxis - axis] = mAxisDim[iAxis];
 | |
|         new_shape.mStride[iAxis - axis] = mStride[iAxis];
 | |
|     }
 | |
|     return new_shape;
 | |
| }
 | |
| 
 | |
| std::ostream& operator<<(std::ostream& out, const Shape& s) {
 | |
|     out << "(";
 | |
|     if (s.axes() == 0) return out << ")";
 | |
|     for (uint32_t iAxis = 0; iAxis < s.axes() - 1; ++iAxis) {
 | |
|         out << s.dim(iAxis) << ", ";
 | |
|     }
 | |
|     return out << s.dim(s.axes() - 1) << ")";
 | |
| }
 | |
| 
 | |
| ///////////////////
 | |
| // Event Methods //
 | |
| ///////////////////
 | |
| 
 | |
| Event::Event() {
 | |
| #ifdef CUDACC
 | |
|     CUDA_CHECK(cudaEventCreate(&mEvent));
 | |
| #endif
 | |
| }
 | |
| 
 | |
| Event::~Event() {
 | |
| #ifdef CUDACC
 | |
|     cudaEventDestroy(mEvent);
 | |
| #endif
 | |
| }
 | |
| 
 | |
| void Event::record(const StreamID& stream) {
 | |
| #ifdef CUDACC
 | |
|     CUDA_CHECK(cudaEventRecord(mEvent, Manager::get()->stream(stream)));
 | |
| #endif
 | |
| }
 | |
| 
 | |
| //////////////////////////
 | |
| // GraphManager Methods //
 | |
| //////////////////////////
 | |
| 
 | |
| GraphManager::~GraphManager() {
 | |
| #ifdef CUDACC
 | |
|     for (Event* event : mEvents) {
 | |
|         delete event;
 | |
|     }
 | |
| #endif
 | |
| }
 | |
| 
 | |
| void GraphManager::makeBranch(const StreamID& orig_stream, const StreamID& branch_stream) {
 | |
|     Event* event = new Event();
 | |
|     event->record(orig_stream);
 | |
|     mEvents.push_back(event);
 | |
|     branch_stream.wait(*event);
 | |
| }
 | |
| 
 | |
| void GraphManager::joinBranch(const StreamID& orig_stream, const StreamID& branch_stream) {
 | |
|     Event* event = new Event();
 | |
|     event->record(branch_stream);
 | |
|     mEvents.push_back(event);
 | |
|     orig_stream.wait(*event);
 | |
| }
 | |
| 
 | |
| };     // namespace CudaTools
 | |
| #endif
 | |
| #endif // CUDATOOLS_IMPLEMENTATION
 | |
| 
 |