// A library and framework for developing CPU-CUDA compatible applications under one unified code.
#ifndef CUDATOOLS_H
#define CUDATOOLS_H
#include "Macros.h"
#include <functional>
#include <iostream>
#include <string>
#include <tuple>
#include <unordered_map>
#include <vector>
namespace CudaTools {
struct Event;
/**
 * Simple wrapper for the name of a stream. Its purpose is to allow
 * 'streams' to be passed in host code, and to provide simple syntax
 * for waiting.
 */
struct StreamID {
  public:
    std::string mId; // Name of the stream; used as the key into Manager's stream map.

    StreamID() : mId(""){};
    /**
     * The constructor for a StreamID.
     * \param id_ the name of the stream.
     */
    StreamID(const std::string& id_) : mId(id_){};
    StreamID(const char* id_) : mId(id_){};

    void wait() const; /**< Makes host wait for this stream. */
    /**
     * Makes this stream wait for the given event. Does not block the host.
     */
    void wait(const Event& event) const;
};
// Default streams, created by the global Manager instance at startup.
static const StreamID DEF_MEM_STREAM = StreamID{"defaultMemory"};    /**< Default stream for memory operations. */
static const StreamID DEF_CUBLAS_STREAM = StreamID{"defaultCublas"}; /**< Default stream for cuBLAS calls. */
static const StreamID DEF_KERNEL_STREAM = StreamID{"defaultKernel"}; /**< Default stream for kernel launches. */
/**
 * Allocates memory on the device.
 * \param size the number of bytes to allocate.
 * \return a device pointer (nullptr in CPU-only builds).
 */
void* malloc(const size_t size);
/**
 * Frees memory on the device. Safe to call with nullptr.
 */
void free(void* const pDevice);
/**
 * Copies memory from the source pointer to the dest pointer, asynchronously
 * on the given stream.
 * \return the stream used for the copy.
 */
StreamID copy(void* const source, void* const dest, const size_t size,
              const StreamID& stream = DEF_MEM_STREAM);
/**
 * Initializes or sets device memory to a value (byte-wise), asynchronously
 * on the given stream.
 * \return the stream used.
 */
StreamID memset(void* const pDevice, int value, const size_t size,
                const StreamID& stream = DEF_MEM_STREAM);
/**
 * Pins (page-locks) memory on the host, enabling asynchronous transfers.
 */
void pin(void* const pHost, const size_t size);

#ifdef CUDACC
cudaDeviceProp getDeviceProp(); /**< Queries the properties of device 0. */
// Cached device properties, used to validate kernel launch dimensions.
static cudaDeviceProp DeviceProperties = getDeviceProp();
const char* cublasGetErrorString(cublasStatus_t status); /**< String for a cuBLAS status code. */
#endif
/**
 * A class that manages various CUDA Runtime components, such as
 * streams, events, and handles. A single global instance is created during
 * static initialization; access it through Manager::get().
 */
class Manager {
  private:
    static Manager mManagerInstance;                // the global singleton
    Manager(const std::vector<std::string>& names); // creates one stream per name
    ~Manager();
#ifdef CUDACC
    std::unordered_map<std::string, cudaStream_t> mStreams; // streams keyed by name
    cublasHandle_t mCublas;                                 // shared cuBLAS handle
    // cusparseHandle_t mCusparse;
#endif
  public:
    /**
     * Used to get the global CudaTools::Manager instance.
     */
    static Manager* get() { return &mManagerInstance; };

    void waitFor(const StreamID& stream) const; /**< Waits for the stream provided. */
    void sync() const;                          /**< Waits until all device code has finished. */
    void addStream(const std::string& name);    /**< Creates a stream with the given name. */
#ifdef CUDACC
    cudaStream_t stream(const StreamID& stream) const; /**< Gets the stream with the given name. */
    cublasHandle_t cublasHandle() const;               /**< Gets the shared cuBLAS handle. */
    // cusparseHandle_t cusparseHandle() const;
#endif
};
namespace Kernel {
/**
 * A struct that contains the kernel launch parameters.
 */
struct Settings {
  public:
#ifdef CUDACC
    dim3 blockGrid;               // grid dimensions, in blocks
    dim3 threadBlock;             // block dimensions, in threads
    size_t sharedMemoryBytes = 0; // dynamic shared memory per block (third launch parameter)
#else
    size_t threads; // total thread count for the CPU fallback
#endif
    StreamID stream; // stream the kernel will be launched on

    Settings() = default;

    void setGridDim(const size_t x);                 /**< Sets the Grid dimensions. */
    void setGridDim(const size_t x, const size_t y); /**< Sets the Grid dimensions. */
    void setGridDim(const size_t x, const size_t y,
                    const size_t z);                  /**< Sets the Grid dimensions. */
    void setBlockDim(const size_t x);                 /**< Sets the Thread Block dimensions. */
    void setBlockDim(const size_t x, const size_t y); /**< Sets the Thread Block dimensions. */
    void setBlockDim(const size_t x, const size_t y,
                     const size_t z); /**< Sets the Thread Block dimensions. */
    void setSharedMemSize(const size_t bytes); /**< Sets the dynamic shared memory size. */
    void setStream(const StreamID& stream);    /**< Sets the stream. */
};
/**
 * Returns kernel launch parameters based on the number of threads, and optionally
 * a stream. Should only be used for 'embarrassingly parallel' situations, or where
 * each thread corresponds to some sort of index.
 */
Settings basic(const size_t threads, const StreamID& stream = DEF_KERNEL_STREAM);

/**
 * Launches a kernel with the provided function, settings and its arguments.
 * In CPU-only builds the function is simply called on the host instead.
 * \return the stream the kernel was launched on (so callers can wait on it).
 * NOTE(review): the launch is not followed by a cudaGetLastError() check, so
 * bad launch configurations surface only at a later synchronizing call.
 */
template <typename F, typename... Args>
StreamID launch(F func, const Kernel::Settings& sett, Args... args) {
#ifdef CUDACC
    func<<<sett.blockGrid, sett.threadBlock, sett.sharedMemoryBytes,
           Manager::get()->stream(sett.stream.mId)>>>(args...);
#else
    func(args...);
#endif
    return sett.stream;
}
}; // namespace Kernel
template <typename T> class Array; // forward declaration; Array is a friend of Shape

/**
 * A class that holds information about an Array.
 */
class Shape {
  private:
    template <typename T> friend class Array;
    uint32_t mAxes;  // number of axes
    uint32_t mItems; // total number of items across all axes
    uint32_t mAxisDim[CUDATOOLS_ARRAY_MAX_AXES] = {0}; // dimension of each axis
    uint32_t mStride[CUDATOOLS_ARRAY_MAX_AXES] = {0};  // stride of each axis, in items

  public:
    HD Shape() : mAxes(0), mItems(1){};
    /**
     * The constructor for a Shape.
     * \param dims an initializer list of the dimensions.
     */
    HD Shape(const std::initializer_list<uint32_t> dims);

    HD uint32_t axes() const;   /**< Gets the number of axes. */
    HD uint32_t items() const;  /**< Gets the total number of items. */
    HD uint32_t length() const; /**< For 1D shapes, gets the length. In general, gets the
                                     dimension of the last axis. */
    HD uint32_t rows() const; /**< For 2D shapes, gets the number of rows. In general, gets the
                                   dimension of the second to last axis. */
    HD uint32_t cols() const; /**< For 2D shapes, gets the number of columns. In general, gets
                                   the dimension of the last axis. */
    HD uint32_t
    dim(const uint32_t axis) const; /**< Gets the dimension size of the specified axis. */
    HD uint32_t stride(const uint32_t axis) const; /**< Gets the stride of the specified axis. */
    /**
     * Gets the shape at a specific axis of this shape.
     * \param axis the axis of where the new shape starts.
     */
    HD Shape subshape(const uint32_t axis) const;

    HD bool operator==(const Shape& s) const; /**< Equals operator (compares axes and dims). */
    HD bool operator!=(const Shape& s) const; /**< Not equals operator. */
};

std::ostream& operator<<(std::ostream& out, const Shape& s); /**< Prints a Shape, e.g. "(2, 3)". */
/**
 * A simple class that manages a CUDA Event.
 */
struct Event {
#ifdef CUDACC
    cudaEvent_t mEvent; // the underlying CUDA event
#endif
    Event();  /**< Creates the CUDA event (no-op in CPU-only builds). */
    ~Event(); /**< Destroys the CUDA event. */
    void record(const StreamID& stream); /**< Records a event from a stream. */
};
/**
 * Bundles a callable with a copy of its arguments so the pair can be invoked
 * later through a type-erased (void*) entry point, e.g. as a host callback.
 */
template <typename F, typename... Args> struct FuncHolder {
    F mFunc;                   // the callable to invoke
    std::tuple<Args...> mArgs; // the captured arguments

    FuncHolder() = delete; // a holder without a function is meaningless

    /** Stores the callable and copies each argument for later invocation. */
    FuncHolder(F func, Args... args) : mFunc(func), mArgs(args...){};

    /**
     * Type-erased trampoline: reinterprets data as a FuncHolder of this
     * instantiation and invokes the stored callable with the stored arguments.
     */
    static void run(void* data) {
        auto* self = static_cast<FuncHolder<F, Args...>*>(data);
        std::apply(self->mFunc, self->mArgs);
    };
};
/**
 * A class that manages CUDA Graphs. On construction the provided function is
 * run once under stream capture to build and instantiate the graph; in
 * CPU-only builds nothing is captured and execute() simply calls the function.
 */
template <typename F, typename... Args> class Graph {
  private:
#ifdef CUDACC
    cudaGraph_t mGraph;        // the captured graph
    cudaGraphExec_t mInstance; // the instantiated, executable graph
#endif
    FuncHolder<F, Args...> mFuncHolder; // the function and arguments to capture
    StreamID mStream;                   // origin stream used for capture and launch

  public:
    Graph() = delete;
    /**
     * The constructor for a Graph, which captures the function.
     * \param func the function to capture.
     * \param stream the origin stream to use.
     * \param args the arguments of the function.
     */
    Graph(const StreamID& stream, F func, Args... args)
        : mFuncHolder(func, args...), mStream(stream) {
#ifdef CUDACC
        // Run the function once between begin/end capture; the work it enqueues
        // on the stream is recorded into mGraph rather than executed.
        CUDA_CHECK(
            cudaStreamBeginCapture(Manager::get()->stream(mStream), cudaStreamCaptureModeGlobal));
        mFuncHolder.run((void*)&mFuncHolder);
        CUDA_CHECK(cudaStreamEndCapture(Manager::get()->stream(mStream), &mGraph));
        CUDA_CHECK(cudaGraphInstantiate(&mInstance, mGraph, NULL, NULL, 0));
#endif
    };
    ~Graph() {
#ifdef CUDACC
        // NOTE(review): destroy results are unchecked.
        cudaGraphDestroy(mGraph);
        cudaGraphExecDestroy(mInstance);
#endif
    };
    /**
     * Executes the instantiated graph, or simply runs the function with provided
     * arguments if compiling for CPU.
     * \return the stream the graph was launched on.
     * NOTE(review): the cudaGraphLaunch result is not checked.
     */
    StreamID execute() const {
#ifdef CUDACC
        cudaGraphLaunch(mInstance, Manager::get()->stream(mStream));
#else
        mFuncHolder.run((void*)&mFuncHolder);
#endif
        return mStream;
    }
};
/**
 * A struct to facilitate other CUDA Graphs functionality like creating branches
 * and host callbacks within a stream-captured region.
 */
struct GraphManager {
    std::vector<void*> mHostData; // type-erased FuncHolders allocated by launchHostFunction
    std::vector<Event*> mEvents;  // events allocated by makeBranch/joinBranch

    ~GraphManager();

    /**
     * Within a function that is being stream captured, launch a host function that can
     * be captured into the graph. In CPU-only builds the function is called directly.
     */
    template <typename F, typename... Args>
    void launchHostFunction(const StreamID& stream, F func, Args&&... args) {
#ifdef CUDACC
        // Heap-allocate the holder so it outlives this call; the pointer is
        // retained in mHostData.
        FuncHolder<F, Args...>* fh = new FuncHolder<F, Args...>(func, args...);
        mHostData.push_back((void*)fh);
        cudaHostFn_t run_func = fh->run;
        CUDA_CHECK(cudaLaunchHostFunc(Manager::get()->stream(stream), run_func, fh));
#else
        func(args...);
#endif
    }

    /**
     * Makes a new branch in the graph to be run in parallel by a new stream.
     * \param orig_stream the original stream to branch from.
     * \param branch_stream the stream of the new branch.
     */
    void makeBranch(const StreamID& orig_stream, const StreamID& branch_stream);

    /**
     * Joins an existing branch in the graph to collapse a parallel block.
     * \param orig_stream the original stream to join the branch to.
     * \param branch_stream the stream of the branch to join.
     */
    void joinBranch(const StreamID& orig_stream, const StreamID& branch_stream);
};
}; // namespace CudaTools
#endif // CUDATOOLS_H
#ifdef CUDATOOLS_IMPLEMENTATION
#ifndef __CUDATOOLS_IMPLEMENTED__
#define __CUDATOOLS_IMPLEMENTED__
namespace CudaTools {
//////////////////////
// StreamID Methods //
//////////////////////

/** Blocks the host until all work queued on this stream has completed. */
void StreamID::wait() const { Manager::get()->waitFor(mId); }

/**
 * Makes this stream wait for the given event without blocking the host.
 * No-op in CPU-only builds.
 */
void StreamID::wait(const Event& event) const {
#ifdef CUDACC
    CUDA_CHECK(cudaStreamWaitEvent(Manager::get()->stream(mId), event.mEvent, 0));
#endif
}
////////////////////
// Memory Methods //
////////////////////

/**
 * Allocates memory on the device. In CPU-only builds this is a no-op that
 * returns nullptr.
 */
void* malloc(const size_t size) {
#ifdef CUDACC
    void* pDevice;
    CUDA_CHECK(cudaMalloc(&pDevice, size));
    return pDevice;
#else
    return nullptr;
#endif
}
/**
 * Frees memory on the device. Safe to call with nullptr; no-op in
 * CPU-only builds.
 */
void free(void* const pDevice) {
#ifdef CUDACC
    if (pDevice != nullptr) CUDA_CHECK(cudaFree(pDevice));
#endif
}
/**
 * Copies memory from the source pointer to the dest pointer, asynchronously
 * on the given stream (cudaMemcpyDefault infers the direction).
 * NOTE(review): in CPU-only builds this is a no-op — presumably 'device'
 * pointers alias host memory in that mode; confirm against Array/Macros.
 * \return the stream used for the copy.
 */
StreamID copy(void* const source, void* const dest, const size_t size, const StreamID& stream) {
#ifdef CUDACC
    CUDA_CHECK(
        cudaMemcpyAsync(dest, source, size, cudaMemcpyDefault, Manager::get()->stream(stream)));
#endif
    return stream;
}
/**
 * Initializes or sets device memory to a value (byte-wise, like std::memset),
 * asynchronously on the given stream. No-op in CPU-only builds.
 * \return the stream used.
 */
StreamID memset(void* const pDevice, const int value, const size_t size, const StreamID& stream) {
#ifdef CUDACC
    CUDA_CHECK(cudaMemsetAsync(pDevice, value, size, Manager::get()->stream(stream)));
#endif
    return stream;
}
/**
 * Pins (page-locks) host memory via cudaHostRegister so asynchronous
 * transfers can use it. No-op in CPU-only builds.
 */
void pin(void* const pHost, const size_t size) {
#ifdef CUDACC
    CUDA_CHECK(cudaHostRegister(pHost, size, cudaHostRegisterDefault));
#endif
}
#ifdef CUDACC
/**
 * Queries the properties of device 0. Also selects device 0 as the active
 * device as a side effect.
 * NOTE(review): the cudaSetDevice/cudaGetDeviceProperties results are unchecked.
 */
cudaDeviceProp getDeviceProp() {
    cudaSetDevice(0);
    cudaDeviceProp deviceProp;
    cudaGetDeviceProperties(&deviceProp, 0);
    return deviceProp;
}
#endif
/////////////////////
// Manager Methods //
/////////////////////

/**
 * Creates one CUDA stream per provided name and the shared cuBLAS handle.
 * Does nothing in CPU-only builds.
 */
Manager::Manager(const std::vector<std::string>& names) {
#ifdef CUDACC
    for (auto name : names) {
        addStream(name);
    }
    CUBLAS_CHECK(cublasCreate(&mCublas));
    // CUSPARSE_CHECK(cusparseCreate(&mCusparse));
#endif
}
/**
 * Destroys all created streams and the cuBLAS handle. Destroy results are
 * unchecked; this runs at static teardown for the global instance.
 */
Manager::~Manager() {
#ifdef CUDACC
    for (auto& it : mStreams) {
        cudaStreamDestroy(it.second);
    }
    cublasDestroy(mCublas);
    // CUSPARSE_CHECK(cusparseDestroy(mCusparse));
#endif
}
/**
 * Blocks the host until all work on the named stream completes; errors out
 * if the stream name is unknown. No-op in CPU-only builds.
 */
void Manager::waitFor(const StreamID& stream) const {
#ifdef CUDACC
    auto it = mStreams.find(stream.mId);
    if (it != mStreams.end()) {
        CUDA_CHECK(cudaStreamSynchronize(it->second));
    } else {
        CT_ERROR(true, ("Invalid stream " + stream.mId).c_str());
    }
#endif
}
/** Blocks the host until all device work (on every stream) has finished. */
void Manager::sync() const {
#ifdef CUDACC
    CUDA_CHECK(cudaDeviceSynchronize());
#endif
}
/**
 * Creates a stream with the given name. No-op in CPU-only builds.
 * NOTE(review): if the name already exists, the previous cudaStream_t handle
 * is overwritten without being destroyed — confirm names are unique.
 */
void Manager::addStream(const std::string& name) {
#ifdef CUDACC
    cudaStream_t s;
    CUDA_CHECK(cudaStreamCreate(&s));
    mStreams[name] = s;
#endif
}
#ifdef CUDACC
/**
 * Gets the cudaStream_t associated with the given StreamID; errors out if
 * the name is unknown.
 * NOTE(review): if CT_ERROR does not terminate, control falls off the end of
 * a non-void function — confirm CT_ERROR aborts/throws.
 */
cudaStream_t Manager::stream(const StreamID& stream) const {
    auto it = mStreams.find(stream.mId);
    if (it != mStreams.end()) {
        return it->second;
    } else {
        CT_ERROR(true, ("Invalid stream " + stream.mId).c_str());
    }
}

cublasHandle_t Manager::cublasHandle() const { return mCublas; };
// cusparseHandle_t Manager::cusparseHandle() const { return mCusparse; };

// The global Manager, created with the default streams before main() runs.
Manager Manager::mManagerInstance = Manager({"defaultMemory", "defaultCublas", "defaultKernel"});
#else
Manager Manager::mManagerInstance = Manager({""});
#endif
////////////////////
// Kernel Methods //
////////////////////
namespace Kernel {
/**
 * Sets a 1D grid, validating the size against the device's maximum grid
 * dimension. No-op in CPU-only builds.
 */
void Settings::setGridDim(const size_t x) {
#ifdef CUDACC
    CT_ERROR_IF(x, >, DeviceProperties.maxGridSize[0], "Total grid size too large.");
    blockGrid.x = x;
    blockGrid.y = 1; // collapse the unused dimensions
    blockGrid.z = 1;
#endif
}
/**
 * Sets a 2D grid, validating each dimension and the total size.
 * NOTE(review): the total is checked against maxGridSize[0] — confirm that is
 * the intended limit for the product of dimensions.
 */
void Settings::setGridDim(const size_t x, const size_t y) {
#ifdef CUDACC
    CT_ERROR_IF(x * y, >, DeviceProperties.maxGridSize[0], "Total grid size too large.");
    CT_ERROR_IF(x, >, DeviceProperties.maxGridSize[0], "Grid dimension 'x' too large.");
    CT_ERROR_IF(y, >, DeviceProperties.maxGridSize[1], "Grid dimension 'y' too large.");
    blockGrid.x = x;
    blockGrid.y = y;
    blockGrid.z = 1;
#endif
}
/**
 * Sets a 3D grid, validating each dimension and the total size.
 * NOTE(review): the total is checked against maxGridSize[0] — confirm that is
 * the intended limit for the product of dimensions.
 */
void Settings::setGridDim(const size_t x, const size_t y, const size_t z) {
#ifdef CUDACC
    CT_ERROR_IF(x * y * z, >, DeviceProperties.maxGridSize[0], "Total grid size too large.");
    CT_ERROR_IF(x, >, DeviceProperties.maxGridSize[0], "Grid dimension 'x' too large.");
    CT_ERROR_IF(y, >, DeviceProperties.maxGridSize[1], "Grid dimension 'y' too large.");
    CT_ERROR_IF(z, >, DeviceProperties.maxGridSize[2], "Grid dimension 'z' too large.");
    blockGrid.x = x;
    blockGrid.y = y;
    blockGrid.z = z;
#endif
}
/**
 * Sets a 1D thread block, validating the size. No-op in CPU-only builds.
 * NOTE(review): validated against maxThreadsDim[0] rather than
 * maxThreadsPerBlock — on most devices both are 1024, but confirm.
 */
void Settings::setBlockDim(const size_t x) {
#ifdef CUDACC
    CT_ERROR_IF(x, >, DeviceProperties.maxThreadsDim[0], "Total block size too large.");
    threadBlock.x = x;
    threadBlock.y = 1; // collapse the unused dimensions
    threadBlock.z = 1;
#endif
}
/**
 * Sets a 2D thread block, validating each dimension and the total size.
 * NOTE(review): the total is validated against maxThreadsDim[0] rather than
 * maxThreadsPerBlock — on most devices both are 1024, but confirm.
 */
void Settings::setBlockDim(const size_t x, const size_t y) {
#ifdef CUDACC
    CT_ERROR_IF(x * y, >, DeviceProperties.maxThreadsDim[0], "Total block size too large.");
    CT_ERROR_IF(x, >, DeviceProperties.maxThreadsDim[0], "Block dimension 'x' too large.");
    CT_ERROR_IF(y, >, DeviceProperties.maxThreadsDim[1], "Block dimension 'y' too large.");
    threadBlock.x = x;
    threadBlock.y = y;
    threadBlock.z = 1;
#endif
}
/**
 * Sets a 3D thread block, validating each dimension and the total size.
 * NOTE(review): the total is validated against maxThreadsDim[0] rather than
 * maxThreadsPerBlock — on most devices both are 1024, but confirm.
 */
void Settings::setBlockDim(const size_t x, const size_t y, const size_t z) {
#ifdef CUDACC
    CT_ERROR_IF(x * y * z, >, DeviceProperties.maxThreadsDim[0], "Total block size too large.");
    CT_ERROR_IF(x, >, DeviceProperties.maxThreadsDim[0], "Block dimension 'x' too large.");
    CT_ERROR_IF(y, >, DeviceProperties.maxThreadsDim[1], "Block dimension 'y' too large.");
    CT_ERROR_IF(z, >, DeviceProperties.maxThreadsDim[2], "Block dimension 'z' too large.");
    threadBlock.x = x;
    threadBlock.y = y;
    threadBlock.z = z;
#endif
}
/**
 * Sets the dynamic shared memory size passed at kernel launch. No-op in
 * CPU-only builds (the member only exists under CUDACC).
 */
void Settings::setSharedMemSize(const size_t bytes) {
#ifdef CUDACC
    sharedMemoryBytes = bytes;
#endif
}
void Settings::setStream(const StreamID& stream_) {
#ifdef CUDACC
stream = stream_;
#endif
}
/**
 * Returns kernel launch parameters based on the number of threads, and
 * optionally a stream. Uses the minimum number of blocks and distributes
 * threads as evenly as possible across them.
 * \param threads total number of threads required; must be nonzero
 *        (previously threads == 0 caused a division by zero below).
 * \param stream the stream the kernel will launch on.
 */
Settings basic(const size_t threads, const StreamID& stream) {
    Settings sett;
#ifdef CUDACC
    CT_ERROR_IF(threads, ==, 0, "Thread count must be nonzero.");
    auto max_threads = DeviceProperties.maxThreadsPerBlock;
    size_t grid_blocks = (threads + max_threads - 1) / max_threads;   // ceil(threads / max_threads)
    size_t block_threads = (threads + grid_blocks - 1) / grid_blocks; // ceil(threads / grid_blocks)
    sett.setGridDim(grid_blocks);
    sett.setBlockDim(block_threads);
    sett.setStream(stream);
#else
    sett.threads = threads;
#endif
    return sett;
}
}; // namespace Kernel
/////////////////////
// Shape Functions //
/////////////////////

/**
 * Constructs a Shape from dimensions listed slowest-varying axis first.
 * Strides are computed right-to-left, then the strides of the last two axes
 * are swapped so 2D data is stored column major.
 * \param dims an initializer list of the dimensions; no entry may be zero.
 */
HD Shape::Shape(const std::initializer_list<uint32_t> dims) : mAxes(dims.size()), mItems(1) {
    CT_ERROR_IF(dims.size(), >, CUDATOOLS_ARRAY_MAX_AXES, "Number of axes exceeds max axes");
    // (mAxes and mItems are set in the initializer list; the previous
    // redundant re-assignments here were removed.)
    if (mAxes == 0) return;
    auto it = dims.end() - 1;
    // Walk the dimensions from last to first; iAxis is unsigned, so the loop
    // terminates when the decrement wraps around past zero.
    for (uint32_t iAxis = mAxes - 1; iAxis < mAxes; --iAxis) {
        uint32_t dim = *it;
        CT_ERROR_IF(dim, ==, 0, "Axis dimension cannot be 0");
        mAxisDim[iAxis] = dim;
        mStride[iAxis] = mItems; // stride = product of faster-varying dims
        mItems *= dim;
        --it;
    }
    if (mAxes == 1) return;
    // Swap last two, for column major storage.
    mStride[mAxes - 2] = 1;
    mStride[mAxes - 1] = mAxisDim[mAxes - 2];
}
HD uint32_t Shape::axes() const { return mAxes; };   /**< Gets the number of axes. */
HD uint32_t Shape::items() const { return mItems; }; /**< Gets the total number of items. */
// NOTE(review): length/rows/cols assume mAxes >= 1 (rows: >= 2); for smaller
// shapes the unsigned index wraps and reads out of range — confirm callers guard.
HD uint32_t Shape::length() const { return mAxisDim[mAxes - 1]; } /**< Dimension of the last axis. */
HD uint32_t Shape::rows() const { return mAxisDim[mAxes - 2]; }   /**< Dimension of the 2nd-to-last axis. */
HD uint32_t Shape::cols() const { return mAxisDim[mAxes - 1]; }   /**< Dimension of the last axis. */
HD uint32_t Shape::dim(const uint32_t axis) const { return mAxisDim[axis]; } /**< Dimension of an axis. */
HD uint32_t Shape::stride(const uint32_t axis) const { return mStride[axis]; } /**< Stride of an axis. */
/**
 * Equality: two Shapes match when they have the same number of axes and the
 * same dimension along every axis. Strides are not compared.
 */
HD bool Shape::operator==(const Shape& s) const {
    if (mAxes != s.mAxes) return false;
    for (uint32_t iAxis = 0; iAxis < mAxes; ++iAxis)
        if (mAxisDim[iAxis] != s.mAxisDim[iAxis]) return false;
    return true;
}

/** Inequality: the negation of operator==. */
HD bool Shape::operator!=(const Shape& s) const { return !(*this == s); }
/**
 * Gets the shape at a specific axis of this shape.
 * \param axis the axis of where the new shape starts; axis == mAxes yields
 *        the scalar shape Shape({1}).
 */
HD Shape Shape::subshape(const uint32_t axis) const {
    CT_ERROR_IF(axis, >, mAxes, "Axis number exceeds number of axes.");
    if (axis == mAxes) return Shape({1});
    Shape new_shape({});
    new_shape.mAxes = mAxes - axis;
    new_shape.mItems = mItems;
    // Divide out the items contributed by the dropped leading axes.
    for (uint32_t iAxis = 0; iAxis < axis; iAxis++) {
        new_shape.mItems /= mAxisDim[iAxis];
    }
    // Copy the remaining dimensions and strides, shifted to the front.
    for (uint32_t iAxis = axis; iAxis < mAxes; iAxis++) {
        new_shape.mAxisDim[iAxis - axis] = mAxisDim[iAxis];
        new_shape.mStride[iAxis - axis] = mStride[iAxis];
    }
    return new_shape;
}
/**
 * Prints a Shape as a parenthesized, comma-separated dimension list,
 * e.g. "(2, 3)"; a 0-axis shape prints as "()".
 */
std::ostream& operator<<(std::ostream& out, const Shape& s) {
    out << "(";
    const uint32_t nAxes = s.axes();
    for (uint32_t iAxis = 0; iAxis < nAxes; ++iAxis) {
        out << s.dim(iAxis);
        if (iAxis + 1 < nAxes) out << ", ";
    }
    return out << ")";
}
///////////////////
// Event Methods //
///////////////////

/** Creates the underlying CUDA event (no-op in CPU-only builds). */
Event::Event() {
#ifdef CUDACC
    CUDA_CHECK(cudaEventCreate(&mEvent));
#endif
}
/** Destroys the underlying CUDA event; the result is unchecked. */
Event::~Event() {
#ifdef CUDACC
    cudaEventDestroy(mEvent);
#endif
}
/** Records this event on the given stream (no-op in CPU-only builds). */
void Event::record(const StreamID& stream) {
#ifdef CUDACC
    CUDA_CHECK(cudaEventRecord(mEvent, Manager::get()->stream(stream)));
#endif
}
//////////////////////////
// GraphManager Methods //
//////////////////////////

/**
 * Frees the events allocated by makeBranch/joinBranch. Those events are
 * allocated unconditionally (in both CPU and CUDA builds), so they must be
 * freed unconditionally as well — previously this loop was compiled only
 * under CUDACC, leaking every Event in CPU-only builds.
 * NOTE(review): the FuncHolder allocations tracked in mHostData are
 * type-erased to void* and are never freed here; they cannot be deleted
 * safely through a void pointer without a stored deleter — confirm the
 * intended lifetime.
 */
GraphManager::~GraphManager() {
    for (Event* event : mEvents) {
        delete event;
    }
}
void GraphManager::makeBranch(const StreamID& orig_stream, const StreamID& branch_stream) {
Event* event = new Event();
event->record(orig_stream);
mEvents.push_back(event);
branch_stream.wait(*event);
}
void GraphManager::joinBranch(const StreamID& orig_stream, const StreamID& branch_stream) {
Event* event = new Event();
event->record(branch_stream);
mEvents.push_back(event);
orig_stream.wait(*event);
}
}; // namespace CudaTools
#endif
#endif // CUDATOOLS_IMPLEMENTATION