CudaTools/Core.h

#ifndef CUDATOOLS_H
#define CUDATOOLS_H

#include "Macros.h"
#include <iostream>
#include <string>
#include <unordered_map>
#include <vector>

namespace CudaTools {

/**
 * Simple wrapper for the name of a stream. Its purposes is to allow for
 * 'streams' to be passed on host code, and allowing for simple syntax
 * for waiting.
 */
struct StreamID {
  public:
    std::string id;
    StreamID() : id(""){};
    /**
     * The constructor for a StreamID.
     */
    StreamID(const std::string& id_) : id(id_){};
    StreamID(const char* id_) : id(id_){};

    /**
     * Waits for the stream with this stream ID.
     */
    void wait() const;
};

static const StreamID DEF_MEM_STREAM = StreamID{"defaultMemory"};
static const StreamID DEF_CUBLAS_STREAM = StreamID{"defaultCublas"};
static const StreamID DEF_KERNEL_STREAM = StreamID{"defaultKernel"};

/**
 * Allocates memory on the device.
 */
void* malloc(const size_t size);

/**
 * Pins memory on the host.
 */
void pin(void* const pHost, const size_t size);

/**
 * Pushes memory from the device to the host.
 */
StreamID push(void* const pHost, void* const pDevice, const size_t size,
              const StreamID& stream = DEF_MEM_STREAM);
/**
 * Pulls memory from the device back to the host.
 */
StreamID pull(void* const pHost, void* const pDevice, const size_t size,
              const StreamID& stream = DEF_MEM_STREAM);
/**
 * Copies memory on the device to another location on the device.
 */
StreamID deviceCopy(void* const pSrc, void* const pDest, const size_t size,
                    const StreamID& stream = DEF_MEM_STREAM);

/**
 * Frees memory on the device.
 */
void free(void* const pDevice);

#ifdef CUDACC
cudaDeviceProp getDeviceProp();
static cudaDeviceProp DeviceProperties = getDeviceProp();
const char* cublasGetErrorString(cublasStatus_t status);
#endif

/**
 * A class that manages various CUDA Runtime components, such as
 * streams, events, and handles.
 */
class Manager {
  private:
    static Manager mManagerInstance;
    Manager(const std::vector<std::string>& names);
    ~Manager();
#ifdef CUDACC
    std::unordered_map<std::string, cudaStream_t> mStreams;
    cublasHandle_t mCublas;
#endif
  public:
    /**
     * Used to get the global CudaTools::Manager instance.
     */
    static Manager* get() { return &mManagerInstance; };

    void waitFor(const StreamID& stream) const; /**< Waits for the stream provided. */
    void sync() const;                          /**< Waits until all device code has finished. */
    void addStream(const std::string& name);    /**< Creates a stream with the given name. */
#ifdef CUDACC
    cudaStream_t stream(const StreamID& stream) const;
    cublasHandle_t cublasHandle() const;
#endif
};

namespace Kernel {

/**
 * A struct that contains the kernel launch parameters.
 */
struct Settings {
  public:
#ifdef CUDACC
    dim3 blockGrid;
    dim3 threadBlock;
    size_t sharedMemoryBytes = 0;
#else
    size_t threads;
#endif
    StreamID stream;

    Settings() = default;

    void setGridDim(const size_t x);                 /**< Sets the Grid dimensions. */
    void setGridDim(const size_t x, const size_t y); /**< Sets the Grid dimensions. */
    void setGridDim(const size_t x, const size_t y,
                    const size_t z);                  /**< Sets the Grid dimensions. */
    void setBlockDim(const size_t x);                 /**< Sets the Thread Block dimensions. */
    void setBlockDim(const size_t x, const size_t y); /**< Sets the Thread Block dimensions. */
    void setBlockDim(const size_t x, const size_t y,
                     const size_t z); /**< Sets the Thread Block dimensions. */

    void setSharedMemSize(const size_t bytes); /**< Sets the static shared memory size. */
    void setStream(const StreamID& stream);    /**< Sets the stream. */
};

/**
 * Returns a kernel launch parameters based on the number of threads, and optionally
 * a stream. Should only be used for 'embarassingly parallel' situations, or where
 * each thread corresponds some sort of index.
 */
Settings basic(const size_t threads, const StreamID& stream = DEF_KERNEL_STREAM);

}; // namespace Kernel

template <typename T> class Array;

/**
 * A class that holds information about an Array.
 */
class Shape {
  private:
    template <typename T> friend class Array;
    uint32_t mAxes;
    uint32_t mItems;
    uint32_t mAxisDim[CUDATOOLS_ARRAY_MAX_AXES] = {0};
    uint32_t mStride[CUDATOOLS_ARRAY_MAX_AXES] = {0};

  public:
    HD Shape() : mAxes(0), mItems(1){};
    /**
     * The constructor for a Shape.
     * \param dims an initializer list of the dimensions.
     */
    HD Shape(const std::initializer_list<uint32_t> dims);

    HD uint32_t axes() const;  /**< Gets the number of axes. */
    HD uint32_t items() const; /**< Gets the total number of items. */

    HD uint32_t length() const; /**< For 1D shapes, gets the length. In general, gets the dimension
                                   of the last axis. */
    HD uint32_t rows() const;   /**< For 2D shapes, gets the number of rows. In general, gets the
                                   dimension of the second to last axis. */
    HD uint32_t cols() const;   /**< For 2D shapes, gets the number of columns. In general, gets the
                                   dimension of the second to last axis. */

    HD uint32_t
    dim(const uint32_t axis) const; /**< Gets the dimension size of the specified axis. */
    HD uint32_t stride(const uint32_t axis) const; /**< Gets the stride of the specified axis. */

    /**
     * Gets the shape at a specific axis of this shape.
     * \param axis the axis of where the new shape starts.
     */
    HD Shape subshape(const uint32_t axis) const;

    HD bool operator==(const Shape& s) const; /**< Equals operator. */
    HD bool operator!=(const Shape& s) const; /**< Not equals operator. */
};

std::ostream& operator<<(std::ostream& out, const Shape& s);

}; // namespace CudaTools

#ifdef CUDATOOLS_IMPLEMENTATION

namespace CudaTools {

template <typename T, typename... Args>
StreamID runKernel(T func, const Kernel::Settings& sett, Args... args) {
#ifdef CUDA
    func<<<sett.blockGrid, sett.threadBlock, sett.sharedMemoryBytes,
           Manager::get()->stream(sett.stream.id)>>>(args...);
#else
    func(args...);
#endif
    return sett.stream;
}

////////////////////
// Memory Methods //
////////////////////

void StreamID::wait() const { Manager::get()->waitFor(id); }

void* malloc(const size_t size) {
#ifdef CUDACC
    void* pDevice;
    CUDA_CHECK(cudaMalloc(&pDevice, size));
    return pDevice;
#else
    return nullptr;
#endif
}

void free(void* const pDevice) {
#ifdef CUDACC
    if (pDevice != nullptr) CUDA_CHECK(cudaFree(pDevice));
#endif
}

StreamID push(void* const pHost, void* const pDevice, const size_t size, const StreamID& stream) {
#ifdef CUDACC
    CUDA_CHECK(cudaMemcpyAsync(pDevice, pHost, size, cudaMemcpyHostToDevice,
                               Manager::get()->stream(stream.id)));
#endif
    return stream;
}

StreamID pull(void* const pHost, void* const pDevice, const size_t size, const StreamID& stream) {
#ifdef CUDACC
    CUDA_CHECK(cudaMemcpyAsync(pHost, pDevice, size, cudaMemcpyDeviceToHost,
                               Manager::get()->stream(stream.id)));
#endif
    return stream;
}

StreamID deviceCopy(void* const pSrc, void* const pDest, const size_t size,
                    const StreamID& stream) {
#ifdef CUDACC
    CUDA_CHECK(cudaMemcpyAsync(pDest, pSrc, size, cudaMemcpyDeviceToDevice,
                               Manager::get()->stream(stream.id)));
#endif
    return stream;
}

void pin(void* const pHost, const size_t size) {
#ifdef CUDACC
    CUDA_CHECK(cudaHostRegister(pHost, size, cudaHostRegisterDefault));
#endif
}

#ifdef CUDACC
cudaDeviceProp getDeviceProp() {
    cudaSetDevice(0);
    cudaDeviceProp deviceProp;
    cudaGetDeviceProperties(&deviceProp, 0);
    return deviceProp;
}
#endif

/////////////////////
// Manager Methods //
/////////////////////

Manager::Manager(const std::vector<std::string>& names) {
#ifdef CUDACC
    for (auto name : names) {
        addStream(name);
    }
    CUBLAS_CHECK(cublasCreate(&mCublas));
#endif
}

Manager::~Manager() {
#ifdef CUDACC
    for (auto& it : mStreams) {
        CUDA_CHECK(cudaStreamDestroy(it.second));
    }
    CUBLAS_CHECK(cublasDestroy(mCublas));
#endif
}

void Manager::waitFor(const StreamID& stream) const {
#ifdef CUDACC
    auto it = mStreams.find(stream.id);
    if (it != mStreams.end()) {
        CUDA_CHECK(cudaStreamSynchronize(it->second));
    } else {
        CT_ERROR(true, ("Invalid stream " + stream.id).c_str());
    }
#endif
}

void Manager::sync() const {
#ifdef CUDACC
    CUDA_CHECK(cudaDeviceSynchronize());
#endif
}

void Manager::addStream(const std::string& name) {
#ifdef CUDACC
    cudaStream_t s;
    CUDA_CHECK(cudaStreamCreate(&s));
    mStreams[name] = s;
#endif
}

#ifdef CUDACC
cudaStream_t Manager::stream(const StreamID& stream) const {
    auto it = mStreams.find(stream.id);
    if (it != mStreams.end()) {
        return it->second;
    } else {
        CT_ERROR(true, ("Invalid stream " + stream.id).c_str());
    }
}

cublasHandle_t Manager::cublasHandle() const { return mCublas; };

Manager Manager::mManagerInstance = Manager({"defaultMemory", "defaultCublas", "defaultKernel"});
#else
Manager Manager::mManagerInstance = Manager({""});
#endif

////////////////////
// Kernel Methods //
////////////////////

namespace Kernel {

void Settings::setGridDim(const size_t x) {
#ifdef CUDACC
    CT_ERROR_IF(x, >, DeviceProperties.maxGridSize[0], "Total grid size too large")
    blockGrid.x = x;
    blockGrid.y = 1;
    blockGrid.z = 1;
#endif
}

void Settings::setGridDim(const size_t x, const size_t y) {
#ifdef CUDACC
    CT_ERROR_IF(x * y, >, DeviceProperties.maxGridSize[0], "Total grid size too large.");
    CT_ERROR_IF(x, >, DeviceProperties.maxGridSize[0], "Grid dimension 'x' too large.");
    CT_ERROR_IF(y, >, DeviceProperties.maxGridSize[1], "Grid dimension 'y' too large.");
    blockGrid.x = x;
    blockGrid.y = y;
    blockGrid.z = 1;
#endif
}

void Settings::setGridDim(const size_t x, const size_t y, const size_t z) {
#ifdef CUDACC
    CT_ERROR_IF(x * y * z, >, DeviceProperties.maxGridSize[0], "Total grid size too large.");
    CT_ERROR_IF(x, >, DeviceProperties.maxGridSize[0], "Grid dimension 'x' too large.");
    CT_ERROR_IF(y, >, DeviceProperties.maxGridSize[1], "Grid dimension 'y' too large.");
    CT_ERROR_IF(z, >, DeviceProperties.maxGridSize[2], "Grid dimension 'z' too large.");
    blockGrid.x = x;
    blockGrid.y = y;
    blockGrid.z = z;
#endif
}

void Settings::setBlockDim(const size_t x) {
#ifdef CUDACC
    CT_ERROR_IF(x, >, DeviceProperties.maxThreadsDim[0], "Total block size too large.");
    threadBlock.x = x;
    threadBlock.y = 1;
    threadBlock.z = 1;
#endif
}

void Settings::setBlockDim(const size_t x, const size_t y) {
#ifdef CUDACC
    CT_ERROR_IF(x * y, >, DeviceProperties.maxThreadsDim[0], "Total block size too large.");
    CT_ERROR_IF(x, >, DeviceProperties.maxThreadsDim[0], "Block dimension 'x' too large.");
    CT_ERROR_IF(y, >, DeviceProperties.maxThreadsDim[1], "Block dimension 'y' too large.");
    threadBlock.x = x;
    threadBlock.y = y;
    threadBlock.z = 1;
#endif
}

void Settings::setBlockDim(const size_t x, const size_t y, const size_t z) {
#ifdef CUDACC
    CT_ERROR_IF(x * y * z, >, DeviceProperties.maxThreadsDim[0], "Total block size too large.");
    CT_ERROR_IF(x, >, DeviceProperties.maxThreadsDim[0], "Block dimension 'x' too large.");
    CT_ERROR_IF(y, >, DeviceProperties.maxThreadsDim[1], "Block dimension 'y' too large.");
    CT_ERROR_IF(z, >, DeviceProperties.maxThreadsDim[2], "Block dimension 'z' too large.");
    threadBlock.x = x;
    threadBlock.y = y;
    threadBlock.z = z;
#endif
}

void Settings::setSharedMemSize(const size_t bytes) {
#ifdef CUDACC
    sharedMemoryBytes = bytes;
#endif
}

void Settings::setStream(const StreamID& stream_) {
#ifdef CUDACC
    stream.id = stream_.id;
#endif
}

Settings basic(const size_t threads, const StreamID& stream) {
    Settings sett;
#ifdef CUDACC
    auto max_threads = DeviceProperties.maxThreadsPerBlock;
    size_t grid_blocks = (threads + max_threads - 1) / max_threads;   // ceil(threads / max_threads)
    size_t block_threads = (threads + grid_blocks - 1) / grid_blocks; // ceil(threads / grid_blocks)
    sett.setGridDim(grid_blocks);
    sett.setBlockDim(block_threads);
    sett.setStream(stream);
#else
    sett.threads = threads;
#endif
    return sett;
}
} // namespace Kernel

/////////////////////
// Shape Functions //
/////////////////////

HD Shape::Shape(const std::initializer_list<uint32_t> dims) : mAxes(dims.size()), mItems(1) {
    CT_ERROR_IF(dims.size(), >, CUDATOOLS_ARRAY_MAX_AXES, "Number of axes exceeds max axes");
    mAxes = dims.size();
    if (mAxes == 0) return;

    auto it = dims.end() - 1;
    mItems = 1;
    for (uint32_t iAxis = mAxes - 1; iAxis < mAxes; --iAxis) {
        uint32_t dim = *it;
        CT_ERROR_IF(dim, ==, 0, "Axis dimension cannot be 0");

        mAxisDim[iAxis] = dim;
        mStride[iAxis] = mItems;
        mItems *= dim;
        --it;
    }

    if (mAxes == 1) return;
    // Swap last two, for column major storage.
    mStride[mAxes - 2] = 1;
    mStride[mAxes - 1] = mAxisDim[mAxes - 2];
}

HD uint32_t Shape::axes() const { return mAxes; };
HD uint32_t Shape::items() const { return mItems; };
HD uint32_t Shape::length() const { return mAxisDim[mAxes - 1]; }

HD uint32_t Shape::rows() const { return mAxisDim[mAxes - 2]; }

HD uint32_t Shape::cols() const { return mAxisDim[mAxes - 1]; }

HD uint32_t Shape::dim(const uint32_t axis) const { return mAxisDim[axis]; }
HD uint32_t Shape::stride(const uint32_t axis) const { return mStride[axis]; }

HD bool Shape::operator==(const Shape& s) const {
    if (mAxes != s.mAxes) {
        return false;
    }
    for (uint32_t iAxis = 0; iAxis < mAxes; ++iAxis) {
        if (mAxisDim[iAxis] != s.mAxisDim[iAxis]) {
            return false;
        }
    }
    return true;
}

HD bool Shape::operator!=(const Shape& s) const { return not(*this == s); }

HD Shape Shape::subshape(const uint32_t axis) const {
    CT_ERROR_IF(axis, >, mAxes, "Axis number exceeds number of axes.");
    if (axis == mAxes) return Shape({1});

    Shape new_shape({});
    new_shape.mAxes = mAxes - axis;
    new_shape.mItems = mItems;

    for (uint32_t iAxis = 0; iAxis < axis; iAxis++) {
        new_shape.mItems /= mAxisDim[iAxis];
    }
    for (uint32_t iAxis = axis; iAxis < mAxes; iAxis++) {
        new_shape.mAxisDim[iAxis - axis] = mAxisDim[iAxis];
        new_shape.mStride[iAxis - axis] = mStride[iAxis];
    }
    return new_shape;
}

std::ostream& operator<<(std::ostream& out, const Shape& s) {
    out << "(";
    if (s.axes() == 0) return out << ")";
    for (uint32_t iAxis = 0; iAxis < s.axes() - 1; ++iAxis) {
        out << s.dim(iAxis) << ", ";
    }
    return out << s.dim(s.axes() - 1) << ")";
}

#ifdef CUDACC
const char* cublasGetErrorString(cublasStatus_t error) {
    switch (error) {
    case CUBLAS_STATUS_SUCCESS:
        return "CUBLAS_STATUS_SUCCESS";

    case CUBLAS_STATUS_NOT_INITIALIZED:
        return "CUBLAS_STATUS_NOT_INITIALIZED";

    case CUBLAS_STATUS_ALLOC_FAILED:
        return "CUBLAS_STATUS_ALLOC_FAILED";

    case CUBLAS_STATUS_INVALID_VALUE:
        return "CUBLAS_STATUS_INVALID_VALUE";

    case CUBLAS_STATUS_ARCH_MISMATCH:
        return "CUBLAS_STATUS_ARCH_MISMATCH";

    case CUBLAS_STATUS_MAPPING_ERROR:
        return "CUBLAS_STATUS_MAPPING_ERROR";

    case CUBLAS_STATUS_EXECUTION_FAILED:
        return "CUBLAS_STATUS_EXECUTION_FAILED";

    case CUBLAS_STATUS_INTERNAL_ERROR:
        return "CUBLAS_STATUS_INTERNAL_ERROR";
    }

    return "<unknown>";
}
#endif

};     // namespace CudaTools
#endif // CUDATOOLS_IMPLEMENTATION

#endif // CUDATOOLS_H