// CudaTools/samples/5_SimpleGraph/main.cu.cpp

#define CUDATOOLS_IMPLEMENTATION
#include <Array.h>
#include <Core.h>
#include <chrono>

// Timing helpers built on std::chrono::steady_clock.
//
// TIME_START(name) records a start time point in a local variable `begin_<name>`.
#define TIME_START(name) auto begin_##name = std::chrono::steady_clock::now()

// TIME_END(name) must appear in a scope where TIME_START(name) is visible. It declares
// `end_<name>`, `time_ms_<name>` and `time_mus_<name>` and prints the elapsed time tagged with
// the stringified name: in microseconds when the millisecond count is 0, otherwise in
// milliseconds.
//
// duration::count() returns the duration's `rep`, an implementation-defined signed integer type
// (not necessarily `long`), so the values are cast to `long long` and printed with %lld to keep
// the format string and its arguments in agreement on every platform.
#define TIME_END(name)                                                                             \
    auto end_##name = std::chrono::steady_clock::now();                                            \
    auto time_ms_##name =                                                                          \
        std::chrono::duration_cast<std::chrono::milliseconds>(end_##name - begin_##name).count();  \
    auto time_mus_##name =                                                                         \
        std::chrono::duration_cast<std::chrono::microseconds>(end_##name - begin_##name).count();  \
    if (time_ms_##name == 0) {                                                                     \
        printf("[%s] Time Elapsed: %lld[µs]\n", #name,                                             \
               static_cast<long long>(time_mus_##name));                                           \
    } else {                                                                                       \
        printf("[%s] Time Elapsed: %lld[ms]\n", #name,                                             \
               static_cast<long long>(time_ms_##name));                                            \
    }

// TIME(call, name) evaluates `call` once between TIME_START(name) and TIME_END(name). It expands
// to several statements and leaves the timing variables in the enclosing scope, so each `name`
// can be used only once per scope and the macro should not be the unbraced body of an if/else.
#define TIME(call, name)                                                                           \
    TIME_START(name);                                                                              \
    call;                                                                                          \
    TIME_END(name);

/**
 * Applies one Collatz step in place to every element visited by BASIC_LOOP:
 * an odd value v becomes 3 * v + 1, an even value becomes v >> 1.
 *
 * `iThread` is not declared here; it is expected to come from the BASIC_LOOP macro.
 */
KERNEL(collatz, const CudaTools::Array<uint32_t> arr) {
    BASIC_LOOP(arr.shape().length()) {
        // Read the element once, pick the next value, write it back.
        const uint32_t current = arr[iThread];
        const bool isOdd = (current % 2) != 0;
        arr[iThread] = isOdd ? 3 * current + 1 : current >> 1;
    }
}

/**
 * Increments every element visited by BASIC_LOOP by one, in place.
 */
KERNEL(plusOne, const CudaTools::Array<uint32_t> arr) {
    BASIC_LOOP(arr.shape().length()) {
        ++arr[iThread];
    }
}

/**
 * Element-wise in-place sum: a[i] += b[i] for every index visited by BASIC_LOOP.
 * The loop bound is taken from `a` only, so `b` must have at least as many elements.
 */
KERNEL(addArray, const CudaTools::Array<uint32_t> a, const CudaTools::Array<uint32_t> b) {
    BASIC_LOOP(a.shape().length()) {
        const uint32_t addend = b[iThread];
        a[iThread] += addend;
    }
}

/**
 * Adds `num` to every element of A in place, going through the object returned by
 * A.atLeast2D().eigenMap() (presumably an Eigen map over the array's host data -- confirm
 * against Array.h).
 *
 * Called directly from doFunc and passed to launchHostFunction in myGraph.
 */
void addNum(const CudaTools::Array<uint32_t> A, uint32_t num) {
    auto hostMap = A.atLeast2D().eigenMap();
    // Coefficient-wise add performed directly on the mapped data.
    hostMap.array() += num;
}

/**
 * Baseline version of the pipeline that issues every step directly instead of going through a
 * CudaTools::Graph.
 *
 * Steps: upload A on "graphStream" and B on "graphStreamBranch"; run 30 rounds of collatz on A
 * ("graphStream") and plusOne on B ("graphStreamBranch"); add B into A with addArray on
 * "graphStream"; copy both arrays back; finally add 5 to A with addNum.
 *
 * @param A array updated by collatz, addArray and addNum
 * @param B array incremented by plusOne and then added into A
 */
void doFunc(const CudaTools::Array<uint32_t> A, const CudaTools::Array<uint32_t> B) {
    const uint32_t numRounds = 30;

    A.updateDevice("graphStream").wait();
    B.updateDevice("graphStreamBranch").wait();
    for (uint32_t iTimes = 0; iTimes < numRounds; ++iTimes) {
        CudaTools::Kernel::launch(
            collatz, CudaTools::Kernel::basic(A.shape().items(), "graphStream"), A.view());
        auto branchWork = CudaTools::Kernel::launch(
            plusOne, CudaTools::Kernel::basic(A.shape().items(), "graphStreamBranch"), B.view());

        // addArray below is queued on "graphStream" but reads B, which is written on
        // "graphStreamBranch". Nothing else orders the two streams here (myGraph uses
        // joinBranch for this), so block on the last plusOne launch before continuing;
        // otherwise addArray could read B while plusOne is still updating it.
        // NOTE(review): relies on .wait() blocking the host until the launched work is done,
        // as the other .wait() calls in this function already assume.
        if (iTimes + 1 == numRounds) {
            branchWork.wait();
        }
    }

    CudaTools::Kernel::launch(addArray, CudaTools::Kernel::basic(A.shape().items(), "graphStream"),
                              A.view(), B.view())
        .wait();
    // Both copies are queued on "graphStream"; waiting on the second one is expected to cover
    // the first as well, since they share a stream.
    A.updateHost("graphStream");
    B.updateHost("graphStream").wait();
    addNum(A.view(), 5);
}

/**
 * Issues the same pipeline as doFunc, but without any host-side waits and with the branch
 * stream tied to the main stream through the GraphManager. It is handed to the
 * CudaTools::Graph constructor in main (presumably so the calls are recorded into a graph --
 * confirm against the CudaTools documentation).
 *
 * @param gm manager used for makeBranch / joinBranch / launchHostFunction
 * @param A  array updated by collatz, addArray and addNum
 * @param B  array incremented by plusOne and then added into A
 */
void myGraph(CudaTools::GraphManager* gm, const CudaTools::Array<uint32_t> A,
             const CudaTools::Array<uint32_t> B) {
    const char* const mainStream = "graphStream";
    const char* const branchStream = "graphStreamBranch";
    const uint32_t numRounds = 30;

    // Upload A on the main stream, open the branch, then upload B on the branch.
    A.updateDevice(mainStream);
    gm->makeBranch(mainStream, branchStream);
    B.updateDevice(branchStream);

    // Each round queues one collatz step on A (main stream) and one plusOne on B (branch).
    uint32_t round = 0;
    while (round < numRounds) {
        CudaTools::Kernel::launch(collatz, CudaTools::Kernel::basic(A.shape().items(), mainStream),
                                  A.view());
        CudaTools::Kernel::launch(plusOne,
                                  CudaTools::Kernel::basic(A.shape().items(), branchStream),
                                  B.view());
        ++round;
    }

    // Join the branch back before addArray, which reads B on the main stream.
    gm->joinBranch(mainStream, branchStream);
    CudaTools::Kernel::launch(addArray, CudaTools::Kernel::basic(A.shape().items(), mainStream),
                              A.view(), B.view());

    // Copy both results back, then run addNum(A, 5) as a host function on the main stream.
    A.updateHost(mainStream);
    B.updateHost(mainStream);
    gm->launchHostFunction(mainStream, addNum, A.view(), 5);
}

int main() {
    CudaTools::Manager::get()->addStream("graphStream");
    CudaTools::Manager::get()->addStream("graphStreamBranch");

    CudaTools::Array<uint32_t> A = CudaTools::Array<uint32_t>::constant({100}, 50);
    CudaTools::Array<uint32_t> B = CudaTools::Array<uint32_t>::constant({100}, 0);

    TIME(doFunc(A.view(), B.view()), ExecuteNoGraph);

    std::cout << A.slice({{0, 10}}) << "\n";

    A.setConstant(50);
    B.setConstant(0);

    CudaTools::GraphManager gm;
    CudaTools::Graph graph("graphStream", myGraph, &gm, A.view(), B.view());
    TIME(graph.execute().wait(), ExecuteGraph);

    std::cout << A.slice({{0, 10}}) << "\n";
    return 0;
}