#define CUDATOOLS_IMPLEMENTATION #include #include #include #define TIME_START(name) auto begin_##name = std::chrono::steady_clock::now() #define TIME_END(name) \ auto end_##name = std::chrono::steady_clock::now(); \ auto time_ms_##name = \ std::chrono::duration_cast(end_##name - begin_##name).count(); \ auto time_mus_##name = \ std::chrono::duration_cast(end_##name - begin_##name).count(); \ if (time_ms_##name == 0) { \ printf("[%s] Time Elapsed: %ld[µs]\n", #name, time_mus_##name); \ } else { \ printf("[%s] Time Elapsed: %ld[ms]\n", #name, time_ms_##name); \ } #define TIME(call, name) \ TIME_START(name); \ call; \ TIME_END(name); KERNEL(collatz, const CudaTools::Array arr) { BASIC_LOOP(arr.shape().length()) { if (arr[iThread] % 2) { arr[iThread] = 3 * arr[iThread] + 1; } else { arr[iThread] = arr[iThread] >> 1; } } } KERNEL(plusOne, const CudaTools::Array arr) { BASIC_LOOP(arr.shape().length()) { arr[iThread] += 1; } } KERNEL(addArray, const CudaTools::Array a, const CudaTools::Array b) { BASIC_LOOP(a.shape().length()) { a[iThread] += b[iThread]; } } void addNum(const CudaTools::Array A, uint32_t num) { auto Aeig = A.atLeast2D().eigenMap(); Aeig = Aeig.array() + num; } void doFunc(const CudaTools::Array A, const CudaTools::Array B) { A.updateDevice("graphStream").wait(); B.updateDevice("graphStreamBranch").wait(); for (uint32_t iTimes = 0; iTimes < 30; ++iTimes) { CudaTools::Kernel::launch( collatz, CudaTools::Kernel::basic(A.shape().items(), "graphStream"), A.view()); CudaTools::Kernel::launch( plusOne, CudaTools::Kernel::basic(A.shape().items(), "graphStreamBranch"), B.view()); } CudaTools::Kernel::launch(addArray, CudaTools::Kernel::basic(A.shape().items(), "graphStream"), A.view(), B.view()) .wait(); A.updateHost("graphStream"); B.updateHost("graphStream").wait(); addNum(A.view(), 5); } void myGraph(CudaTools::GraphManager* gm, const CudaTools::Array A, const CudaTools::Array B) { A.updateDevice("graphStream"); gm->makeBranch("graphStream", "graphStreamBranch"); B.updateDevice("graphStreamBranch"); for (uint32_t iTimes = 0; iTimes < 30; ++iTimes) { CudaTools::Kernel::launch( collatz, CudaTools::Kernel::basic(A.shape().items(), "graphStream"), A.view()); CudaTools::Kernel::launch( plusOne, CudaTools::Kernel::basic(A.shape().items(), "graphStreamBranch"), B.view()); } gm->joinBranch("graphStream", "graphStreamBranch"); CudaTools::Kernel::launch(addArray, CudaTools::Kernel::basic(A.shape().items(), "graphStream"), A.view(), B.view()); A.updateHost("graphStream"); B.updateHost("graphStream"); gm->launchHostFunction("graphStream", addNum, A.view(), 5); } int main() { CudaTools::Manager::get()->addStream("graphStream"); CudaTools::Manager::get()->addStream("graphStreamBranch"); CudaTools::Array A = CudaTools::Array::constant({100}, 50); CudaTools::Array B = CudaTools::Array::constant({100}, 0); // Executes process without graph. TIME(doFunc(A.view(), B.view()), ExecuteNoGraph); std::cout << A.slice({{0, 10}}) << "\n"; A.setConstant(50); B.setConstant(0); // Executes process with graph. CudaTools::GraphManager gm; CudaTools::Graph graph("graphStream", myGraph, &gm, A.view(), B.view()); TIME(graph.execute().wait(), ExecuteGraph); std::cout << A.slice({{0, 10}}) << "\n"; return 0; }