A library and framework for developing CPU- and CUDA-compatible applications from a single, unified codebase.
You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.
 
 
 

106 lines
4.4 KiB

#define CUDATOOLS_IMPLEMENTATION
#include <Array.h>
#include <Core.h>
#include <chrono>
// Wall-clock timing helpers built on std::chrono::steady_clock.
//
// TIME_START(name) stores the current time point in a variable called
// begin_<name> in the enclosing scope. TIME_END(name) must be expanded where
// that variable is visible.
#define TIME_START(name) auto begin_##name = std::chrono::steady_clock::now()

// Prints the time elapsed since the matching TIME_START(name): in microseconds
// when fewer than one whole millisecond has passed, otherwise in milliseconds.
//
// The body is wrapped in do { } while (0) so the macro expands to exactly one
// statement (safe after an unbraced `if`). The counts are cast to long long to
// match the %lld specifier, because the integer type returned by
// duration::count() is implementation-defined and is not always `long`.
#define TIME_END(name)                                                                     \
    do {                                                                                   \
        const auto end_##name = std::chrono::steady_clock::now();                          \
        const auto elapsed_##name = end_##name - begin_##name;                             \
        const long long time_ms_##name = static_cast<long long>(                           \
            std::chrono::duration_cast<std::chrono::milliseconds>(elapsed_##name).count()); \
        const long long time_mus_##name = static_cast<long long>(                          \
            std::chrono::duration_cast<std::chrono::microseconds>(elapsed_##name).count()); \
        if (time_ms_##name == 0) {                                                         \
            printf("[%s] Time Elapsed: %lld[µs]\n", #name, time_mus_##name);               \
        } else {                                                                           \
            printf("[%s] Time Elapsed: %lld[ms]\n", #name, time_ms_##name);                \
        }                                                                                  \
    } while (0)

// Runs `call` between TIME_START(name) and TIME_END(name) and prints how long
// it took. Expands to several statements, so use it as a statement of its own.
#define TIME(call, name) \
    TIME_START(name);    \
    call;                \
    TIME_END(name);
/**
 * Kernel: applies one Collatz step in place to every element of `arr`.
 * Odd values become 3 * x + 1, even values are halved (x >> 1).
 *
 * `iThread` is supplied by BASIC_LOOP, which is given arr.shape().length() as
 * its bound (the exact iteration scheme is defined by the CudaTools macro).
 */
KERNEL(collatz, const CudaTools::Array<uint32_t> arr) {
    BASIC_LOOP(arr.shape().length()) {
        // Non-zero remainder means the value is odd.
        arr[iThread] = (arr[iThread] % 2) ? 3 * arr[iThread] + 1 : arr[iThread] >> 1;
    }
}
/**
 * Kernel: increments every element of `arr` by one, in place.
 * BASIC_LOOP is bounded by arr.shape().length() and provides `iThread`.
 */
KERNEL(plusOne, const CudaTools::Array<uint32_t> arr) {
    BASIC_LOOP(arr.shape().length()) {
        arr[iThread] = arr[iThread] + 1;
    }
}
/**
 * Kernel: element-wise in-place sum, a[i] = a[i] + b[i].
 *
 * The loop bound comes from `a` only.
 * NOTE(review): `b` is presumably expected to hold at least as many elements
 * as `a`; nothing here checks that.
 */
KERNEL(addArray, const CudaTools::Array<uint32_t> a, const CudaTools::Array<uint32_t> b) {
    BASIC_LOOP(a.shape().length()) {
        a[iThread] = a[iThread] + b[iThread];
    }
}
/**
 * Adds `num` to every element of `A`, writing the result back through the map
 * returned by eigenMap() on an at-least-2D form of the array.
 *
 * @param A   array to modify (taken by value; presumably a view that shares
 *            storage with the caller's array, so confirm against CudaTools::Array).
 * @param num value added to each element.
 */
void addNum(const CudaTools::Array<uint32_t> A, uint32_t num) {
    auto matrixForm = A.atLeast2D();
    auto mapped = matrixForm.eigenMap();
    // Coefficient-wise addition, assigned back through the map.
    mapped = mapped.array() + num;
}
/**
 * Runs the collatz / plusOne / addArray pipeline with ordinary stream launches
 * (no graph capture).
 *
 * Steps: upload A and B, launch `collatz` on A ("graphStream") and `plusOne`
 * on B ("graphStreamBranch") 30 times each, add B into A with `addArray`,
 * copy both arrays back to the host, then add 5 to A on the host via addNum.
 *
 * @param A array processed by `collatz`; also receives the sum with B and the
 *          final host-side +5.
 * @param B array incremented by `plusOne`.
 *          NOTE(review): launches for B are sized with A.shape().items(), so B
 *          is presumably expected to hold as many items as A.
 */
void doFunc(const CudaTools::Array<uint32_t> A, const CudaTools::Array<uint32_t> B) {
    constexpr uint32_t kIterations = 30;

    A.updateDevice("graphStream").wait();
    B.updateDevice("graphStreamBranch").wait();

    for (uint32_t iTimes = 0; iTimes < kIterations; ++iTimes) {
        CudaTools::Kernel::launch(
            collatz, CudaTools::Kernel::basic(A.shape().items(), "graphStream"), A.view());
        auto branchLaunch = CudaTools::Kernel::launch(
            plusOne, CudaTools::Kernel::basic(A.shape().items(), "graphStreamBranch"), B.view());

        // addArray below runs on "graphStream" but reads B, which the plusOne
        // launches modify on "graphStreamBranch". Nothing else orders the two
        // streams, so wait on the final branch launch before continuing
        // (assuming wait() blocks until that stream's queued work is done).
        // myGraph expresses the same dependency with joinBranch.
        if (iTimes + 1 == kIterations) {
            branchLaunch.wait();
        }
    }

    CudaTools::Kernel::launch(addArray, CudaTools::Kernel::basic(A.shape().items(), "graphStream"),
                              A.view(), B.view())
        .wait();

    // Both copies are queued on "graphStream"; the wait on the second one is
    // relied on to cover the first before the host touches A.
    A.updateHost("graphStream");
    B.updateHost("graphStream").wait();

    addNum(A.view(), 5);
}
/**
 * Issues the same pipeline as doFunc, using the graph manager to express the
 * fork/join between "graphStream" and "graphStreamBranch" and to schedule the
 * host-side addNum call. It is passed to CudaTools::Graph in main.
 *
 * @param gm graph manager used for makeBranch / joinBranch / launchHostFunction.
 * @param A  array processed by `collatz` on "graphStream".
 * @param B  array processed by `plusOne` on "graphStreamBranch".
 */
void myGraph(CudaTools::GraphManager* gm, const CudaTools::Array<uint32_t> A,
             const CudaTools::Array<uint32_t> B) {
    // Every launch below is sized by A's item count.
    const auto itemCount = A.shape().items();

    A.updateDevice("graphStream");

    // Fork: work on B goes to the branch stream.
    gm->makeBranch("graphStream", "graphStreamBranch");
    B.updateDevice("graphStreamBranch");

    for (uint32_t pass = 0; pass != 30; ++pass) {
        CudaTools::Kernel::launch(collatz, CudaTools::Kernel::basic(itemCount, "graphStream"),
                                  A.view());
        CudaTools::Kernel::launch(
            plusOne, CudaTools::Kernel::basic(itemCount, "graphStreamBranch"), B.view());
    }

    // Join: the branch is merged back before addArray reads B.
    gm->joinBranch("graphStream", "graphStreamBranch");

    CudaTools::Kernel::launch(addArray, CudaTools::Kernel::basic(itemCount, "graphStream"),
                              A.view(), B.view());

    A.updateHost("graphStream");
    B.updateHost("graphStream");

    // Host-side +5 on A, scheduled on "graphStream".
    gm->launchHostFunction("graphStream", addNum, A.view(), 5);
}
int main() {
CudaTools::Manager::get()->addStream("graphStream");
CudaTools::Manager::get()->addStream("graphStreamBranch");
CudaTools::Array<uint32_t> A = CudaTools::Array<uint32_t>::constant({100}, 50);
CudaTools::Array<uint32_t> B = CudaTools::Array<uint32_t>::constant({100}, 0);
TIME(doFunc(A.view(), B.view()), ExecuteNoGraph);
std::cout << A.slice({{0, 10}}) << "\n";
A.setConstant(50);
B.setConstant(0);
CudaTools::GraphManager gm;
CudaTools::Graph graph("graphStream", myGraph, &gm, A.view(), B.view());
TIME(graph.execute().wait(), ExecuteGraph);
std::cout << A.slice({{0, 10}}) << "\n";
return 0;
}