|
|
|
#define CUDATOOLS_IMPLEMENTATION
|
|
|
|
#include <Array.h>
|
|
|
|
#include <Core.h>
|
|
|
|
#include <chrono>
|
|
|
|
|
|
|
|
/**
 * Starts a named wall-clock timer.
 *
 * Declares a local variable `begin_<name>` holding the current
 * std::chrono::steady_clock reading. Pair with TIME_END(name) in the same
 * scope. No trailing semicolon is included; the caller supplies it.
 */
#define TIME_START(name) \
    std::chrono::steady_clock::time_point begin_##name = std::chrono::steady_clock::now()
|
|
|
|
|
|
|
|
/**
 * Stops the timer named `name` and prints the elapsed time to stdout.
 *
 * Expects a variable `begin_<name>` (as declared by TIME_START(name)) to be
 * visible in the current scope. Declares `end_<name>`, `time_ms_<name>` and
 * `time_mus_<name>` in that same scope, so a given name can only be used once
 * per scope.
 *
 * The elapsed time is reported in milliseconds, or in microseconds when it is
 * shorter than one millisecond.
 *
 * The tick counts are cast to `long long` and printed with `%lld`:
 * `duration::rep` is only guaranteed to be a signed integer of sufficient
 * width, and is not `long` on every platform, so `%ld` was a format/argument
 * mismatch there.
 */
#define TIME_END(name)                                                                       \
    auto end_##name = std::chrono::steady_clock::now();                                      \
    auto time_ms_##name =                                                                    \
        std::chrono::duration_cast<std::chrono::milliseconds>(end_##name - begin_##name)     \
            .count();                                                                        \
    auto time_mus_##name =                                                                   \
        std::chrono::duration_cast<std::chrono::microseconds>(end_##name - begin_##name)     \
            .count();                                                                        \
    if (time_ms_##name == 0) {                                                               \
        printf("[%s] Time Elapsed: %lld[µs]\n", #name,                                       \
               static_cast<long long>(time_mus_##name));                                     \
    } else {                                                                                 \
        printf("[%s] Time Elapsed: %lld[ms]\n", #name,                                       \
               static_cast<long long>(time_ms_##name));                                      \
    }
|
|
|
|
|
|
|
|
/**
 * Times a single expression or statement.
 *
 * Expands to TIME_START(name), then `call`, then TIME_END(name), all in the
 * caller's scope, so the variables those macros declare remain visible after
 * the expansion.
 */
#define TIME(call, name) \
    TIME_START(name); call; TIME_END(name);
|
|
|
|
|
|
|
|
/**
 * Kernel: applies one Collatz step in place to each element visited by
 * BASIC_LOOP over arr.shape().length().
 *
 * Odd values become 3 * x + 1, even values are halved (x >> 1). The
 * arithmetic is unsigned 32-bit, so 3 * x + 1 wraps on overflow.
 *
 * `iThread` is the index supplied by the CudaTools BASIC_LOOP macro.
 */
KERNEL(collatz, const CudaTools::Array<uint32_t> arr) {
    BASIC_LOOP(arr.shape().length()) {
        const uint32_t value = arr[iThread];
        arr[iThread] = (value % 2) ? (3 * value + 1) : (value >> 1);
    }
}
|
|
|
|
|
|
|
|
/**
 * Kernel: increments, in place, each element visited by BASIC_LOOP over
 * arr.shape().length().
 *
 * `iThread` is the index supplied by the CudaTools BASIC_LOOP macro.
 */
KERNEL(plusOne, const CudaTools::Array<uint32_t> arr) {
    BASIC_LOOP(arr.shape().length()) {
        arr[iThread] += 1;
    }
}
|
|
|
|
|
|
|
|
/**
 * Kernel: element-wise in-place addition, a[i] += b[i], for each index
 * visited by BASIC_LOOP over a.shape().length().
 *
 * The loop bound comes from `a` only; `b` is indexed with the same index, so
 * it must have at least as many elements along that dimension.
 */
KERNEL(addArray, const CudaTools::Array<uint32_t> a, const CudaTools::Array<uint32_t> b) {
    BASIC_LOOP(a.shape().length()) {
        a[iThread] += b[iThread];
    }
}
|
|
|
|
|
|
|
|
/**
 * Adds `num` to every element of A through its Eigen map.
 *
 * Called directly on the host in doFunc and handed to
 * GraphManager::launchHostFunction in myGraph.
 *
 * \param A   array to modify in place (viewed as at least 2D for the map)
 * \param num value added to each element
 */
void addNum(const CudaTools::Array<uint32_t> A, uint32_t num) {
    auto mapped = A.atLeast2D().eigenMap();
    // Coefficient-wise add, written back through the map.
    mapped = mapped.array() + num;
}
|
|
|
|
|
|
|
|
/**
 * Runs the demo pipeline eagerly (without a graph) on the two named streams
 * "graphStream" and "graphStreamBranch".
 *
 * Steps:
 *   1. Upload A on "graphStream" and B on "graphStreamBranch".
 *   2. 30 times: launch `collatz` on A ("graphStream") and `plusOne` on B
 *      ("graphStreamBranch").
 *   3. Launch `addArray` (A += B) on "graphStream".
 *   4. Copy A and B back to the host, then add 5 to A on the host.
 *
 * A and B are expected to have the same shape, since addArray indexes B with
 * A's bounds.
 *
 * \param A array iterated with collatz, then accumulated into
 * \param B array incremented on the branch stream, then added to A
 */
void doFunc(const CudaTools::Array<uint32_t> A, const CudaTools::Array<uint32_t> B) {
    constexpr uint32_t kIterations = 30;

    A.updateDevice("graphStream").wait();
    B.updateDevice("graphStreamBranch").wait();

    for (uint32_t iTimes = 0; iTimes < kIterations; ++iTimes) {
        CudaTools::Kernel::launch(
            collatz, CudaTools::Kernel::basic(A.shape().items(), "graphStream"), A.view());

        // Size the launch from B, the array this kernel actually iterates over.
        auto&& branchLaunch = CudaTools::Kernel::launch(
            plusOne, CudaTools::Kernel::basic(B.shape().items(), "graphStreamBranch"), B.view());

        // addArray below runs on "graphStream" but reads B, which is written on
        // "graphStreamBranch". Nothing else orders the two streams, so drain the
        // branch stream after its final kernel (the eager counterpart of the
        // joinBranch call in myGraph).
        // NOTE(review): assumes wait() blocks until the work queued on that
        // stream has finished, as its other uses in this function suggest.
        if (iTimes + 1 == kIterations) {
            branchLaunch.wait();
        }
    }

    CudaTools::Kernel::launch(addArray, CudaTools::Kernel::basic(A.shape().items(), "graphStream"),
                              A.view(), B.view())
        .wait();

    // Both copies are queued on the same stream; the single wait follows the second.
    A.updateHost("graphStream");
    B.updateHost("graphStream").wait();

    addNum(A.view(), 5);
}
|
|
|
|
|
|
|
|
/**
 * Graph version of the demo pipeline; passed to CudaTools::Graph in main
 * together with "graphStream".
 *
 * Issues the same sequence as doFunc, but without host-side waits:
 *   1. Upload A on "graphStream"; branch "graphStreamBranch" off it and
 *      upload B there.
 *   2. 30 times: launch `collatz` on A ("graphStream") and `plusOne` on B
 *      ("graphStreamBranch").
 *   3. Join the branch back into "graphStream", then launch `addArray`
 *      (A += B).
 *   4. Copy A and B back to the host and schedule addNum(A, 5) as a host
 *      function on "graphStream".
 *
 * A and B are expected to have the same shape, since addArray indexes B with
 * A's bounds.
 *
 * \param gm graph manager used for branching, joining and the host function
 * \param A  array iterated with collatz, then accumulated into
 * \param B  array incremented on the branch stream, then added to A
 */
void myGraph(CudaTools::GraphManager* gm, const CudaTools::Array<uint32_t> A,
             const CudaTools::Array<uint32_t> B) {
    constexpr uint32_t kIterations = 30;

    A.updateDevice("graphStream");
    gm->makeBranch("graphStream", "graphStreamBranch");
    B.updateDevice("graphStreamBranch");

    for (uint32_t iTimes = 0; iTimes < kIterations; ++iTimes) {
        CudaTools::Kernel::launch(
            collatz, CudaTools::Kernel::basic(A.shape().items(), "graphStream"), A.view());
        // Size the launch from B, the array this kernel actually iterates over.
        CudaTools::Kernel::launch(
            plusOne, CudaTools::Kernel::basic(B.shape().items(), "graphStreamBranch"), B.view());
    }

    // addArray reads B, so the branch has to be joined before it is issued.
    gm->joinBranch("graphStream", "graphStreamBranch");
    CudaTools::Kernel::launch(addArray, CudaTools::Kernel::basic(A.shape().items(), "graphStream"),
                              A.view(), B.view());

    A.updateHost("graphStream");
    B.updateHost("graphStream");
    gm->launchHostFunction("graphStream", addNum, A.view(), 5);
}
|
|
|
|
|
|
|
|
/**
 * Demo driver: runs the same pipeline once eagerly (doFunc) and once through
 * a CudaTools::Graph (myGraph), timing each run and printing the first ten
 * elements of A after each.
 */
int main() {
    // Fill values the arrays start from before each run.
    const uint32_t initialA = 50;
    const uint32_t initialB = 0;

    // Register the two named streams used by doFunc and myGraph.
    CudaTools::Manager::get()->addStream("graphStream");
    CudaTools::Manager::get()->addStream("graphStreamBranch");

    CudaTools::Array<uint32_t> A = CudaTools::Array<uint32_t>::constant({100}, initialA);
    CudaTools::Array<uint32_t> B = CudaTools::Array<uint32_t>::constant({100}, initialB);

    // Run 1: eager execution, no graph.
    TIME(doFunc(A.view(), B.view()), ExecuteNoGraph);
    std::cout << A.slice({{0, 10}}) << "\n";

    // Restore the starting values so both runs begin from the same state.
    A.setConstant(initialA);
    B.setConstant(initialB);

    // Run 2: the same process built as a graph, then executed.
    CudaTools::GraphManager gm;
    CudaTools::Graph graph("graphStream", myGraph, &gm, A.view(), B.view());
    TIME(graph.execute().wait(), ExecuteGraph);
    std::cout << A.slice({{0, 10}}) << "\n";

    return 0;
}
|