Compare commits
No commits in common. '31916ed752cc8d70eb0eb302549a1662f3b9426b' and '00a27b66c32e23c65e93c3c270c8f1fb488d0eac' have entirely different histories.
31916ed752
...
00a27b66c3
18 changed files with 172 additions and 679 deletions
@ -1,9 +1,12 @@ |
|||||||
#define CUDATOOLS_IMPLEMENTATION |
#define CUDATOOLS_IMPLEMENTATION |
||||||
#include <Core.h> |
#include <Core.h> |
||||||
|
|
||||||
// Kernel `add`, shown once per diff column: both variants take two ints and
// print their sum with printf. One column declares it with KERNEL(...) on a
// single line, the other with DEFINE_KERNEL(...) spread over three lines.
KERNEL(add, int x, int y) { printf("Kernel: %i\n", x + y); } |
DEFINE_KERNEL(add, int x, int y) { |
||||||
|
printf("Kernel: %i\n", x + y); |
||||||
|
} |
||||||
|
|
||||||
// Entry point, shown once per diff column: both variants launch the `add`
// kernel with the arguments (1, 1) and the settings returned by
// CudaTools::Kernel::basic(1), then return 0. One column calls
// CudaTools::Kernel::launch(add, ...); the other uses the KERNEL(add, ...) form.
int main() { |
int main() { |
||||||
CudaTools::Kernel::launch(add, CudaTools::Kernel::basic(1), 1, 1); // Prints 2.
|
KERNEL(add, CudaTools::Kernel::basic(1), 1, 1); // Prints 2.
|
||||||
return 0; |
return 0; |
||||||
} |
} |
||||||
|
|
||||||
|
@ -1,95 +0,0 @@ |
|||||||
# Builds the simpleGraph example twice from the same sources:
#   - $(TARGET)CPU: every source is compiled with $(CC).
#   - $(TARGET)GPU: *.cu.cpp sources are compiled and device-linked with
#     $(NVCC) (with -DCUDA); all other sources are compiled with $(CC).
CC := g++-10
NVCC := nvcc
CFLAGS := -Wall -std=c++17 -fopenmp -MMD
# NOTE(review): -Xcompiler takes an argument, so as written it consumes the
# flag that follows $(NVCC_FLAGS) in the nvcc recipe (-DCUDA). Confirm whether
# a host-compiler option was meant to follow it.
NVCC_FLAGS := -MMD -std=c++17 -w -Xcompiler

INCLUDE := ../../
LIBS_DIR :=
LIBS_DIR_GPU := /usr/local/cuda/lib64
LIBS :=
LIBS_GPU := cuda cudart cublas

TARGET = simpleGraph
SRC_DIR = .
BUILD_DIR = build

# Should not need to modify below.

CPU_BUILD_DIR = $(BUILD_DIR)/cpu
GPU_BUILD_DIR = $(BUILD_DIR)/gpu

# Sources directly in SRC_DIR and one directory below it.
SRC = $(wildcard $(SRC_DIR)/*/*.cpp) $(wildcard $(SRC_DIR)/*.cpp)

# Get source files and object files.
GCC_SRC = $(filter-out %.cu.cpp,$(SRC))
NVCC_SRC = $(filter %.cu.cpp,$(SRC))
GCC_OBJ = $(GCC_SRC:$(SRC_DIR)/%.cpp=%.o)
NVCC_OBJ = $(NVCC_SRC:$(SRC_DIR)/%.cpp=%.o)

# If compiling for CPU, all go to GCC. Otherwise, they are split.
CPU_OBJ = $(addprefix $(CPU_BUILD_DIR)/,$(GCC_OBJ)) $(addprefix $(CPU_BUILD_DIR)/,$(NVCC_OBJ))
GPU_GCC_OBJ = $(addprefix $(GPU_BUILD_DIR)/,$(GCC_OBJ))
GPU_NVCC_OBJ = $(addprefix $(GPU_BUILD_DIR)/,$(NVCC_OBJ))

# $(info $$GCC_SRC is [${GCC_SRC}])
# $(info $$NVCC_SRC is [${NVCC_SRC}])
# $(info $$GCC_OBJ is [${GCC_OBJ}])
# $(info $$NVCC_OBJ is [${NVCC_OBJ}])

# $(info $$CPU_OBJ is [${CPU_OBJ}])
# $(info $$GPU_GCC_OBJ is [${GPU_GCC_OBJ}])
# $(info $$GPU_NVCC_OBJ is [${GPU_NVCC_OBJ}])

HEADER = $(wildcard $(SRC_DIR)/*/*.h) $(wildcard $(SRC_DIR)/*.h)

# Dependency files sit next to their objects, so derive them from the object
# lists; a flat wildcard on the build directory misses objects in subfolders.
CPU_DEPS = $(CPU_OBJ:.o=.d)
GPU_DEPS = $(GPU_GCC_OBJ:.o=.d) $(GPU_NVCC_OBJ:.o=.d)

INC := $(INCLUDE:%=-I%)
LIB := $(LIBS_DIR:%=-L%)
LIB_GPU := $(LIBS_DIR_GPU:%=-L%)
LD := $(LIBS:%=-l%)
LD_GPU := $(LIBS_GPU:%=-l%)

# Reminder:
# $< = first prerequisite
# $@ = the target which matched the rule
# $^ = all prerequisites

.PHONY: all cpu gpu clean

all: cpu gpu

cpu: $(TARGET)CPU
gpu: $(TARGET)GPU

# $(LD) carries the -l flags built from LIBS; $(LDFLAGS) is kept so flags
# supplied from the environment or command line still reach the link.
$(TARGET)CPU: $(CPU_OBJ)
	$(CC) $(CFLAGS) $^ -o $@ $(INC) $(LIB) $(LD) $(LDFLAGS)

# A single %.o pattern also covers foo.cu.o (stem "foo.cu" -> foo.cu.cpp).
$(CPU_BUILD_DIR)/%.o: $(SRC_DIR)/%.cpp | $(CPU_BUILD_DIR)
	@mkdir -p $(@D)
	$(CC) $(CFLAGS) -c -o $@ $< $(INC)

# For GPU, we need to build the NVCC objects, the NVCC linked object, and the
# regular ones. Then, we link them all together.
$(TARGET)GPU: $(GPU_BUILD_DIR)/link.o $(GPU_GCC_OBJ) | $(GPU_BUILD_DIR)
	$(CC) -g -DCUDA $(CFLAGS) $(GPU_NVCC_OBJ) $^ -o $@ $(INC) $(LIB) $(LIB_GPU) $(LD) $(LD_GPU)

$(GPU_BUILD_DIR)/link.o: $(GPU_NVCC_OBJ) | $(GPU_BUILD_DIR)
	$(NVCC) --device-link $^ -o $@

$(GPU_BUILD_DIR)/%.cu.o: $(SRC_DIR)/%.cu.cpp | $(GPU_BUILD_DIR)
	@mkdir -p $(@D)
	$(NVCC) $(NVCC_FLAGS) -DCUDA -x cu --device-c -o $@ $< $(INC)

$(GPU_BUILD_DIR)/%.o: $(SRC_DIR)/%.cpp | $(GPU_BUILD_DIR)
	@mkdir -p $(@D)
	$(CC) $(CFLAGS) -g -DCUDA -c -o $@ $< $(INC)

-include $(CPU_DEPS)
-include $(GPU_DEPS)

$(CPU_BUILD_DIR):
	mkdir -p $@

$(GPU_BUILD_DIR):
	mkdir -p $@

clean:
	rm -Rf $(BUILD_DIR) $(TARGET)CPU $(TARGET)GPU
|
|
@ -1,106 +0,0 @@ |
|||||||
#define CUDATOOLS_IMPLEMENTATION
#include <Array.h>
#include <Core.h>

#include <chrono>
#include <cstdint>
#include <cstdio>
#include <iostream>
|
||||||
|
|
||||||
// Declares a local `begin_<name>` holding the current steady_clock time.
// Pair it with TIME_END(<name>) in the same scope.
#define TIME_START(name) auto begin_##name = std::chrono::steady_clock::now()
|
||||||
|
|
||||||
// Closes a timing section opened by a `begin_<name>` steady_clock time point
// in the same scope. Declares `end_<name>`, `time_ms_<name>` and
// `time_mus_<name>`, then prints the elapsed time: microseconds when the
// millisecond count is 0, milliseconds otherwise.
// The counts are converted to long long and printed with %lld because the
// type returned by duration::count() is implementation-defined and need not
// be `long`.
#define TIME_END(name)                                                                             \
    auto end_##name = std::chrono::steady_clock::now();                                            \
    auto time_ms_##name = static_cast<long long>(                                                  \
        std::chrono::duration_cast<std::chrono::milliseconds>(end_##name - begin_##name).count()); \
    auto time_mus_##name = static_cast<long long>(                                                 \
        std::chrono::duration_cast<std::chrono::microseconds>(end_##name - begin_##name).count()); \
    if (time_ms_##name == 0) {                                                                     \
        printf("[%s] Time Elapsed: %lld[µs]\n", #name, time_mus_##name);                           \
    } else {                                                                                       \
        printf("[%s] Time Elapsed: %lld[ms]\n", #name, time_ms_##name);                            \
    }
|
||||||
|
|
||||||
// Times a single statement: expands to TIME_START(name), then `call`, then
// TIME_END(name). The variables those macros declare stay in the enclosing
// scope, so each `name` can be used only once per scope.
#define TIME(call, name) \
    TIME_START(name);    \
    call;                \
    TIME_END(name);
|
||||||
|
|
||||||
/**
 * Kernel: applies one Collatz step in place to each element of `arr` visited
 * by BASIC_LOOP. Odd values become 3 * v + 1; even values are halved.
 *
 * NOTE(review): the loop bound is arr.shape().length(), while the launches in
 * this file size the grid with shape().items() -- confirm the two agree for
 * the arrays used here.
 */
KERNEL(collatz, const CudaTools::Array<uint32_t> arr) {
    BASIC_LOOP(arr.shape().length()) {
        if (arr[iThread] % 2) {
            arr[iThread] = 3 * arr[iThread] + 1;
        } else {
            arr[iThread] = arr[iThread] >> 1;
        }
    }
}
|
||||||
|
|
||||||
/**
 * Kernel: increments each element of `arr` visited by BASIC_LOOP by one,
 * in place.
 */
KERNEL(plusOne, const CudaTools::Array<uint32_t> arr) {
    BASIC_LOOP(arr.shape().length()) {
        arr[iThread] += 1;
    }
}
|
||||||
|
|
||||||
/**
 * Kernel: element-wise a[i] += b[i], in place on `a`. The loop bound comes
 * from `a` only, and `b` is indexed with the same indices.
 */
KERNEL(addArray, const CudaTools::Array<uint32_t> a, const CudaTools::Array<uint32_t> b) {
    BASIC_LOOP(a.shape().length()) {
        a[iThread] += b[iThread];
    }
}
|
||||||
|
|
||||||
/**
 * Host-side helper: adds `num` to every element of `A` through the Eigen map
 * obtained from A.atLeast2D().eigenMap().
 *
 * @param A    Array to modify; the result is assigned back through the map.
 * @param num  Value added to each element.
 */
void addNum(const CudaTools::Array<uint32_t> A, uint32_t num) {
    auto Aeig = A.atLeast2D().eigenMap();
    Aeig = Aeig.array() + num;
}
|
||||||
|
|
||||||
/**
 * Runs the example workload with direct launches (no graph):
 *   1. copies A and B to the device and waits for each copy,
 *   2. launches `collatz` on A ("graphStream") and `plusOne` on B
 *      ("graphStreamBranch") 30 times each,
 *   3. launches `addArray` (A += B) on "graphStream" and waits,
 *   4. copies A and B back to the host, waits, and adds 5 to A on the host.
 *
 * @param A  Array updated by collatz / addArray / addNum.
 * @param B  Array updated by plusOne and added into A.
 */
void doFunc(const CudaTools::Array<uint32_t> A, const CudaTools::Array<uint32_t> B) {
    constexpr uint32_t kIterations = 30;

    A.updateDevice("graphStream").wait();
    B.updateDevice("graphStreamBranch").wait();

    for (uint32_t iTimes = 0; iTimes < kIterations; ++iTimes) {
        CudaTools::Kernel::launch(
            collatz, CudaTools::Kernel::basic(A.shape().items(), "graphStream"), A.view());
        auto branchLaunch = CudaTools::Kernel::launch(
            plusOne, CudaTools::Kernel::basic(A.shape().items(), "graphStreamBranch"), B.view());

        // addArray below runs on "graphStream" but reads B, which is written
        // on "graphStreamBranch". Wait on the final branch launch so that work
        // is finished first; this stands in for the joinBranch() step used in
        // the graph version.
        // NOTE(review): assumes wait() on a launch result blocks until the
        // stream it was issued on has drained -- confirm against CudaTools.
        if (iTimes + 1 == kIterations) {
            branchLaunch.wait();
        }
    }

    CudaTools::Kernel::launch(addArray, CudaTools::Kernel::basic(A.shape().items(), "graphStream"),
                              A.view(), B.view())
        .wait();

    A.updateHost("graphStream");
    B.updateHost("graphStream").wait();
    addNum(A.view(), 5);
}
|
||||||
|
|
||||||
/**
 * Issues the same workload as doFunc, written as the function handed to
 * CudaTools::Graph: no explicit waits, the second stream is attached with
 * makeBranch()/joinBranch(), and the host-side addNum step is queued with
 * launchHostFunction().
 *
 * @param gm  Graph manager used for branching, joining and the host function.
 * @param A   Array updated by collatz / addArray / addNum.
 * @param B   Array updated by plusOne and added into A.
 */
void myGraph(CudaTools::GraphManager* gm, const CudaTools::Array<uint32_t> A,
             const CudaTools::Array<uint32_t> B) {
    A.updateDevice("graphStream");

    // Fork: work issued on "graphStreamBranch" from here on branches off
    // "graphStream" until the matching joinBranch() below.
    gm->makeBranch("graphStream", "graphStreamBranch");
    B.updateDevice("graphStreamBranch");

    for (uint32_t iTimes = 0; iTimes < 30; ++iTimes) {
        CudaTools::Kernel::launch(
            collatz, CudaTools::Kernel::basic(A.shape().items(), "graphStream"), A.view());
        CudaTools::Kernel::launch(
            plusOne, CudaTools::Kernel::basic(A.shape().items(), "graphStreamBranch"), B.view());
    }

    // Join before addArray, which reads B on "graphStream".
    gm->joinBranch("graphStream", "graphStreamBranch");
    CudaTools::Kernel::launch(addArray, CudaTools::Kernel::basic(A.shape().items(), "graphStream"),
                              A.view(), B.view());

    A.updateHost("graphStream");
    B.updateHost("graphStream");
    gm->launchHostFunction("graphStream", addNum, A.view(), 5);
}
|
||||||
|
|
||||||
int main() { |
|
||||||
CudaTools::Manager::get()->addStream("graphStream"); |
|
||||||
CudaTools::Manager::get()->addStream("graphStreamBranch"); |
|
||||||
|
|
||||||
CudaTools::Array<uint32_t> A = CudaTools::Array<uint32_t>::constant({100}, 50); |
|
||||||
CudaTools::Array<uint32_t> B = CudaTools::Array<uint32_t>::constant({100}, 0); |
|
||||||
|
|
||||||
TIME(doFunc(A.view(), B.view()), ExecuteNoGraph); |
|
||||||
|
|
||||||
std::cout << A.slice({{0, 10}}) << "\n"; |
|
||||||
|
|
||||||
A.setConstant(50); |
|
||||||
B.setConstant(0); |
|
||||||
|
|
||||||
CudaTools::GraphManager gm; |
|
||||||
CudaTools::Graph graph("graphStream", myGraph, &gm, A.view(), B.view()); |
|
||||||
TIME(graph.execute().wait(), ExecuteGraph); |
|
||||||
|
|
||||||
std::cout << A.slice({{0, 10}}) << "\n"; |
|
||||||
return 0; |
|
||||||
} |
|
Loading…
Reference in new issue