parent
							
								
									a393ff92d2
								
							
						
					
					
						commit
						31916ed752
					
				
				 15 changed files with 442 additions and 180 deletions
			
			
		| @ -0,0 +1,95 @@ | ||||
# ---------------------------------------------------------------------------
# User-tunable build configuration.
# ---------------------------------------------------------------------------

# Host C++ compiler: compiles every non-CUDA object and performs both links.
CC := g++-10
# CUDA compiler driver: compiles *.cu.cpp sources in the GPU build.
NVCC := nvcc

# Host compile flags. -MMD makes the compiler write a .d dependency file next
# to each object.
CFLAGS := -Wall -std=c++17 -fopenmp -MMD
# nvcc compile flags. Never end this list with a bare `-Xcompiler`: that option
# takes an argument, so it would swallow whichever flag a recipe places right
# after $(NVCC_FLAGS). To forward a host-compiler option, add the option and
# its value together (e.g. `-Xcompiler -fopenmp`).
NVCC_FLAGS := -MMD -std=c++17 -w

# Include search directories (each becomes -I<dir>).
INCLUDE := ../../
# Library search directories (each becomes -L<dir>); the _GPU list is only
# used when linking the GPU executable.
LIBS_DIR :=
LIBS_DIR_GPU := /usr/local/cuda/lib64
# Libraries to link (each becomes -l<name>); the _GPU list is only used when
# linking the GPU executable.
LIBS :=
LIBS_GPU := cuda cudart cublas

# Base name of the executables; the rules append CPU / GPU to it.
TARGET = simpleGraph
# Where the sources live and where objects are written.
SRC_DIR = .
BUILD_DIR = build
| 
 | ||||
| # Should not need to modify below.
 | ||||
| 
 | ||||
# Separate object trees so CPU and GPU objects never overwrite each other.
CPU_BUILD_DIR := $(BUILD_DIR)/cpu
GPU_BUILD_DIR := $(BUILD_DIR)/gpu

# All .cpp files one directory below SRC_DIR, then those directly inside it.
SRC := $(wildcard $(SRC_DIR)/*/*.cpp $(SRC_DIR)/*.cpp)

# Split the sources by suffix: *.cu.cpp is CUDA-capable code, everything else
# is plain host code. Object names are the source paths relative to SRC_DIR.
GCC_SRC := $(filter-out %.cu.cpp,$(SRC))
NVCC_SRC := $(filter %.cu.cpp,$(SRC))
GCC_OBJ := $(patsubst $(SRC_DIR)/%.cpp,%.o,$(GCC_SRC))
NVCC_OBJ := $(patsubst $(SRC_DIR)/%.cpp,%.o,$(NVCC_SRC))

# CPU build: every object is produced by the host compiler.
# GPU build: host objects and nvcc objects are kept in separate lists.
CPU_OBJ := $(addprefix $(CPU_BUILD_DIR)/,$(GCC_OBJ) $(NVCC_OBJ))
GPU_GCC_OBJ := $(addprefix $(GPU_BUILD_DIR)/,$(GCC_OBJ))
GPU_NVCC_OBJ := $(addprefix $(GPU_BUILD_DIR)/,$(NVCC_OBJ))
| 
 | ||||
| # $(info $$GCC_SRC is [${GCC_SRC}])
 | ||||
| # $(info $$NVCC_SRC is [${NVCC_SRC}])
 | ||||
| # $(info $$GCC_OBJ is [${GCC_OBJ}])
 | ||||
| # $(info $$NVCC_OBJ is [${NVCC_OBJ}])
 | ||||
| 
 | ||||
| # $(info $$CPU_OBJ is [${CPU_OBJ}])
 | ||||
| # $(info $$GPU_GCC_OBJ is [${GPU_GCC_OBJ}])
 | ||||
| # $(info $$GPU_NVCC_OBJ is [${GPU_NVCC_OBJ}])
 | ||||
| 
 | ||||
# Headers that sit next to the sources (not referenced by the rules below).
HEADER = $(wildcard $(SRC_DIR)/*/*.h) $(wildcard $(SRC_DIR)/*.h)

# Dependency files written by -MMD, one per object. They are derived from the
# object lists rather than globbed from the build directory so that objects
# built from sources in subdirectories are covered as well, and stale .d files
# of removed sources are not picked up.
CPU_DEPS = $(CPU_OBJ:.o=.d)
GPU_DEPS = $(GPU_GCC_OBJ:.o=.d) $(GPU_NVCC_OBJ:.o=.d)

# Turn the plain directory / library names into compiler and linker options.
INC := $(INCLUDE:%=-I%)
LIB := $(LIBS_DIR:%=-L%)
LIB_GPU := $(LIBS_DIR_GPU:%=-L%)
LD := $(LIBS:%=-l%)
LD_GPU := $(LIBS_GPU:%=-l%)
| 
 | ||||
| # Reminder:
 | ||||
| # $< = first prerequisite
 | ||||
| # $@ = the target which matched the rule
 | ||||
| # $^ = all prerequisites
 | ||||
| 
 | ||||
# Command targets: none of these names a file, so they must always run even
# if a file called `cpu`, `gpu`, `all` or `clean` happens to exist.
.PHONY: all cpu gpu clean

# Default goal: build both executables.
all: cpu gpu

# Convenience aliases for the two executables.
cpu: $(TARGET)CPU
gpu: $(TARGET)GPU
| 
 | ||||
# Link the CPU executable from every object. $(LD) carries the -l options
# built from LIBS; $(LDFLAGS) is kept so extra linker flags can still be
# passed on the command line.
$(TARGET)CPU: $(CPU_OBJ)
	$(CC) $(CFLAGS) $^ -o $@ $(INC) $(LIB) $(LDFLAGS) $(LD)

# Compile one source with the host compiler. A single pattern is enough:
# foo.cu.o matches with stem `foo.cu` and is built from foo.cu.cpp. (Listing
# a second `%.cu.o` target here would tell make that one recipe run creates
# both foo.o and foo.cu.o, which it does not.)
# The object's own directory is created first so sources living in
# subdirectories of SRC_DIR can be compiled.
$(CPU_BUILD_DIR)/%.o: $(SRC_DIR)/%.cpp | $(CPU_BUILD_DIR)
	@mkdir -p $(@D)
	$(CC) $(CFLAGS) -c -o $@ $< $(INC)
| 
 | ||||
# For GPU, we need to build the NVCC objects, the NVCC device-linked object,
# and the regular host objects. Then, we link them all together.
# The NVCC objects are listed as prerequisites (not only named in the recipe)
# so the dependency graph matches what the link command actually consumes.
$(TARGET)GPU: $(GPU_NVCC_OBJ) $(GPU_BUILD_DIR)/link.o $(GPU_GCC_OBJ) | $(GPU_BUILD_DIR)
	$(CC) -g -DCUDA $(CFLAGS) $^ -o $@ $(INC) $(LIB) $(LIB_GPU) $(LD) $(LD_GPU)

# Device-link step: combine the device code of all NVCC objects into link.o.
$(GPU_BUILD_DIR)/link.o: $(GPU_NVCC_OBJ) | $(GPU_BUILD_DIR)
	$(NVCC) --device-link $^ -o $@

# Compile a *.cu.cpp source as CUDA (-x cu) with separate device compilation
# (--device-c). The object's directory is created first so sources in
# subdirectories of SRC_DIR work.
$(GPU_BUILD_DIR)/%.cu.o: $(SRC_DIR)/%.cu.cpp | $(GPU_BUILD_DIR)
	@mkdir -p $(@D)
	$(NVCC) $(NVCC_FLAGS) -DCUDA -x cu --device-c -o $@ $< $(INC)

# Compile a plain host source for the GPU build; CUDA is defined so shared
# headers can select their GPU code paths.
$(GPU_BUILD_DIR)/%.o: $(SRC_DIR)/%.cpp | $(GPU_BUILD_DIR)
	@mkdir -p $(@D)
	$(CC) $(CFLAGS) -g -DCUDA -c -o $@ $< $(INC)
| 
 | ||||
# Pull in the compiler-generated header dependencies. The leading '-' keeps
# make silent when the files do not exist yet (e.g. on the first build).
-include $(CPU_DEPS) $(GPU_DEPS)

# Create the object directories on demand; the compile and link rules name
# them as order-only prerequisites.
$(CPU_BUILD_DIR) $(GPU_BUILD_DIR):
	mkdir -p $@

# Remove every build product: the whole object tree and both executables.
clean:
	rm -rf $(BUILD_DIR) $(TARGET)CPU $(TARGET)GPU
| @ -0,0 +1,106 @@ | ||||
| #define CUDATOOLS_IMPLEMENTATION | ||||
| #include <Array.h> | ||||
| #include <Core.h> | ||||
| #include <chrono> | ||||
| 
 | ||||
// Starts a named timer: stores the current steady-clock time point in a local
// variable called begin_<name>.
#define TIME_START(name) auto begin_##name = std::chrono::steady_clock::now()

// Stops the timer started by TIME_START(name) in the same scope and prints the
// elapsed time: in microseconds when it is below one millisecond, in
// milliseconds otherwise. Declares end_<name>, time_ms_<name> and
// time_mus_<name> in the enclosing scope, so each name can be used only once
// per scope. The counts are converted to long long and printed with %lld
// because the representation type of std::chrono durations is not `long` on
// every platform.
#define TIME_END(name)                                                                             \
    auto end_##name = std::chrono::steady_clock::now();                                            \
    auto time_ms_##name = static_cast<long long>(                                                  \
        std::chrono::duration_cast<std::chrono::milliseconds>(end_##name - begin_##name).count()); \
    auto time_mus_##name = static_cast<long long>(                                                 \
        std::chrono::duration_cast<std::chrono::microseconds>(end_##name - begin_##name).count()); \
    if (time_ms_##name == 0) {                                                                     \
        printf("[%s] Time Elapsed: %lld[µs]\n", #name, time_mus_##name);                           \
    } else {                                                                                       \
        printf("[%s] Time Elapsed: %lld[ms]\n", #name, time_ms_##name);                            \
    }

// Times a single statement or expression: runs `call` between TIME_START and
// TIME_END and prints the result under the label `name`.
#define TIME(call, name)                                                                           \
    TIME_START(name);                                                                              \
    call;                                                                                          \
    TIME_END(name);
| 
 | ||||
| KERNEL(collatz, const CudaTools::Array<uint32_t> arr) { | ||||
|     BASIC_LOOP(arr.shape().length()) { | ||||
|         if (arr[iThread] % 2) { | ||||
|             arr[iThread] = 3 * arr[iThread] + 1; | ||||
|         } else { | ||||
|             arr[iThread] = arr[iThread] >> 1; | ||||
|         } | ||||
|     } | ||||
| } | ||||
| 
 | ||||
| KERNEL(plusOne, const CudaTools::Array<uint32_t> arr) { | ||||
|     BASIC_LOOP(arr.shape().length()) { arr[iThread] += 1; } | ||||
| } | ||||
| 
 | ||||
| KERNEL(addArray, const CudaTools::Array<uint32_t> a, const CudaTools::Array<uint32_t> b) { | ||||
|     BASIC_LOOP(a.shape().length()) { a[iThread] += b[iThread]; } | ||||
| } | ||||
| 
 | ||||
| void addNum(const CudaTools::Array<uint32_t> A, uint32_t num) { | ||||
|     auto Aeig = A.atLeast2D().eigenMap(); | ||||
|     Aeig = Aeig.array() + num; | ||||
| } | ||||
| 
 | ||||
| void doFunc(const CudaTools::Array<uint32_t> A, const CudaTools::Array<uint32_t> B) { | ||||
|     A.updateDevice("graphStream").wait(); | ||||
|     B.updateDevice("graphStreamBranch").wait(); | ||||
|     for (uint32_t iTimes = 0; iTimes < 30; ++iTimes) { | ||||
|         CudaTools::Kernel::launch( | ||||
|             collatz, CudaTools::Kernel::basic(A.shape().items(), "graphStream"), A.view()); | ||||
|         CudaTools::Kernel::launch( | ||||
|             plusOne, CudaTools::Kernel::basic(A.shape().items(), "graphStreamBranch"), B.view()); | ||||
|     } | ||||
| 
 | ||||
|     CudaTools::Kernel::launch(addArray, CudaTools::Kernel::basic(A.shape().items(), "graphStream"), | ||||
|                               A.view(), B.view()) | ||||
|         .wait(); | ||||
|     A.updateHost("graphStream"); | ||||
|     B.updateHost("graphStream").wait(); | ||||
|     addNum(A.view(), 5); | ||||
| } | ||||
| 
 | ||||
| void myGraph(CudaTools::GraphManager* gm, const CudaTools::Array<uint32_t> A, | ||||
|              const CudaTools::Array<uint32_t> B) { | ||||
|     A.updateDevice("graphStream"); | ||||
|     gm->makeBranch("graphStream", "graphStreamBranch"); | ||||
|     B.updateDevice("graphStreamBranch"); | ||||
|     for (uint32_t iTimes = 0; iTimes < 30; ++iTimes) { | ||||
|         CudaTools::Kernel::launch( | ||||
|             collatz, CudaTools::Kernel::basic(A.shape().items(), "graphStream"), A.view()); | ||||
|         CudaTools::Kernel::launch( | ||||
|             plusOne, CudaTools::Kernel::basic(A.shape().items(), "graphStreamBranch"), B.view()); | ||||
|     } | ||||
| 
 | ||||
|     gm->joinBranch("graphStream", "graphStreamBranch"); | ||||
|     CudaTools::Kernel::launch(addArray, CudaTools::Kernel::basic(A.shape().items(), "graphStream"), | ||||
|                               A.view(), B.view()); | ||||
|     A.updateHost("graphStream"); | ||||
|     B.updateHost("graphStream"); | ||||
|     gm->launchHostFunction("graphStream", addNum, A.view(), 5); | ||||
| } | ||||
| 
 | ||||
| int main() { | ||||
|     CudaTools::Manager::get()->addStream("graphStream"); | ||||
|     CudaTools::Manager::get()->addStream("graphStreamBranch"); | ||||
| 
 | ||||
|     CudaTools::Array<uint32_t> A = CudaTools::Array<uint32_t>::constant({100}, 50); | ||||
|     CudaTools::Array<uint32_t> B = CudaTools::Array<uint32_t>::constant({100}, 0); | ||||
| 
 | ||||
|     TIME(doFunc(A.view(), B.view()), ExecuteNoGraph); | ||||
| 
 | ||||
|     std::cout << A.slice({{0, 10}}) << "\n"; | ||||
| 
 | ||||
|     A.setConstant(50); | ||||
|     B.setConstant(0); | ||||
| 
 | ||||
|     CudaTools::GraphManager gm; | ||||
|     CudaTools::Graph graph("graphStream", myGraph, &gm, A.view(), B.view()); | ||||
|     TIME(graph.execute().wait(), ExecuteGraph); | ||||
| 
 | ||||
|     std::cout << A.slice({{0, 10}}) << "\n"; | ||||
|     return 0; | ||||
| } | ||||
					Loading…
					
					
				
		Reference in new issue