parent
359909318b
commit
39ad7c0955
6 changed files with 382 additions and 6 deletions
@ -0,0 +1,95 @@ |
||||
# ---- User-tunable settings -------------------------------------------------

# Host C++ compiler and CUDA compiler driver.
CC := g++-10
NVCC := nvcc

# Host compile flags. -MMD asks the compiler to write a .d dependency file
# next to every object; those files are pulled in by the -include lines below.
CFLAGS := -Wall -std=c++17 -fopenmp -MMD
# NVCC compile flags. A trailing "-Xcompiler" used to sit here; that option
# takes an argument, so it silently swallowed whichever flag followed
# $(NVCC_FLAGS) in the recipe. It is dropped so -DCUDA reaches nvcc directly.
NVCC_FLAGS := -MMD -w

# Include directories, library directories and library names (no -I/-L/-l
# prefixes; those are added further down).
INCLUDE := ../../
LIBS_DIR :=
LIBS_DIR_GPU := /usr/local/cuda/lib64
LIBS :=
LIBS_GPU := cuda cudart cublas

# Executables are named $(TARGET)CPU and $(TARGET)GPU.
TARGET = arrayKernel
SRC_DIR = .
BUILD_DIR = build
|
||||
|
||||
# Should not need to modify below.

# One object directory per build flavour.
CPU_BUILD_DIR := $(BUILD_DIR)/cpu
GPU_BUILD_DIR := $(BUILD_DIR)/gpu

# Sources directly in $(SRC_DIR) and one directory level below it.
# Simple (:=) assignment so the globbing runs once, not on every reference.
SRC := $(wildcard $(SRC_DIR)/*/*.cpp) $(wildcard $(SRC_DIR)/*.cpp)

# Get source files and object files.
# *.cu.cpp files are the NVCC sources; every other .cpp is a GCC source.
GCC_SRC := $(filter-out %.cu.cpp,$(SRC))
NVCC_SRC := $(filter %.cu.cpp,$(SRC))
# Object names relative to a build directory (x.cpp -> x.o, y.cu.cpp -> y.cu.o).
GCC_OBJ := $(GCC_SRC:$(SRC_DIR)/%.cpp=%.o)
NVCC_OBJ := $(NVCC_SRC:$(SRC_DIR)/%.cpp=%.o)

# If compiling for CPU, all go to GCC. Otherwise, they are split.
CPU_OBJ := $(addprefix $(CPU_BUILD_DIR)/,$(GCC_OBJ)) $(addprefix $(CPU_BUILD_DIR)/,$(NVCC_OBJ))
GPU_GCC_OBJ := $(addprefix $(GPU_BUILD_DIR)/,$(GCC_OBJ))
GPU_NVCC_OBJ := $(addprefix $(GPU_BUILD_DIR)/,$(NVCC_OBJ))
|
||||
|
||||
# $(info $$GCC_SRC is [${GCC_SRC}])
|
||||
# $(info $$NVCC_SRC is [${NVCC_SRC}])
|
||||
# $(info $$GCC_OBJ is [${GCC_OBJ}])
|
||||
# $(info $$NVCC_OBJ is [${NVCC_OBJ}])
|
||||
|
||||
# $(info $$CPU_OBJ is [${CPU_OBJ}])
|
||||
# $(info $$GPU_GCC_OBJ is [${GPU_GCC_OBJ}])
|
||||
# $(info $$GPU_NVCC_OBJ is [${GPU_NVCC_OBJ}])
|
||||
|
||||
# Headers directly in $(SRC_DIR) and one level below it.
HEADER = $(wildcard $(SRC_DIR)/*/*.h) $(wildcard $(SRC_DIR)/*.h)

# Compiler-generated dependency files, one per object. They are derived from
# the object lists rather than globbed, so objects built from sources in
# sub-directories are covered as well.
CPU_DEPS = $(CPU_OBJ:.o=.d)
GPU_DEPS = $(GPU_GCC_OBJ:.o=.d) $(GPU_NVCC_OBJ:.o=.d)

# Turn the plain directory/library names from the settings into flags.
INC := $(INCLUDE:%=-I%)
LIB := $(LIBS_DIR:%=-L%)
LIB_GPU := $(LIBS_DIR_GPU:%=-L%)
LD := $(LIBS:%=-l%)
LD_GPU := $(LIBS_GPU:%=-l%)
|
||||
|
||||
# Reminder:
|
||||
# $< = first prerequisite
|
||||
# $@ = the target which matched the rule
|
||||
# $^ = all prerequisites
|
||||
|
||||
# Convenience goals. None of them names a real file, so all are phony;
# otherwise a stray file called "cpu" or "gpu" would disable the goal.
.PHONY: all cpu gpu clean

all: cpu gpu

cpu: $(TARGET)CPU
gpu: $(TARGET)GPU
||||
|
||||
# Link the CPU executable from all objects. $(LD) carries the -l flags built
# from LIBS (the GPU link rule uses it the same way); $(LDFLAGS) is kept so
# extra linker flags can still be supplied from the command line.
$(TARGET)CPU: $(CPU_OBJ)
	$(CC) $(CFLAGS) $^ -o $@ $(INC) $(LIB) $(LD) $(LDFLAGS)
|
||||
|
||||
# CPU build: every .cpp, including *.cu.cpp (stem "name.cu"), is compiled by
# $(CC). A single target pattern is enough; listing %.cu.o as a second target
# made this a grouped rule that claimed to produce a file the recipe never
# writes. The mkdir covers objects whose source lives in a sub-directory.
$(CPU_BUILD_DIR)/%.o: $(SRC_DIR)/%.cpp | $(CPU_BUILD_DIR)
	@mkdir -p $(@D)
	$(CC) $(CFLAGS) -c -o $@ $< $(INC)
|
||||
|
||||
# For GPU, we need to build the NVCC objects, the NVCC linked object, and the
# regular ones. Then, we link them all together.
# Every object handed to the linker is a declared prerequisite, so the recipe
# can use $^; the order (NVCC objects, link.o, GCC objects) is unchanged.
$(TARGET)GPU: $(GPU_NVCC_OBJ) $(GPU_BUILD_DIR)/link.o $(GPU_GCC_OBJ) | $(GPU_BUILD_DIR)
	$(CC) -g -DCUDA $(CFLAGS) $^ -o $@ $(INC) $(LIB) $(LIB_GPU) $(LD) $(LD_GPU)
|
||||
|
||||
# Device-link step: nvcc combines all NVCC-compiled objects into link.o.
# NOTE(review): with no *.cu.cpp sources the prerequisite list is empty and
# nvcc is invoked without inputs - confirm whether that case must be supported.
$(GPU_BUILD_DIR)/link.o: $(GPU_NVCC_OBJ) | $(GPU_BUILD_DIR)
	$(NVCC) --device-link $^ -o $@
|
||||
|
||||
# GPU build, NVCC sources: compile name.cu.cpp as CUDA (-x cu) with -DCUDA
# defined, producing an object suitable for the separate device-link step.
# The mkdir covers sources that live in a sub-directory of $(SRC_DIR).
$(GPU_BUILD_DIR)/%.cu.o: $(SRC_DIR)/%.cu.cpp | $(GPU_BUILD_DIR)
	@mkdir -p $(@D)
	$(NVCC) $(NVCC_FLAGS) -DCUDA -x cu --device-c -o $@ $< $(INC)
|
||||
|
||||
# GPU build, plain sources: compiled by $(CC) with -DCUDA defined.
# The mkdir covers sources that live in a sub-directory of $(SRC_DIR).
$(GPU_BUILD_DIR)/%.o: $(SRC_DIR)/%.cpp | $(GPU_BUILD_DIR)
	@mkdir -p $(@D)
	$(CC) $(CFLAGS) -g -DCUDA -c -o $@ $< $(INC)
|
||||
|
||||
# Pull in the dependency files; the leading "-" keeps a fresh tree, which has
# none yet, from being an error.
-include $(CPU_DEPS) $(GPU_DEPS)

# Build directories, created on demand (used as order-only prerequisites).
$(CPU_BUILD_DIR) $(GPU_BUILD_DIR):
	mkdir -p $@

# Remove the build tree and both executables.
clean:
	$(RM) -r $(BUILD_DIR) $(TARGET)CPU $(TARGET)GPU
|
@ -0,0 +1,34 @@ |
||||
#define CUDATOOLS_IMPLEMENTATION |
||||
#include <Core.h> |
||||
#include <Array.h> |
||||
|
||||
// Kernel that doubles, in place, the elements of arr visited by BASIC_LOOP
// over arr.shape().items() (iThread is presumably the index the macro provides).
// NOTE(review): arr is declared as a const reference yet is written to, and a
// reference parameter on a device kernel is unusual - confirm against the
// CudaTools DEFINE_KERNEL documentation whether it should be passed by value.
DEFINE_KERNEL(times2, const CudaTools::Array<int>& arr) {
    BASIC_LOOP(arr.shape().items()) {
        arr[iThread] *= 2;
    }
}
||||
|
||||
// Creates four small arrays, prints them, queues the times2 kernel on them,
// copies the data back to the host and prints the arrays again.
int main() {
    CudaTools::Array<int> arrRange = CudaTools::Array<int>::range(0, 10);
    CudaTools::Array<int> arrConst = CudaTools::Array<int>::constant(1);
    // The factory must be the double instantiation to match the declared type
    // (the original called Array<int>::linspace here).
    CudaTools::Array<double> arrLinspace = CudaTools::Array<double>::linspace(0, 5, 10);
    CudaTools::Array<int> arrComma({2, 2}); // 2x2 array.
    arrComma << 1, 2, 3, 4;                 // Comma initializer if needed.

    // The original was missing the "<<" before the final "\n".
    std::cout << arrRange << "\n" << arrConst << "\n" << arrLinspace << "\n" << arrComma << "\n";

    // Call the kernel multiple times asynchronously. Note: since they share same
    // stream, they are not run in parallel, just queued on the device.
    // Each launch now receives the array its launch size was computed from;
    // the original passed arrRange to every launch.
    KERNEL(times2, CudaTools::Kernel::basic(arrRange.shape().items()), arrRange);
    KERNEL(times2, CudaTools::Kernel::basic(arrConst.shape().items()), arrConst);
    // NOTE(review): times2 takes Array<int>, so arrLinspace (Array<double>)
    // cannot be passed to it; this launch is left operating on arrRange as in
    // the original. A double kernel is needed to actually scale arrLinspace.
    KERNEL(times2, CudaTools::Kernel::basic(arrLinspace.shape().items()), arrRange).wait();
    KERNEL(times2, CudaTools::Kernel::basic(arrComma.shape().items()), arrComma).wait();

    arrRange.updateHost();
    arrConst.updateHost();
    arrLinspace.updateHost();
    arrComma.updateHost().wait(); // Only need to wait for the last one, since they have the same stream.

    std::cout << arrRange << "\n" << arrConst << "\n" << arrLinspace << "\n" << arrComma << "\n";
    return 0;
}
||||
|
||||
|
@ -0,0 +1,118 @@ |
||||
# ---- User-tunable settings -------------------------------------------------

# Host C++ compiler and CUDA compiler driver.
CC := g++-10
NVCC := nvcc

# Host compile flags. -MMD asks the compiler to write a .d dependency file
# next to every object; those files are pulled in by the -include lines below.
CFLAGS := -Wall -std=c++17 -fopenmp -MMD
# NVCC compile flags. A trailing "-Xcompiler" used to sit here; that option
# takes an argument, so it silently swallowed whichever flag followed
# $(NVCC_FLAGS) in the recipe. It is dropped so -DCUDA reaches nvcc directly.
NVCC_FLAGS := -MMD -w

# Include directories, library directories and library names (no -I/-L/-l
# prefixes; those are added further down).
INCLUDE := ../../
LIBS_DIR :=
LIBS_DIR_GPU := /usr/local/cuda/lib64
LIBS :=
LIBS_GPU := cuda cudart cublas

# Executables are named $(TARGET)CPU and $(TARGET)GPU.
TARGET = arrayFunctions
SRC_DIR = .
BUILD_DIR = build
|
||||
|
||||
# Should not need to modify below.
# (A copy of the example's C++ main() had been pasted here; it is not Makefile
# syntax and stopped make with "missing separator", so it has been removed.
# The code itself lives in the example's .cpp source.)

# One object directory per build flavour.
CPU_BUILD_DIR := $(BUILD_DIR)/cpu
GPU_BUILD_DIR := $(BUILD_DIR)/gpu

# Sources directly in $(SRC_DIR) and one directory level below it.
# Simple (:=) assignment so the globbing runs once, not on every reference.
SRC := $(wildcard $(SRC_DIR)/*/*.cpp) $(wildcard $(SRC_DIR)/*.cpp)

# Get source files and object files.
# *.cu.cpp files are the NVCC sources; every other .cpp is a GCC source.
GCC_SRC := $(filter-out %.cu.cpp,$(SRC))
NVCC_SRC := $(filter %.cu.cpp,$(SRC))
# Object names relative to a build directory (x.cpp -> x.o, y.cu.cpp -> y.cu.o).
GCC_OBJ := $(GCC_SRC:$(SRC_DIR)/%.cpp=%.o)
NVCC_OBJ := $(NVCC_SRC:$(SRC_DIR)/%.cpp=%.o)

# If compiling for CPU, all go to GCC. Otherwise, they are split.
CPU_OBJ := $(addprefix $(CPU_BUILD_DIR)/,$(GCC_OBJ)) $(addprefix $(CPU_BUILD_DIR)/,$(NVCC_OBJ))
GPU_GCC_OBJ := $(addprefix $(GPU_BUILD_DIR)/,$(GCC_OBJ))
GPU_NVCC_OBJ := $(addprefix $(GPU_BUILD_DIR)/,$(NVCC_OBJ))
|
||||
|
||||
# $(info $$GCC_SRC is [${GCC_SRC}])
|
||||
# $(info $$NVCC_SRC is [${NVCC_SRC}])
|
||||
# $(info $$GCC_OBJ is [${GCC_OBJ}])
|
||||
# $(info $$NVCC_OBJ is [${NVCC_OBJ}])
|
||||
|
||||
# $(info $$CPU_OBJ is [${CPU_OBJ}])
|
||||
# $(info $$GPU_GCC_OBJ is [${GPU_GCC_OBJ}])
|
||||
# $(info $$GPU_NVCC_OBJ is [${GPU_NVCC_OBJ}])
|
||||
|
||||
# Headers directly in $(SRC_DIR) and one level below it.
HEADER = $(wildcard $(SRC_DIR)/*/*.h) $(wildcard $(SRC_DIR)/*.h)

# Compiler-generated dependency files, one per object. They are derived from
# the object lists rather than globbed, so objects built from sources in
# sub-directories are covered as well.
CPU_DEPS = $(CPU_OBJ:.o=.d)
GPU_DEPS = $(GPU_GCC_OBJ:.o=.d) $(GPU_NVCC_OBJ:.o=.d)

# Turn the plain directory/library names from the settings into flags.
INC := $(INCLUDE:%=-I%)
LIB := $(LIBS_DIR:%=-L%)
LIB_GPU := $(LIBS_DIR_GPU:%=-L%)
LD := $(LIBS:%=-l%)
LD_GPU := $(LIBS_GPU:%=-l%)
|
||||
|
||||
# Reminder:
|
||||
# $< = first prerequisite
|
||||
# $@ = the target which matched the rule
|
||||
# $^ = all prerequisites
|
||||
|
||||
# Convenience goals. None of them names a real file, so all are phony;
# otherwise a stray file called "cpu" or "gpu" would disable the goal.
.PHONY: all cpu gpu clean

all: cpu gpu

cpu: $(TARGET)CPU
gpu: $(TARGET)GPU
||||
|
||||
# Link the CPU executable from all objects. $(LD) carries the -l flags built
# from LIBS (the GPU link rule uses it the same way); $(LDFLAGS) is kept so
# extra linker flags can still be supplied from the command line.
$(TARGET)CPU: $(CPU_OBJ)
	$(CC) $(CFLAGS) $^ -o $@ $(INC) $(LIB) $(LD) $(LDFLAGS)
|
||||
|
||||
# CPU build: every .cpp, including *.cu.cpp (stem "name.cu"), is compiled by
# $(CC). A single target pattern is enough; listing %.cu.o as a second target
# made this a grouped rule that claimed to produce a file the recipe never
# writes. The mkdir covers objects whose source lives in a sub-directory.
$(CPU_BUILD_DIR)/%.o: $(SRC_DIR)/%.cpp | $(CPU_BUILD_DIR)
	@mkdir -p $(@D)
	$(CC) $(CFLAGS) -c -o $@ $< $(INC)
|
||||
|
||||
# For GPU, we need to build the NVCC objects, the NVCC linked object, and the
# regular ones. Then, we link them all together.
# Every object handed to the linker is a declared prerequisite, so the recipe
# can use $^; the order (NVCC objects, link.o, GCC objects) is unchanged.
$(TARGET)GPU: $(GPU_NVCC_OBJ) $(GPU_BUILD_DIR)/link.o $(GPU_GCC_OBJ) | $(GPU_BUILD_DIR)
	$(CC) -g -DCUDA $(CFLAGS) $^ -o $@ $(INC) $(LIB) $(LIB_GPU) $(LD) $(LD_GPU)
|
||||
|
||||
# Device-link step: nvcc combines all NVCC-compiled objects into link.o.
# NOTE(review): with no *.cu.cpp sources the prerequisite list is empty and
# nvcc is invoked without inputs - confirm whether that case must be supported.
$(GPU_BUILD_DIR)/link.o: $(GPU_NVCC_OBJ) | $(GPU_BUILD_DIR)
	$(NVCC) --device-link $^ -o $@
|
||||
|
||||
# GPU build, NVCC sources: compile name.cu.cpp as CUDA (-x cu) with -DCUDA
# defined, producing an object suitable for the separate device-link step.
# The mkdir covers sources that live in a sub-directory of $(SRC_DIR).
$(GPU_BUILD_DIR)/%.cu.o: $(SRC_DIR)/%.cu.cpp | $(GPU_BUILD_DIR)
	@mkdir -p $(@D)
	$(NVCC) $(NVCC_FLAGS) -DCUDA -x cu --device-c -o $@ $< $(INC)
|
||||
|
||||
# GPU build, plain sources: compiled by $(CC) with -DCUDA defined.
# The mkdir covers sources that live in a sub-directory of $(SRC_DIR).
$(GPU_BUILD_DIR)/%.o: $(SRC_DIR)/%.cpp | $(GPU_BUILD_DIR)
	@mkdir -p $(@D)
	$(CC) $(CFLAGS) -g -DCUDA -c -o $@ $< $(INC)
|
||||
|
||||
# Pull in the dependency files; the leading "-" keeps a fresh tree, which has
# none yet, from being an error.
-include $(CPU_DEPS) $(GPU_DEPS)

# Build directories, created on demand (used as order-only prerequisites).
$(CPU_BUILD_DIR) $(GPU_BUILD_DIR):
	mkdir -p $@

# Remove the build tree and both executables.
clean:
	$(RM) -r $(BUILD_DIR) $(TARGET)CPU $(TARGET)GPU
|
@ -0,0 +1,30 @@ |
||||
#define CUDATOOLS_IMPLEMENTATION |
||||
#include <Core.h> |
||||
#include <Array.h> |
||||
|
||||
// Walks through basic Array operations: reshaping, two indexing styles,
// slicing, assigning into a slice, copying, and iterating.
int main() {
    CudaTools::Array<int> arr = CudaTools::Array<int>::constant(0);
    arr.reshape({4, 5, 5}); // Creates a three dimensional array.

    arr[0][0][0] = 1;     // Axis by axis indexing.
    arr[{1, 0, 0}] = 100; // Specific 'coordinate' indexing.
    std::cout << arr << "\n";

    CudaTools::Array<int> arrRange = CudaTools::Array<int>::range(18);
    // The original statement ended in '.' instead of ';', which does not compile.
    // NOTE(review): if slice bounds are half-open this slice is 1x3x3 = 9
    // elements while arrRange holds 18 - confirm the sizes are meant to match.
    auto arrSlice = arr.slice({{1, 2}, {1, 4}, {1, 4}}); // Takes a slice of the center.
    std::cout << "Before Copy:\n" << arrSlice << "\n";
    arrSlice = arrRange; // Copies arrRange into arrSlice. (Does NOT replace!)
    std::cout << "After Copy:\n" << arrSlice << "\n";

    std::cout << "Modified: \n" << arr << "\n"; // The original array is modified, since a slice does not copy.

    CudaTools::Array<int> newArr = arr.copy(); // Copies the original Array.
    for (auto it = newArr.begin(); it != newArr.end(); ++it) { // Iterate through the array.
        *it = 1;
    }
    std::cout << "Modified New Array:\n" << newArr << "\n";
    std::cout << "Old Array:\n" << arr << "\n"; // The original array was not modified after a copy.
    return 0;
}
||||
|
||||
|
Loading…
Reference in new issue