diff --git a/docs/source/conf.py b/docs/source/conf.py index afce172..d28509a 100644 --- a/docs/source/conf.py +++ b/docs/source/conf.py @@ -2,9 +2,9 @@ # -- Project information -project = 'DGEMS' -copyright = '2022' -author = 'Kenneth Jao, Qi Jian Lim' +project = 'CudaTools' +copyright = '2023' +author = 'Kenneth Jao' release = '0.1' version = '0.1.0' diff --git a/docs/source/usage.rst b/docs/source/usage.rst index 60807a6..13cb6cc 100644 --- a/docs/source/usage.rst +++ b/docs/source/usage.rst @@ -55,7 +55,7 @@ see :ref:`here `. and make it available to other files. Since many applications used classes, a macro is provided to 'convert' a class into -being device-compatible. Following the previous example similarly, +being device-compatible. We follow the previous example in a similar fashion. .. code-block:: cpp @@ -69,6 +69,8 @@ being device-compatible. Following the previous example similarly, updateDevice().wait(); // Copies the memory on the host to the device and waits until finished. }; + ~intPair() { CudaTools::free(that()); }; + HD void swap() { int swap = x; x = y; @@ -91,8 +93,16 @@ being device-compatible. Following the previous example similarly, In this example, we create a class called ``intPair``, which is then made available on the device through the ``DEVICE_CLASS(name)`` macro. Specifically, that macro introduces a few functions, like -``allocateDevice()``, ``updateDevice()``, ``updateHost()``, and ``that()``. That last function -returns a pointer to the copy on the device. For more details, see :ref:`here `. If we were to pass in the host pointer of the ``intPair`` to the kernel, there would be a illegal memory access. +``allocateDevice()``, ``updateDevice()``, ``updateHost()``, and ``that()``. The ``that()`` function +returns a pointer to the copy on the device. As a result, the programmer **must** define a destructor +that frees the pointer using ``CudaTools::free(that)``. For more details, see :ref:`here `. + +.. 
warning:: + The ``updateDevice()`` and ``updateHost()`` in most cases will need to be explicitly called + to push the data on the host to the device, and vice-versa. It is the programmer's job to maintain + where the 'most recent' copy is. If these are not called, various memory errors can occur. Note that, + when passing a pointer to the kernel, it must be the *device* pointer. Otherwise, an illegal memory + access would occur. The kernel argument list should **must** consist of pointers to objects, or a non-reference object. Otherwise, compilation will fail. In general this is safer, as it forces the programmer to @@ -118,6 +128,95 @@ compiled for CPU, then everything will run synchronously, as per usual. Array Examples ============== +This file introduces the ``Array`` class, which provides automatic +memory management between device and host. In particular, it provides functionality on +both the host and device while handling proper memory destruction, with many nice +features. It also mimics many features of the Python package NumPy. +We can demonstrate a few here. + +.. code-block:: cpp + + DEFINE_KERNEL(times2, const CudaTools::Array& arr) { + BASIC_LOOP(arr.shape().items()) { + arr[iThread] *= 2; + } + } + + int main() { + CudaTools::Array arrRange = CudaTools::Array::range(0, 10); + CudaTools::Array arrConst = CudaTools::Array::constant(1); + CudaTools::Array arrLinspace = CudaTools::Array::linspace(0, 5, 10); + CudaTools::Array arrComma({2, 2}); // 2x2 array. + arrComma << 1, 2, 3, 4; // Comma initializer if needed. + std::cout << arrRange << "\n" << arrConst << "\n" << arrLinspace << "\n" << arrComma << "\n"; + + // Call the kernel multiple times asynchronously. Note: since they share the same + // stream, they are not run in parallel, just queued on the device.
+ KERNEL(times2, CudaTools::Kernel::basic(arrRange.shape().items()), arrRange); + KERNEL(times2, CudaTools::Kernel::basic(arrConst.shape().items()), arrConst); + KERNEL(times2, CudaTools::Kernel::basic(arrLinspace.shape().items()), arrLinspace).wait(); + KERNEL(times2, CudaTools::Kernel::basic(arrComma.shape().items()), arrComma).wait(); + arrRange.updateHost(); + arrConst.updateHost(); + arrLinspace.updateHost(); + arrComma.updateHost().wait(); // Only need to wait for the last one, since they have the same stream. + + std::cout << arrRange << "\n" << arrConst << "\n" << arrLinspace << "\n" << arrComma << "\n"; + return 0; + } + +In this example, we show a few ways to initialize an ``Array`` through some static functions. +It is templated, so it can (theoretically) support any type. Additionally, you can initialize an +empty ``Array`` by providing its ``Shape`` with an initializer list (ex: ``{2, 2}``). For more details, +see :ref:`here `. + +We also note the use of ``BASIC_LOOP(N)``, which is a macro for generating the loop automatically +on the kernel given the number of threads. It is intended to be used only for "embarrassingly parallel" +situations and with the ``CudaTools::Kernel::basic()`` launch parameters. If compiling for CPU, it will +mark the loop with ``#pragma omp parallel for`` and attempt to use OpenMP for parallelism. + +The Array also supports other helpful functions, such as multi-dimensional indexing, slicing, and +more. + +.. code-block:: cpp + + int main() { + CudaTools::Array arr = CudaTools::Array::constant(0); + arr.reshape({4, 5, 5}); // Creates a three dimensional array. + + arr[0][0][0] = 1; // Axis by axis indexing. + arr[{1, 0, 0}] = 100; // Specific 'coordinate' indexing. + std::cout << arr << "\n"; + + CudaTools::Array arrRange = CudaTools::Array::range(18); + auto arrSlice = arr.slice({{1, 2}, {1, 4}, {1, 4}}); // Takes a slice of the center.
+ std::cout << "Before Copy:\n" << arrSlice << "\n"; + arrSlice = arrRange; // Copies arrRange into arrSlice. (Does NOT replace!) + std::cout << "After Copy:\n" << arrSlice << "\n"; + + std::cout << "Modified: \n" << arr << "\n"; // The original array is modified, since a slice does not copy. + + CudaTools::Array newArr = arr.copy(); // Copies the original Array. + for (auto it = newArr.begin(); it != newArr.end(); ++it) { // Iterate through the array. + *it = 1; + } + std::cout << "Modified New Array:\n" << newArr << "\n"; + std::cout << "Old Array:\n" << arr << "\n"; // The original array was not modified after a copy. + return 0; + } + +In this example, we demonstrate some of the functionality of the Array. We can do +multi-dimensional indexing, take slices of the Array, and iterate through the Array with an +iterator, in C++ fashion. Particularly, we need to introduce the concept of a "view" of an Array. +An Array either "owns" its data or is a "view" of another Array. You can create a +view manually with the ``.view()`` function. + +.. warning:: + When using the assignment operator, if a view is on the left-hand side, it will + perform a copy of the internal data. However, if the Array is an owner, then it will replace + the entire Array, and **free the old memory**. This means any view of that previous + array will now point to invalid places in memory. It is the responsibility of the + programmer to manage this. BLAS Examples diff --git a/samples/3_ArrayKernel/Makefile b/samples/3_ArrayKernel/Makefile new file mode 100644 index 0000000..633490a --- /dev/null +++ b/samples/3_ArrayKernel/Makefile @@ -0,0 +1,95 @@ +CC := g++-10 +NVCC := nvcc +CFLAGS := -Wall -std=c++17 -fopenmp -MMD +NVCC_FLAGS := -MMD -w -Xcompiler + +INCLUDE := ../../ +LIBS_DIR := +LIBS_DIR_GPU := /usr/local/cuda/lib64 +LIBS := +LIBS_GPU := cuda cudart cublas + +TARGET = arrayKernel +SRC_DIR = . +BUILD_DIR = build + +# Should not need to modify below.
+ +CPU_BUILD_DIR = $(BUILD_DIR)/cpu +GPU_BUILD_DIR = $(BUILD_DIR)/gpu + +SRC = $(wildcard $(SRC_DIR)/*/*.cpp) $(wildcard $(SRC_DIR)/*.cpp) + +# Get source files and object files. +GCC_SRC = $(filter-out %.cu.cpp ,$(SRC)) +NVCC_SRC = $(filter %.cu.cpp, $(SRC)) +GCC_OBJ = $(GCC_SRC:$(SRC_DIR)/%.cpp=%.o) +NVCC_OBJ = $(NVCC_SRC:$(SRC_DIR)/%.cpp=%.o) + +# If compiling for CPU, all go to GCC. Otherwise, they are split. +CPU_OBJ = $(addprefix $(CPU_BUILD_DIR)/,$(GCC_OBJ)) $(addprefix $(CPU_BUILD_DIR)/,$(NVCC_OBJ)) +GPU_GCC_OBJ = $(addprefix $(GPU_BUILD_DIR)/,$(GCC_OBJ)) +GPU_NVCC_OBJ = $(addprefix $(GPU_BUILD_DIR)/,$(NVCC_OBJ)) + +# $(info $$GCC_SRC is [${GCC_SRC}]) +# $(info $$NVCC_SRC is [${NVCC_SRC}]) +# $(info $$GCC_OBJ is [${GCC_OBJ}]) +# $(info $$NVCC_OBJ is [${NVCC_OBJ}]) + +# $(info $$CPU_OBJ is [${CPU_OBJ}]) +# $(info $$GPU_GCC_OBJ is [${GPU_GCC_OBJ}]) +# $(info $$GPU_NVCC_OBJ is [${GPU_NVCC_OBJ}]) + +HEADER = $(wildcard $(SRC_DIR)/*/*.h) $(wildcard $(SRC_DIR)/*.h) +CPU_DEPS = $(wildcard $(CPU_BUILD_DIR)/*.d) +GPU_DEPS = $(wildcard $(GPU_BUILD_DIR)/*.d) + +INC := $(INCLUDE:%=-I%) +LIB := $(LIBS_DIR:%=-L%) +LIB_GPU := $(LIBS_DIR_GPU:%=-L%) +LD := $(LIBS:%=-l%) +LD_GPU := $(LIBS_GPU:%=-l%) + +# Reminder: +# $< = first prerequisite +# $@ = the target which matched the rule +# $^ = all prerequisites + +.PHONY: all clean + +all : cpu gpu + +cpu: $(TARGET)CPU +gpu: $(TARGET)GPU + +$(TARGET)CPU: $(CPU_OBJ) + $(CC) $(CFLAGS) $^ -o $@ $(INC) $(LIB) $(LDFLAGS) + +$(CPU_BUILD_DIR)/%.o $(CPU_BUILD_DIR)/%.cu.o: $(SRC_DIR)/%.cpp | $(CPU_BUILD_DIR) + $(CC) $(CFLAGS) -c -o $@ $< $(INC) + +# For GPU, we need to build the NVCC objects, the NVCC linked object, and the +# regular ones. Then, we link them all together. 
+$(TARGET)GPU: $(GPU_BUILD_DIR)/link.o $(GPU_GCC_OBJ) | $(GPU_BUILD_DIR) + $(CC) -g -DCUDA $(CFLAGS) $(GPU_NVCC_OBJ) $^ -o $@ $(INC) $(LIB) $(LIB_GPU) $(LD) $(LD_GPU) + +$(GPU_BUILD_DIR)/link.o: $(GPU_NVCC_OBJ) | $(GPU_BUILD_DIR) + $(NVCC) --device-link $^ -o $@ + +$(GPU_BUILD_DIR)/%.cu.o: $(SRC_DIR)/%.cu.cpp | $(GPU_BUILD_DIR) + $(NVCC) $(NVCC_FLAGS) -DCUDA -x cu --device-c -o $@ $< $(INC) + +$(GPU_BUILD_DIR)/%.o: $(SRC_DIR)/%.cpp | $(GPU_BUILD_DIR) + $(CC) $(CFLAGS) -g -DCUDA -c -o $@ $< $(INC) + +-include $(CPU_DEPS) +-include $(GPU_DEPS) + +$(CPU_BUILD_DIR): + mkdir -p $@ + +$(GPU_BUILD_DIR): + mkdir -p $@ + +clean: + rm -Rf $(BUILD_DIR) $(TARGET)CPU $(TARGET)GPU diff --git a/samples/3_ArrayKernel/main.cu.cpp b/samples/3_ArrayKernel/main.cu.cpp new file mode 100644 index 0000000..adc4ed0 --- /dev/null +++ b/samples/3_ArrayKernel/main.cu.cpp @@ -0,0 +1,34 @@ +#define CUDATOOLS_IMPLEMENTATION +#include +#include + +DEFINE_KERNEL(times2, const CudaTools::Array& arr) { + BASIC_LOOP(arr.shape().items()) { + arr[iThread] *= 2; + } +} + +int main() { + CudaTools::Array arrRange = CudaTools::Array::range(0, 10); + CudaTools::Array arrConst = CudaTools::Array::constant(1); + CudaTools::Array arrLinspace = CudaTools::Array::linspace(0, 5, 10); + CudaTools::Array arrComma({2, 2}); // 2x2 array. + arrComma << 1, 2, 3, 4; // Comma initializer if needed. + std::cout << arrRange << "\n" << arrConst << "\n" << arrLinspace << "\n" << arrComma << "\n"; + + // Call the kernel multiple times asynchronously. Note: since they share the same + // stream, they are not run in parallel, just queued on the device.
+ KERNEL(times2, CudaTools::Kernel::basic(arrRange.shape().items()), arrRange); + KERNEL(times2, CudaTools::Kernel::basic(arrConst.shape().items()), arrRange); + KERNEL(times2, CudaTools::Kernel::basic(arrLinspace.shape().items()), arrRange).wait(); + KERNEL(times2, CudaTools::Kernel::basic(arrComma.shape().items()), arrRange).wait(); + arrRange.updateHost(); + arrConst.updateHost(); + arrLinspace.updateHost(); + arrComma.updateHost().wait(); // Only need to wait for the last one, since they have the same stream. + + std::cout << arrRange << "\n" << arrConst << "\n" << arrLinspace << "\n" << arrComma "\n"; + return 0; +} + + diff --git a/samples/4_ArrayFunctions/Makefile b/samples/4_ArrayFunctions/Makefile new file mode 100644 index 0000000..d0486ce --- /dev/null +++ b/samples/4_ArrayFunctions/Makefile @@ -0,0 +1,118 @@ +CC := g++-10 +NVCC := nvcc +CFLAGS := -Wall -std=c++17 -fopenmp -MMD +NVCC_FLAGS := -MMD -w -Xcompiler + +INCLUDE := ../../ +LIBS_DIR := +LIBS_DIR_GPU := /usr/local/cuda/lib64 +LIBS := +LIBS_GPU := cuda cudart cublas + +TARGET = arrayFunctions +SRC_DIR = . +BUILD_DIR = build + +# Should not need to modify below. +int main() { + CudaTools::Array arr = CudaTools::Array::constant(0); + arr.reshape({4, 5, 5}); // Creates a three dimensional array. + + arr[0][0][0] = 1; // Axis by axis indexing. + arr[{1, 0, 0}] = 100; // Specific 'coordinate' indexing. + std::cout << arr << "\n"; + + CudaTools::Array arrRange = CudaTools::Array::range(18); + auto arrSlice = arr.slice({{1, 2}, {1, 4}, {1, 4}}). // Takes a slice of the center. + std::cout << "Before Copy:\n" << arrSlice << "\n"; + arrSlice = arrRange; // Copies arrRange into arrSlice. (Does NOT replace!) + std::cout << "After Copy:\n" << arrSlice << "\n"; + + std::cout << "Modified: \n" << arr << "\n"; // The original array is modified, since a slice does not copy. + + CudaTools::Array newArr = arr.copy(); // Copies the original Array. 
+ for (auto it = newArr.begin(); it != newArr.end(); ++it) { // Iterate through the array. + *it = 1; + } + std::cout << "Modified New Array:\n" << newArr << "\n"; + std::cout << "Old Array:\n" << arr << "\n"; // The original array was not modified after a copy. + return 0; +} +CPU_BUILD_DIR = $(BUILD_DIR)/cpu +GPU_BUILD_DIR = $(BUILD_DIR)/gpu + +SRC = $(wildcard $(SRC_DIR)/*/*.cpp) $(wildcard $(SRC_DIR)/*.cpp) + +# Get source files and object files. +GCC_SRC = $(filter-out %.cu.cpp ,$(SRC)) +NVCC_SRC = $(filter %.cu.cpp, $(SRC)) +GCC_OBJ = $(GCC_SRC:$(SRC_DIR)/%.cpp=%.o) +NVCC_OBJ = $(NVCC_SRC:$(SRC_DIR)/%.cpp=%.o) + +# If compiling for CPU, all go to GCC. Otherwise, they are split. +CPU_OBJ = $(addprefix $(CPU_BUILD_DIR)/,$(GCC_OBJ)) $(addprefix $(CPU_BUILD_DIR)/,$(NVCC_OBJ)) +GPU_GCC_OBJ = $(addprefix $(GPU_BUILD_DIR)/,$(GCC_OBJ)) +GPU_NVCC_OBJ = $(addprefix $(GPU_BUILD_DIR)/,$(NVCC_OBJ)) + +# $(info $$GCC_SRC is [${GCC_SRC}]) +# $(info $$NVCC_SRC is [${NVCC_SRC}]) +# $(info $$GCC_OBJ is [${GCC_OBJ}]) +# $(info $$NVCC_OBJ is [${NVCC_OBJ}]) + +# $(info $$CPU_OBJ is [${CPU_OBJ}]) +# $(info $$GPU_GCC_OBJ is [${GPU_GCC_OBJ}]) +# $(info $$GPU_NVCC_OBJ is [${GPU_NVCC_OBJ}]) + +HEADER = $(wildcard $(SRC_DIR)/*/*.h) $(wildcard $(SRC_DIR)/*.h) +CPU_DEPS = $(wildcard $(CPU_BUILD_DIR)/*.d) +GPU_DEPS = $(wildcard $(GPU_BUILD_DIR)/*.d) + +INC := $(INCLUDE:%=-I%) +LIB := $(LIBS_DIR:%=-L%) +LIB_GPU := $(LIBS_DIR_GPU:%=-L%) +LD := $(LIBS:%=-l%) +LD_GPU := $(LIBS_GPU:%=-l%) + +# Reminder: +# $< = first prerequisite +# $@ = the target which matched the rule +# $^ = all prerequisites + +.PHONY: all clean + +all : cpu gpu + +cpu: $(TARGET)CPU +gpu: $(TARGET)GPU + +$(TARGET)CPU: $(CPU_OBJ) + $(CC) $(CFLAGS) $^ -o $@ $(INC) $(LIB) $(LDFLAGS) + +$(CPU_BUILD_DIR)/%.o $(CPU_BUILD_DIR)/%.cu.o: $(SRC_DIR)/%.cpp | $(CPU_BUILD_DIR) + $(CC) $(CFLAGS) -c -o $@ $< $(INC) + +# For GPU, we need to build the NVCC objects, the NVCC linked object, and the +# regular ones. 
Then, we link them all together. +$(TARGET)GPU: $(GPU_BUILD_DIR)/link.o $(GPU_GCC_OBJ) | $(GPU_BUILD_DIR) + $(CC) -g -DCUDA $(CFLAGS) $(GPU_NVCC_OBJ) $^ -o $@ $(INC) $(LIB) $(LIB_GPU) $(LD) $(LD_GPU) + +$(GPU_BUILD_DIR)/link.o: $(GPU_NVCC_OBJ) | $(GPU_BUILD_DIR) + $(NVCC) --device-link $^ -o $@ + +$(GPU_BUILD_DIR)/%.cu.o: $(SRC_DIR)/%.cu.cpp | $(GPU_BUILD_DIR) + $(NVCC) $(NVCC_FLAGS) -DCUDA -x cu --device-c -o $@ $< $(INC) + +$(GPU_BUILD_DIR)/%.o: $(SRC_DIR)/%.cpp | $(GPU_BUILD_DIR) + $(CC) $(CFLAGS) -g -DCUDA -c -o $@ $< $(INC) + +-include $(CPU_DEPS) +-include $(GPU_DEPS) + +$(CPU_BUILD_DIR): + mkdir -p $@ + +$(GPU_BUILD_DIR): + mkdir -p $@ + +clean: + rm -Rf $(BUILD_DIR) $(TARGET)CPU $(TARGET)GPU diff --git a/samples/4_ArrayFunctions/main.cu.cpp b/samples/4_ArrayFunctions/main.cu.cpp new file mode 100644 index 0000000..3979f3c --- /dev/null +++ b/samples/4_ArrayFunctions/main.cu.cpp @@ -0,0 +1,30 @@ +#define CUDATOOLS_IMPLEMENTATION +#include +#include + +int main() { + CudaTools::Array arr = CudaTools::Array::constant(0); + arr.reshape({4, 5, 5}); // Creates a three dimensional array. + + arr[0][0][0] = 1; // Axis by axis indexing. + arr[{1, 0, 0}] = 100; // Specific 'coordinate' indexing. + std::cout << arr << "\n"; + + CudaTools::Array arrRange = CudaTools::Array::range(18); + auto arrSlice = arr.slice({{1, 2}, {1, 4}, {1, 4}}). // Takes a slice of the center. + std::cout << "Before Copy:\n" << arrSlice << "\n"; + arrSlice = arrRange; // Copies arrRange into arrSlice. (Does NOT replace!) + std::cout << "After Copy:\n" << arrSlice << "\n"; + + std::cout << "Modified: \n" << arr << "\n"; // The original array is modified, since a slice does not copy. + + CudaTools::Array newArr = arr.copy(); // Copies the original Array. + for (auto it = newArr.begin(); it != newArr.end(); ++it) { // Iterate through the array. 
+ *it = 1; + } + std::cout << "Modified New Array:\n" << newArr << "\n"; + std::cout << "Old Array:\n" << arr << "\n"; // The original array was not modified after a copy. + return 0; +} + +