From 31916ed752cc8d70eb0eb302549a1662f3b9426b Mon Sep 17 00:00:00 2001
From: Kenneth Jao <ksjao2@illinois.edu>
Date: Fri, 26 May 2023 03:01:52 -0500
Subject: [PATCH] Updated documentation samples with modifications

---
 Array.h                           |  12 ++-
 Core.h                            | 144 +++++++++++++-----------------
 Macros.h                          |  20 +----
 docs/source/core.rst              |  48 +++++++---
 docs/source/usage.rst             |  94 ++++++++++++++++---
 samples/1_CoreKernel/Makefile     |   2 +-
 samples/1_CoreKernel/main.cu.cpp  |   7 +-
 samples/2_CoreClass/Makefile      |   2 +-
 samples/2_CoreClass/main.cu.cpp   |  36 ++++----
 samples/3_ArrayKernel/Makefile    |   2 +-
 samples/3_ArrayKernel/main.cu.cpp |  17 ++--
 samples/4_ArrayFunctions/Makefile |   2 +-
 samples/5_SimpleGraph/Makefile    |  95 ++++++++++++++++++++
 samples/5_SimpleGraph/main.cu.cpp | 106 ++++++++++++++++++++++
 tests.cu.cpp                      |  35 ++++----
 15 files changed, 442 insertions(+), 180 deletions(-)
 create mode 100644 samples/5_SimpleGraph/Makefile
 create mode 100644 samples/5_SimpleGraph/main.cu.cpp
diff --git a/Array.h b/Array.h
index 3386d2f..4a98369 100644
--- a/Array.h
+++ b/Array.h
@@ -556,7 +556,7 @@ template <typename T> class Array {
         }
 #ifndef DEVICE
         if (pDevice != nullptr) {
-            CudaTools::deviceCopy(pDevice, arr.dataDevice(), mShape.items() * sizeof(T)).wait();
+            CudaTools::copy(pDevice, arr.dataDevice(), mShape.items() * sizeof(T)).wait();
         }
 #endif
         return arr;
@@ -739,9 +739,8 @@ template <typename T> class Array {
      * \brief Host only
      */
     StreamID updateHost(const StreamID& stream = DEF_MEM_STREAM) const {
-        CT_ERROR(mIsView, "Cannot update host on a view");
-        CudaTools::pull(pHost, pDevice, mShape.items() * sizeof(T), stream);
-        return stream;
+        CT_ERROR(mIsSlice, "Cannot update host copy on a slice");
+        return CudaTools::copy(pDevice, pHost, mShape.items() * sizeof(T), stream);
     };
 
     /**
@@ -749,9 +748,8 @@ template <typename T> class Array {
      * \brief Host only
      */
     StreamID updateDevice(const StreamID& stream = DEF_MEM_STREAM) const {
-        CT_ERROR(mIsView, "Cannot update device on a view");
-        CudaTools::push(pHost, pDevice, mShape.items() * sizeof(T), stream);
-        return stream;
+        CT_ERROR(mIsSlice, "Cannot update device copy on a slice");
+        return CudaTools::copy(pHost, pDevice, mShape.items() * sizeof(T), stream);
     };
 };
 
diff --git a/Core.h b/Core.h
index 76bca0e..82df51b 100644
--- a/Core.h
+++ b/Core.h
@@ -44,30 +44,24 @@ static const StreamID DEF_KERNEL_STREAM = StreamID{"defaultKernel"};
 void* malloc(const size_t size);
 
 /**
- * Pins memory on the host.
+ * Frees memory on the device.
  */
-void pin(void* const pHost, const size_t size);
+void free(void* const pDevice);
 
 /**
- * Pushes memory from the device to the host.
- */
-StreamID push(void* const pHost, void* const pDevice, const size_t size,
-              const StreamID& stream = DEF_MEM_STREAM);
-/**
- * Pulls memory from the device back to the host.
+ * Copies memory from the source pointer to the dest pointer.
  */
-StreamID pull(void* const pHost, void* const pDevice, const size_t size,
+StreamID copy(void* const source, void* const dest, const size_t size,
               const StreamID& stream = DEF_MEM_STREAM);
 /**
- * Copies memory on the device to another location on the device.
+ * Initializes or sets device memory to a value.
  */
-StreamID deviceCopy(void* const pSrc, void* const pDest, const size_t size,
-                    const StreamID& stream = DEF_MEM_STREAM);
-
+StreamID memset(void* const pDevice, int value, const size_t size,
+                const StreamID& stream = DEF_MEM_STREAM);
 /**
- * Frees memory on the device.
+ * Pins memory on the host.
  */
-void free(void* const pDevice);
+void pin(void* const pHost, const size_t size);
 
 #ifdef CUDACC
 cudaDeviceProp getDeviceProp();
@@ -181,12 +175,12 @@ class Shape {
     HD uint32_t axes() const;  /**< Gets the number of axes. */
     HD uint32_t items() const; /**< Gets the total number of items. */
 
-    HD uint32_t length() const; /**< For 1D shapes, gets the length. In general, gets the dimension
-                                   of the last axis. */
+    HD uint32_t length() const; /**< For 1D shapes, gets the length. In general, gets the
+                                   dimension of the last axis. */
     HD uint32_t rows() const;   /**< For 2D shapes, gets the number of rows. In general, gets the
                                    dimension of the second to last axis. */
-    HD uint32_t cols() const;   /**< For 2D shapes, gets the number of columns. In general, gets the
-                                   dimension of the second to last axis. */
+    HD uint32_t cols() const;   /**< For 2D shapes, gets the number of columns. In general, gets
+                                   the dimension of the second to last axis. */
 
     HD uint32_t
     dim(const uint32_t axis) const; /**< Gets the dimension size of the specified axis. */
@@ -227,46 +221,6 @@ template <typename F, typename... Args> struct FuncHolder {
     };
 };
 
-/**
- * Accessory struct to deal with host callbacks for CUDA Graphs in a nice fashion.
- */
-struct GraphTools {
-    std::vector<void*> mHostData;
-    std::vector<Event*> mEvents;
-
-    ~GraphTools();
-
-    /**
-     * Within a function that is being stream captured, launch a host function that can
-     * be captured into the graph.
-     */
-
-    template <typename F, typename... Args>
-    void launchHostFunction(const StreamID& stream, F func, Args&&... args) {
-#ifdef CUDACC
-        FuncHolder<F, Args...>* fh = new FuncHolder<F, Args...>(func, args...);
-        mHostData.push_back((void*)fh);
-        cudaHostFn_t run_func = fh->run;
-        CUDA_CHECK(cudaLaunchHostFunc(Manager::get()->stream(stream), run_func, fh));
-#else
-        func(args...);
-#endif
-    }
-
-    /**
-     * Makes a new branch in the graph to be run in parallel by a new stream.
-     * \param orig_stream the original stream to branch from.
-     * \param branch_stream the stream of the new branch.
-     */
-    void makeBranch(const StreamID& orig_stream, const StreamID& branch_stream);
-    /**
-     * Joins a existing branch in the graph to collapse a parallel block.
-     * \param orig_stream the original stream to join the branch to.
-     * \param branch_stream the stream of the branch to join.
-     */
-    void joinBranch(const StreamID& orig_stream, const StreamID& branch_stream);
-};
-
 /**
  * A class that manages CUDA Graphs.
  */
@@ -319,6 +273,46 @@ template <typename F, typename... Args> class Graph {
     }
 };
 
+/**
+ * A struct to facilitate other CUDA Graphs functionality like creating branches and host callbacks.
+ */
+struct GraphManager {
+    std::vector<void*> mHostData;
+    std::vector<Event*> mEvents;
+
+    ~GraphManager();
+
+    /**
+     * Within a function that is being stream captured, launch a host function that can
+     * be captured into the graph.
+     */
+
+    template <typename F, typename... Args>
+    void launchHostFunction(const StreamID& stream, F func, Args&&... args) {
+#ifdef CUDACC
+        FuncHolder<F, Args...>* fh = new FuncHolder<F, Args...>(func, args...);
+        mHostData.push_back((void*)fh);
+        cudaHostFn_t run_func = fh->run;
+        CUDA_CHECK(cudaLaunchHostFunc(Manager::get()->stream(stream), run_func, fh));
+#else
+        func(args...);
+#endif
+    }
+
+    /**
+     * Makes a new branch in the graph to be run in parallel by a new stream.
+     * \param orig_stream the original stream to branch from.
+     * \param branch_stream the stream of the new branch.
+     */
+    void makeBranch(const StreamID& orig_stream, const StreamID& branch_stream);
+    /**
+     * Joins a existing branch in the graph to collapse a parallel block.
+     * \param orig_stream the original stream to join the branch to.
+     * \param branch_stream the stream of the branch to join.
+     */
+    void joinBranch(const StreamID& orig_stream, const StreamID& branch_stream);
+};
+
 }; // namespace CudaTools
 
 #ifdef CUDATOOLS_IMPLEMENTATION
@@ -357,27 +351,17 @@ void free(void* const pDevice) {
 #endif
 }
 
-StreamID push(void* const pHost, void* const pDevice, const size_t size, const StreamID& stream) {
-#ifdef CUDACC
-    CUDA_CHECK(cudaMemcpyAsync(pDevice, pHost, size, cudaMemcpyHostToDevice,
-                               Manager::get()->stream(stream)));
-#endif
-    return stream;
-}
-
-StreamID pull(void* const pHost, void* const pDevice, const size_t size, const StreamID& stream) {
+StreamID copy(void* const source, void* const dest, const size_t size, const StreamID& stream) {
 #ifdef CUDACC
-    CUDA_CHECK(cudaMemcpyAsync(pHost, pDevice, size, cudaMemcpyDeviceToHost,
-                               Manager::get()->stream(stream)));
+    CUDA_CHECK(
+        cudaMemcpyAsync(dest, source, size, cudaMemcpyDefault, Manager::get()->stream(stream)));
 #endif
     return stream;
 }
 
-StreamID deviceCopy(void* const pSrc, void* const pDest, const size_t size,
-                    const StreamID& stream) {
+StreamID memset(void* const pDevice, const int value, const size_t size, const StreamID& stream) {
 #ifdef CUDACC
-    CUDA_CHECK(cudaMemcpyAsync(pDest, pSrc, size, cudaMemcpyDeviceToDevice,
-                               Manager::get()->stream(stream)));
+    CUDA_CHECK(cudaMemsetAsync(pDevice, value, size, Manager::get()->stream(stream)));
 #endif
     return stream;
 }
@@ -661,11 +645,11 @@ void Event::record(const StreamID& stream) {
 #endif
 }
 
-////////////////////////
-// GraphTools Methods //
-////////////////////////
+//////////////////////////
+// GraphManager Methods //
+//////////////////////////
 
-GraphTools::~GraphTools() {
+GraphManager::~GraphManager() {
 #ifdef CUDACC
     for (void* func : mHostData) {
         delete func;
@@ -676,14 +660,14 @@ GraphTools::~GraphTools() {
 #endif
 }
 
-void GraphTools::makeBranch(const StreamID& orig_stream, const StreamID& branch_stream) {
+void GraphManager::makeBranch(const StreamID& orig_stream, const StreamID& branch_stream) {
     Event* event = new Event();
     event->record(orig_stream);
     mEvents.push_back(event);
     branch_stream.wait(*event);
 }
 
-void GraphTools::joinBranch(const StreamID& orig_stream, const StreamID& branch_stream) {
+void GraphManager::joinBranch(const StreamID& orig_stream, const StreamID& branch_stream) {
     Event* event = new Event();
     event->record(branch_stream);
     mEvents.push_back(event);
diff --git a/Macros.h b/Macros.h
index 9290f8f..1586404 100644
--- a/Macros.h
+++ b/Macros.h
@@ -46,22 +46,6 @@ using real64 = double; /**< Type alias for 64-bit floating point datatype. */
  */
 #define SHARED
 
-/**
- * \def DECLARE_KERNEL(call, ...)
- * Used to declare (in header) a CUDA kernel.
- * \param call the name of the kernel
- * \param ... the arguments of the kernel
- */
-#define DECLARE_KERNEL(call, ...)
-
-/**
- * \def DEFINE_KERNEL(call, ...)
- * Used to define (in implementation) a CUDA kernel.
- * \param call the name of the kernel
- * \param ... the arguments of the kernel
- */
-#define DEFINE_KERNEL(call, ...)
-
 /**
  * \def KERNEL(call, settings, ...)
  * Used to call a CUDA kernel.
@@ -178,11 +162,11 @@ using real64 = double; /**< Type alias for 64-bit floating point datatype. */
 #define UPDATE_FUNC(name)                                                                          \
     inline CudaTools::StreamID updateHost(const CudaTools::StreamID& stream =                      \
                                               CudaTools::DEF_MEM_STREAM) {                         \
-        return CudaTools::pull(this, that(), sizeof(name));                                        \
+        return CudaTools::copy(that(), this, sizeof(name));                                        \
     };                                                                                             \
     inline CudaTools::StreamID updateDevice(const CudaTools::StreamID& stream =                    \
                                                 CudaTools::DEF_MEM_STREAM) {                       \
-        return CudaTools::push(this, that(), sizeof(name));                                        \
+        return CudaTools::copy(this, that(), sizeof(name));                                        \
     }
 
 #ifdef CUDA
diff --git a/docs/source/core.rst b/docs/source/core.rst
index f698e4d..47f7aba 100644
--- a/docs/source/core.rst
+++ b/docs/source/core.rst
@@ -2,8 +2,9 @@
 Core.h
 ======
 
-The ``Core.h`` header file defines some useful types and some macros along with
-a few core classes.
+The ``Core.h`` header file defines some useful types and some macros functions
+to faciliate the dual CPU-CUDA compilation targets. Additionally, it introduces
+several classes to enable the usage of CUDA streams, kernels, and graphs.
 
 Types
 =====
@@ -33,10 +34,6 @@ Compilation Options
 Macro Functions
 ===============
 
-Kernel
-------
-.. doxygendefine:: DECLARE_KERNEL
-.. doxygendefine:: DEFINE_KERNEL
 .. doxygendefine:: KERNEL
 
 Device Helpers
@@ -50,7 +47,21 @@ Device Class
 .. doxygendefine:: DEVICE_CLASS
 
 
-Classes and Structs
+Memory Functions
+================
+
+.. doxygenfunction:: CudaTools::malloc
+
+.. doxygenfunction:: CudaTools::free
+
+.. doxygenfunction:: CudaTools::copy
+
+.. doxygenfunction:: CudaTools::memset
+
+.. doxygenfunction:: CudaTools::pin
+
+
+Streams and Handles
 ===================
 
 CudaTools::StreamID
@@ -64,12 +75,29 @@ CudaTools::Manager
 .. doxygenclass:: CudaTools::Manager
     :members:
 
+Kernels
+=======
+
+.. doxygenfunction:: CudaTools::Kernel::launch
+
+.. doxygenfunction:: CudaTools::Kernel::basic
+
 CudaTools::Kernel::Settings
 ---------------------------
 
 .. doxygenstruct:: CudaTools::Kernel::Settings
     :members:
 
-CudaTools::Kernel::Basic
-------------------------
-.. doxygenfunction:: CudaTools::Kernel::basic
+               
+Graphs
+======
+
+CudaTools::Graph
+----------------
+.. doxygenclass:: CudaTools::Graph
+    :members:
+
+CudaTools::GraphManager
+-----------------------
+.. doxygenstruct:: CudaTools::GraphManager
+    :members:
diff --git a/docs/source/usage.rst b/docs/source/usage.rst
index 372e557..7d8b0e7 100644
--- a/docs/source/usage.rst
+++ b/docs/source/usage.rst
@@ -33,12 +33,12 @@ macros provided. For example,
 
 .. code-block:: cpp
 
-    DEFINE_KERNEL(add, int x, int y) {
+    KERNEL(add, int x, int y) {
         printf("Kernel: %i\n", x + y);
     }
 
     int main() {
-        KERNEL(add, CudaTools::Kernel::basic(1), 1, 1); // Prints 2.
+        CudaTools::Kernel::launch(add, CudaTools::Kernel::basic(1), 1, 1); // Prints 2.
         return 0;
     }
 
@@ -79,13 +79,13 @@ being device-compatible. We follow the previous example in a similar fashion.
             };
     };
 
-    DEFINE_KERNEL(swap, intPair* const pair) { pair->swap(); }
+    KERNEL(swap, intPair* const pair) { pair->swap(); }
 
     int main() {
         intPair pair(1, 2);
         printf("Before: %u, %u\n", pair.x, pair.y); // Prints 1, 2.
 
-        KERNEL(swap, CudaTools::Kernel::basic(1), pair.that()).wait();
+        CudaTools::Kernel::launch(swap, CudaTools::Kernel::basic(1), pair.that()).wait();
         pair.updateHost().wait(); // Copies the memory from the device back to the host and waits until finished.
 
         printf("After: %u, %u\n", pair.x, pair.y); // Prints 2, 1.
@@ -129,7 +129,7 @@ compiled for CPU, then everything will run synchronously, as per usual.
 
 Array Examples
 ==============
-This file introduces the ``Array`` class, which is a class that provides automatic
+The ``Array.h`` file introduces the ``Array`` class, which is a class that provides automatic
 memory management between device and host. In particular, it provides functionality on
 both the host and device while handling proper memory destruction, with many nice
 features. In particular it supports mimics many features of the Python package NumPy.`
@@ -137,12 +137,12 @@ We can demonstrate a few here.
 
 .. code-block:: cpp
 
-    DEFINE_KERNEL(times2, const CudaTools::Array<int> arr) {
+    KERNEL(times2, const CudaTools::Array<int> arr) {
         CudaTools::Array<int> flat = arr.flattened();
         BASIC_LOOP(arr.shape().items()) { flat[iThread] *= 2; }
     }
 
-    DEFINE_KERNEL(times2double, const CudaTools::Array<double> arr) {
+    KERNEL(times2double, const CudaTools::Array<double> arr) {
         CudaTools::Array<double> flat = arr.flattened();
         BASIC_LOOP(arr.shape().items()) { flat[iThread] *= 2; }
     }
@@ -165,10 +165,10 @@ We can demonstrate a few here.
         // Call the kernel multiple times asynchronously. Note: since they share same
         // stream, they are not run in parallel, just queued on the device.
         // NOTE: Notice that a view is passed into the kernel, not the Array itself.
-        KERNEL(times2, CudaTools::Kernel::basic(arrRange.shape().items()), arrRange.view());
-        KERNEL(times2, CudaTools::Kernel::basic(arrConst.shape().items()), arrConst.view());
-        KERNEL(times2double, CudaTools::Kernel::basic(arrLinspace.shape().items()), arrLinspace.view());
-        KERNEL(times2, CudaTools::Kernel::basic(arrComma.shape().items()), arrComma.view()).wait();
+        CudaTools::Kernel::launch(times2, CudaTools::Kernel::basic(arrRange.shape().items()), arrRange.view());
+        CudaTools::Kernel::launch(times2, CudaTools::Kernel::basic(arrConst.shape().items()), arrConst.view());
+        CudaTools::Kernel::launch(times2double, CudaTools::Kernel::basic(arrLinspace.shape().items()), arrLinspace.view());
+        CudaTools::Kernel::launch(times2, CudaTools::Kernel::basic(arrComma.shape().items()), arrComma.view()).wait();
         arrRange.updateHost();
         arrConst.updateHost();
         arrLinspace.updateHost();
@@ -239,14 +239,80 @@ view manually with the ``.view()`` function.
    programmer to manage this.
 
 
+Graph Examples
+==============
+Additionally, there is support for CUDA Graphs, a way of defining a series of kernel
+launches and executing later, potentially reducing overhead and timing, as well as
+control the specific parallel workflow between CPU and GPU. The following
+snippet illustrates this
+
+.. code-block:: cpp
+
+    void myGraph(CudaTools::GraphManager* gm, const CudaTools::Array<uint32_t> A,
+                 const CudaTools::Array<uint32_t> B) {
+        A.updateDevice("graphStream");
+        gm->makeBranch("graphStream", "graphStreamBranch");
+        B.updateDevice("graphStreamBranch");
+        for (uint32_t iTimes = 0; iTimes < 30; ++iTimes) {
+            CudaTools::Kernel::launch(
+                collatz, CudaTools::Kernel::basic(A.shape().items(), "graphStream"), A.view());
+            CudaTools::Kernel::launch(
+                plusOne, CudaTools::Kernel::basic(A.shape().items(), "graphStreamBranch"), B.view());
+        }
+
+        gm->joinBranch("graphStream", "graphStreamBranch");
+        CudaTools::Kernel::launch(addArray, CudaTools::Kernel::basic(A.shape().items(), "graphStream"),
+                                  A.view(), B.view());
+        A.updateHost("graphStream");
+        B.updateHost("graphStream");
+        gm->launchHostFunction("graphStream", addNum, A.view(), 5);
+    }
+
+    int main() {
+        CudaTools::Manager::get()->addStream("graphStream");
+        CudaTools::Manager::get()->addStream("graphStreamBranch");
+
+        CudaTools::Array<uint32_t> A = CudaTools::Array<uint32_t>::constant({100}, 50);
+        CudaTools::Array<uint32_t> B = CudaTools::Array<uint32_t>::constant({100}, 0);
+
+        CudaTools::GraphManager gm;
+        CudaTools::Graph graph("graphStream", myGraph, &gm, A.view(), B.view());
+        TIME(graph.execute().wait(), ExecuteGraph);
+
+        std::cout << A.slice({{0, 10}}) << "\n";
+        return 0;
+    }
+
+We first create two new streams to be used in the graph, which define the different parallel
+streams used. To use CUDA Graphs in CudaTools, we expect the graph to be created from a function, which
+should be written as if it will be executed. Note that we do not need to use ``.wait()`` here, since the function
+is intended to be captured into the graph. The capture process is done on the creation of the graph, with
+the name of the origin stream, the function name, and the arguments of the function. Afterwards,
+simply run ``graph.execute()`` to execute the captured graph. On CPU, it will simply run the function.
+
+To access the other functionality like graph branching an capturing host functions, it is
+necessary to use the ``CudaTools::GraphManager`` class, which stores a variety of necessary variables
+that need to be kept during the lifetime of the graph execution. **Currently, launching host functions sometimes alters the correct blocking of the stream, in particular with copying. It is not yet known if this is an issue with the library or a technicality within CUDA Graphs itself that needs some special care to resolve.** To read more about the syntax, see :ref:`here <CudaTools::GraphManager>`.
+
+.. warning::
+
+   A graph capture essentially 'freezes' the variables used in the capture, like
+   function arguments. As a result, the programmer must take care that the variables
+   are well-defined. This is especially relevant to variables on the heap, where you need
+   to make sure the variable is not a copy. Potentially, always using pointers could work,
+   but is not always necessary. Likely ``.view()`` should always be used when dealing with
+   ``CudaTools::Array`` objects.
+
+
 BLAS Examples
 =============
 
 
 Compilation and Linking
 =======================
-To compile with this library, there are only a few things necessary.
-First, it is recommended you use the provided template ``Makefile``, which can be
+To compile with this library, there are only a few things necessary. First, this library depends on
+`Eigen 3.4.0+ <https://eigen.tuxfamily.org/index.php?title=Main_Page>`__, and must be
+compiled with C++17. Next, it is recommended you use the provided template ``Makefile``, which can be
 easily modified to suit your project needs. It already default handles the compilation
 and linking with ``nvcc``, so long as you fulfill a few requirements.
 
@@ -283,7 +349,7 @@ file for the first example:
     CC := g++-10
     NVCC := nvcc
     CFLAGS := -Wall -std=c++17 -fopenmp -MMD
-    NVCC_FLAGS := -MMD -w -Xcompiler
+    NVCC_FLAGS := -MMD -std=c++17 -w -Xcompiler
 
     INCLUDE := ../../
     LIBS_DIR :=
diff --git a/samples/1_CoreKernel/Makefile b/samples/1_CoreKernel/Makefile
index 09436ea..6f7f454 100644
--- a/samples/1_CoreKernel/Makefile
+++ b/samples/1_CoreKernel/Makefile
@@ -1,7 +1,7 @@
 CC := g++-10
 NVCC := nvcc
 CFLAGS := -Wall -std=c++17 -fopenmp -MMD
-NVCC_FLAGS := -MMD -w -Xcompiler
+NVCC_FLAGS := -MMD -std=c++17 -w -Xcompiler
 
 INCLUDE := ../../
 LIBS_DIR :=
diff --git a/samples/1_CoreKernel/main.cu.cpp b/samples/1_CoreKernel/main.cu.cpp
index e8e8519..c225482 100644
--- a/samples/1_CoreKernel/main.cu.cpp
+++ b/samples/1_CoreKernel/main.cu.cpp
@@ -1,12 +1,9 @@
 #define CUDATOOLS_IMPLEMENTATION
 #include <Core.h>
 
-DEFINE_KERNEL(add, int x, int y) {
-    printf("Kernel: %i\n", x + y);
-}
+KERNEL(add, int x, int y) { printf("Kernel: %i\n", x + y); }
 
 int main() {
-    KERNEL(add, CudaTools::Kernel::basic(1), 1, 1); // Prints 2.
+    CudaTools::Kernel::launch(add, CudaTools::Kernel::basic(1), 1, 1); // Prints 2.
     return 0;
 }
-
diff --git a/samples/2_CoreClass/Makefile b/samples/2_CoreClass/Makefile
index f0c9f94..431fb52 100644
--- a/samples/2_CoreClass/Makefile
+++ b/samples/2_CoreClass/Makefile
@@ -1,7 +1,7 @@
 CC := g++-10
 NVCC := nvcc
 CFLAGS := -Wall -std=c++17 -fopenmp -MMD
-NVCC_FLAGS := -MMD -w -Xcompiler
+NVCC_FLAGS := -MMD -std=c++17 -w -Xcompiler
 
 INCLUDE := ../../
 LIBS_DIR :=
diff --git a/samples/2_CoreClass/main.cu.cpp b/samples/2_CoreClass/main.cu.cpp
index 99e588c..e1b0c10 100644
--- a/samples/2_CoreClass/main.cu.cpp
+++ b/samples/2_CoreClass/main.cu.cpp
@@ -3,32 +3,32 @@
 
 class intPair {
     DEVICE_CLASS(intPair)
-    public:
-        int x, y;
-
-        intPair(const int x_, const int y_) : x(x_), y(y_) {
-            allocateDevice(); // Allocates memory for this intPair on the device.
-            updateDevice().wait(); // Copies the memory on the host to the device and waits until finished.
-        };
-
-        HD void swap() {
-            int swap = x;
-            x = y;
-            y = swap;
-        };
+  public:
+    int x, y;
+
+    intPair(const int x_, const int y_) : x(x_), y(y_) {
+        allocateDevice(); // Allocates memory for this intPair on the device.
+        updateDevice()
+            .wait(); // Copies the memory on the host to the device and waits until finished.
+    };
+
+    HD void swap() {
+        int swap = x;
+        x = y;
+        y = swap;
+    };
 };
 
-DEFINE_KERNEL(swap, intPair* const pair) { pair->swap(); }
+KERNEL(swap, intPair* const pair) { pair->swap(); }
 
 int main() {
     intPair pair(1, 2);
     printf("Before: %u, %u\n", pair.x, pair.y); // Prints 1, 2.
 
-    KERNEL(swap, CudaTools::Kernel::basic(1), pair.that()).wait();
-    pair.updateHost().wait(); // Copies the memory from the device back to the host and waits until finished.
+    CudaTools::Kernel::launch(swap, CudaTools::Kernel::basic(1), pair.that()).wait();
+    pair.updateHost()
+        .wait(); // Copies the memory from the device back to the host and waits until finished.
 
     printf("After: %u, %u\n", pair.x, pair.y); // Prints 2, 1.
     return 0;
 }
-
-
diff --git a/samples/3_ArrayKernel/Makefile b/samples/3_ArrayKernel/Makefile
index 633490a..8ae9bcf 100644
--- a/samples/3_ArrayKernel/Makefile
+++ b/samples/3_ArrayKernel/Makefile
@@ -1,7 +1,7 @@
 CC := g++-10
 NVCC := nvcc
 CFLAGS := -Wall -std=c++17 -fopenmp -MMD
-NVCC_FLAGS := -MMD -w -Xcompiler
+NVCC_FLAGS := -MMD -std=c++17 -w -Xcompiler
 
 INCLUDE := ../../
 LIBS_DIR :=
diff --git a/samples/3_ArrayKernel/main.cu.cpp b/samples/3_ArrayKernel/main.cu.cpp
index 05f5a3c..6cc75bb 100644
--- a/samples/3_ArrayKernel/main.cu.cpp
+++ b/samples/3_ArrayKernel/main.cu.cpp
@@ -2,12 +2,12 @@
 #include <Array.h>
 #include <Core.h>
 
-DEFINE_KERNEL(times2, const CudaTools::Array<int> arr) {
+KERNEL(times2, const CudaTools::Array<int> arr) {
     CudaTools::Array<int> flat = arr.flattened();
     BASIC_LOOP(arr.shape().items()) { flat[iThread] *= 2; }
 }
 
-DEFINE_KERNEL(times2double, const CudaTools::Array<double> arr) {
+KERNEL(times2double, const CudaTools::Array<double> arr) {
     CudaTools::Array<double> flat = arr.flattened();
     BASIC_LOOP(arr.shape().items()) { flat[iThread] *= 2; }
 }
@@ -30,10 +30,15 @@ int main() {
     // Call the kernel multiple times asynchronously. Note: since they share same
     // stream, they are not run in parallel, just queued on the device.
     // NOTE: Notice that a view is passed into the kernel, not the Array itself.
-    KERNEL(times2, CudaTools::Kernel::basic(arrRange.shape().items()), arrRange.view());
-    KERNEL(times2, CudaTools::Kernel::basic(arrConst.shape().items()), arrConst.view());
-    KERNEL(times2double, CudaTools::Kernel::basic(arrLinspace.shape().items()), arrLinspace.view());
-    KERNEL(times2, CudaTools::Kernel::basic(arrComma.shape().items()), arrComma.view()).wait();
+    CudaTools::Kernel::launch(times2, CudaTools::Kernel::basic(arrRange.shape().items()),
+                              arrRange.view());
+    CudaTools::Kernel::launch(times2, CudaTools::Kernel::basic(arrConst.shape().items()),
+                              arrConst.view());
+    CudaTools::Kernel::launch(times2double, CudaTools::Kernel::basic(arrLinspace.shape().items()),
+                              arrLinspace.view());
+    CudaTools::Kernel::launch(times2, CudaTools::Kernel::basic(arrComma.shape().items()),
+                              arrComma.view())
+        .wait();
     arrRange.updateHost();
     arrConst.updateHost();
     arrLinspace.updateHost();
diff --git a/samples/4_ArrayFunctions/Makefile b/samples/4_ArrayFunctions/Makefile
index 501a3ac..3ac8ce3 100644
--- a/samples/4_ArrayFunctions/Makefile
+++ b/samples/4_ArrayFunctions/Makefile
@@ -1,7 +1,7 @@
 CC := g++-10
 NVCC := nvcc
 CFLAGS := -Wall -std=c++17 -fopenmp -MMD
-NVCC_FLAGS := -MMD -w -Xcompiler
+NVCC_FLAGS := -MMD -std=c++17 -w -Xcompiler
 
 INCLUDE := ../../
 LIBS_DIR :=
diff --git a/samples/5_SimpleGraph/Makefile b/samples/5_SimpleGraph/Makefile
new file mode 100644
index 0000000..528c55b
--- /dev/null
+++ b/samples/5_SimpleGraph/Makefile
@@ -0,0 +1,95 @@
+CC := g++-10
+NVCC := nvcc
+CFLAGS := -Wall -std=c++17 -fopenmp -MMD
+NVCC_FLAGS := -MMD -std=c++17 -w -Xcompiler
+
+INCLUDE := ../../
+LIBS_DIR :=
+LIBS_DIR_GPU := /usr/local/cuda/lib64
+LIBS :=
+LIBS_GPU := cuda cudart cublas
+
+TARGET = simpleGraph
+SRC_DIR = .
+BUILD_DIR = build
+
+# Should not need to modify below.
+
+CPU_BUILD_DIR = $(BUILD_DIR)/cpu
+GPU_BUILD_DIR = $(BUILD_DIR)/gpu
+
+SRC = $(wildcard $(SRC_DIR)/*/*.cpp) $(wildcard $(SRC_DIR)/*.cpp)
+
+# Get source files and object files.
+GCC_SRC = $(filter-out %.cu.cpp ,$(SRC))
+NVCC_SRC = $(filter %.cu.cpp, $(SRC))
+GCC_OBJ = $(GCC_SRC:$(SRC_DIR)/%.cpp=%.o)
+NVCC_OBJ = $(NVCC_SRC:$(SRC_DIR)/%.cpp=%.o)
+
+# If compiling for CPU, all go to GCC. Otherwise, they are split.
+CPU_OBJ = $(addprefix $(CPU_BUILD_DIR)/,$(GCC_OBJ)) $(addprefix $(CPU_BUILD_DIR)/,$(NVCC_OBJ))
+GPU_GCC_OBJ = $(addprefix $(GPU_BUILD_DIR)/,$(GCC_OBJ))
+GPU_NVCC_OBJ = $(addprefix $(GPU_BUILD_DIR)/,$(NVCC_OBJ))
+
+# $(info $$GCC_SRC is [${GCC_SRC}])
+# $(info $$NVCC_SRC is [${NVCC_SRC}])
+# $(info $$GCC_OBJ is [${GCC_OBJ}])
+# $(info $$NVCC_OBJ is [${NVCC_OBJ}])
+
+# $(info $$CPU_OBJ is [${CPU_OBJ}])
+# $(info $$GPU_GCC_OBJ is [${GPU_GCC_OBJ}])
+# $(info $$GPU_NVCC_OBJ is [${GPU_NVCC_OBJ}])
+
+HEADER = $(wildcard $(SRC_DIR)/*/*.h) $(wildcard $(SRC_DIR)/*.h)
+CPU_DEPS = $(wildcard $(CPU_BUILD_DIR)/*.d)
+GPU_DEPS = $(wildcard $(GPU_BUILD_DIR)/*.d)
+
+INC := $(INCLUDE:%=-I%)
+LIB := $(LIBS_DIR:%=-L%)
+LIB_GPU := $(LIBS_DIR_GPU:%=-L%)
+LD := $(LIBS:%=-l%)
+LD_GPU := $(LIBS_GPU:%=-l%)
+
+# Reminder:
+# $< = first prerequisite
+# $@ = the target which matched the rule
+# $^ = all prerequisites
+
+.PHONY: all clean
+
+all : cpu gpu
+
+cpu: $(TARGET)CPU
+gpu: $(TARGET)GPU
+
+$(TARGET)CPU: $(CPU_OBJ)
+	$(CC) $(CFLAGS) $^ -o $@ $(INC) $(LIB) $(LDFLAGS)
+
+$(CPU_BUILD_DIR)/%.o $(CPU_BUILD_DIR)/%.cu.o: $(SRC_DIR)/%.cpp | $(CPU_BUILD_DIR)
+	$(CC) $(CFLAGS) -c -o $@ $< $(INC)
+
+# For GPU, we need to build the NVCC objects, the NVCC linked object, and the
+# regular ones. Then, we link them all together.
+$(TARGET)GPU: $(GPU_BUILD_DIR)/link.o $(GPU_GCC_OBJ) | $(GPU_BUILD_DIR)
+	$(CC) -g -DCUDA $(CFLAGS) $(GPU_NVCC_OBJ) $^ -o $@ $(INC) $(LIB) $(LIB_GPU) $(LD) $(LD_GPU)
+
+$(GPU_BUILD_DIR)/link.o: $(GPU_NVCC_OBJ) | $(GPU_BUILD_DIR)
+	$(NVCC) --device-link $^ -o $@
+
+$(GPU_BUILD_DIR)/%.cu.o: $(SRC_DIR)/%.cu.cpp | $(GPU_BUILD_DIR)
+	$(NVCC) $(NVCC_FLAGS) -DCUDA -x cu --device-c -o $@ $< $(INC)
+
+$(GPU_BUILD_DIR)/%.o: $(SRC_DIR)/%.cpp | $(GPU_BUILD_DIR)
+	$(CC) $(CFLAGS) -g -DCUDA -c -o $@ $< $(INC)
+
+-include $(CPU_DEPS)
+-include $(GPU_DEPS)
+
+$(CPU_BUILD_DIR):
+	mkdir -p $@
+
+$(GPU_BUILD_DIR):
+	mkdir -p $@
+
+clean:
+	rm -Rf $(BUILD_DIR) $(TARGET)CPU $(TARGET)GPU
diff --git a/samples/5_SimpleGraph/main.cu.cpp b/samples/5_SimpleGraph/main.cu.cpp
new file mode 100644
index 0000000..f9b2afb
--- /dev/null
+++ b/samples/5_SimpleGraph/main.cu.cpp
@@ -0,0 +1,106 @@
+#define CUDATOOLS_IMPLEMENTATION
+#include <Array.h>
+#include <Core.h>
+#include <chrono>
+
+#define TIME_START(name) auto begin_##name = std::chrono::steady_clock::now()
+
+#define TIME_END(name)                                                                             \
+    auto end_##name = std::chrono::steady_clock::now();                                            \
+    auto time_ms_##name =                                                                          \
+        std::chrono::duration_cast<std::chrono::milliseconds>(end_##name - begin_##name).count();  \
+    auto time_mus_##name =                                                                         \
+        std::chrono::duration_cast<std::chrono::microseconds>(end_##name - begin_##name).count();  \
+    if (time_ms_##name == 0) {                                                                     \
+        printf("[%s] Time Elapsed: %ld[µs]\n", #name, time_mus_##name);                            \
+    } else {                                                                                       \
+        printf("[%s] Time Elapsed: %ld[ms]\n", #name, time_ms_##name);                             \
+    }
+
+#define TIME(call, name)                                                                           \
+    TIME_START(name);                                                                              \
+    call;                                                                                          \
+    TIME_END(name);
+
+KERNEL(collatz, const CudaTools::Array<uint32_t> arr) {
+    BASIC_LOOP(arr.shape().length()) {
+        if (arr[iThread] % 2) {
+            arr[iThread] = 3 * arr[iThread] + 1;
+        } else {
+            arr[iThread] = arr[iThread] >> 1;
+        }
+    }
+}
+
+KERNEL(plusOne, const CudaTools::Array<uint32_t> arr) {
+    BASIC_LOOP(arr.shape().length()) { arr[iThread] += 1; }
+}
+
+KERNEL(addArray, const CudaTools::Array<uint32_t> a, const CudaTools::Array<uint32_t> b) {
+    BASIC_LOOP(a.shape().length()) { a[iThread] += b[iThread]; }
+}
+
+void addNum(const CudaTools::Array<uint32_t> A, uint32_t num) {
+    auto Aeig = A.atLeast2D().eigenMap();
+    Aeig = Aeig.array() + num;
+}
+
+void doFunc(const CudaTools::Array<uint32_t> A, const CudaTools::Array<uint32_t> B) {
+    A.updateDevice("graphStream").wait();
+    B.updateDevice("graphStreamBranch").wait();
+    for (uint32_t iTimes = 0; iTimes < 30; ++iTimes) {
+        CudaTools::Kernel::launch(
+            collatz, CudaTools::Kernel::basic(A.shape().items(), "graphStream"), A.view());
+        CudaTools::Kernel::launch(
+            plusOne, CudaTools::Kernel::basic(A.shape().items(), "graphStreamBranch"), B.view());
+    }
+
+    CudaTools::Kernel::launch(addArray, CudaTools::Kernel::basic(A.shape().items(), "graphStream"),
+                              A.view(), B.view())
+        .wait();
+    A.updateHost("graphStream");
+    B.updateHost("graphStream").wait();
+    addNum(A.view(), 5);
+}
+
+void myGraph(CudaTools::GraphManager* gm, const CudaTools::Array<uint32_t> A,
+             const CudaTools::Array<uint32_t> B) {
+    A.updateDevice("graphStream");
+    gm->makeBranch("graphStream", "graphStreamBranch");
+    B.updateDevice("graphStreamBranch");
+    for (uint32_t iTimes = 0; iTimes < 30; ++iTimes) {
+        CudaTools::Kernel::launch(
+            collatz, CudaTools::Kernel::basic(A.shape().items(), "graphStream"), A.view());
+        CudaTools::Kernel::launch(
+            plusOne, CudaTools::Kernel::basic(A.shape().items(), "graphStreamBranch"), B.view());
+    }
+
+    gm->joinBranch("graphStream", "graphStreamBranch");
+    CudaTools::Kernel::launch(addArray, CudaTools::Kernel::basic(A.shape().items(), "graphStream"),
+                              A.view(), B.view());
+    A.updateHost("graphStream");
+    B.updateHost("graphStream");
+    gm->launchHostFunction("graphStream", addNum, A.view(), 5);
+}
+
+int main() {
+    CudaTools::Manager::get()->addStream("graphStream");
+    CudaTools::Manager::get()->addStream("graphStreamBranch");
+
+    CudaTools::Array<uint32_t> A = CudaTools::Array<uint32_t>::constant({100}, 50);
+    CudaTools::Array<uint32_t> B = CudaTools::Array<uint32_t>::constant({100}, 0);
+
+    TIME(doFunc(A.view(), B.view()), ExecuteNoGraph);
+
+    std::cout << A.slice({{0, 10}}) << "\n";
+
+    A.setConstant(50);
+    B.setConstant(0);
+
+    CudaTools::GraphManager gm;
+    CudaTools::Graph graph("graphStream", myGraph, &gm, A.view(), B.view());
+    TIME(graph.execute().wait(), ExecuteGraph);
+
+    std::cout << A.slice({{0, 10}}) << "\n";
+    return 0;
+}
diff --git a/tests.cu.cpp b/tests.cu.cpp
index 0d9532e..c224bd6 100644
--- a/tests.cu.cpp
+++ b/tests.cu.cpp
@@ -117,7 +117,7 @@ KERNEL(plusOne, const CT::Array<uint32_t> arr) {
     BASIC_LOOP(arr.shape().length()) { arr[iThread] += 1; }
 }
 
-KERNEL(addBoth, const CT::Array<uint32_t> a, const CT::Array<uint32_t> b) {
+KERNEL(addArray, const CT::Array<uint32_t> a, const CT::Array<uint32_t> b) {
     BASIC_LOOP(a.shape().length()) { a[iThread] += b[iThread]; }
 }
 
@@ -491,29 +491,28 @@ template <typename T> uint32_t doBLASTests() {
     return failed;
 }
 
-void myHostFunc(const CT::Array<uint32_t> A, uint32_t num) {
+void addNum(const CT::Array<uint32_t> A, uint32_t num) {
     auto Aeig = A.atLeast2D().eigenMap();
     Aeig = Aeig.array() + num;
 }
 
-void myBasicGraph(CT::GraphTools* tools, CT::Array<uint32_t>* A, CT::Array<uint32_t>* B) {
+void myGraph(CT::GraphManager* gm, const CT::Array<uint32_t> A, const CT::Array<uint32_t> B) {
     // tools->launchHostFunction("graphStream", myHostFunc, A->view(), 5);
-    A->updateDevice("graphStream");
-    tools->makeBranch("graphStream", "graphStreamBranch");
-    B->updateDevice("graphStreamBranch");
+    A.updateDevice("graphStream");
+    gm->makeBranch("graphStream", "graphStreamBranch");
+    B.updateDevice("graphStreamBranch");
     for (uint32_t iTimes = 0; iTimes < 30; ++iTimes) {
-        CT::Kernel::launch(collatz, CT::Kernel::basic(A->shape().items(), "graphStream"),
-                           A->view());
-        CT::Kernel::launch(plusOne, CT::Kernel::basic(A->shape().items(), "graphStreamBranch"),
-                           B->view());
+        CT::Kernel::launch(collatz, CT::Kernel::basic(A.shape().items(), "graphStream"), A.view());
+        CT::Kernel::launch(plusOne, CT::Kernel::basic(A.shape().items(), "graphStreamBranch"),
+                           B.view());
     }
 
-    tools->joinBranch("graphStream", "graphStreamBranch");
-    CT::Kernel::launch(addBoth, CT::Kernel::basic(A->shape().items(), "graphStream"), A->view(),
-                       B->view());
-    A->updateHost("graphStream");
-    B->updateHost("graphStream");
-    tools->launchHostFunction("graphStream", myHostFunc, A->view(), 5);
+    gm->joinBranch("graphStream", "graphStreamBranch");
+    CT::Kernel::launch(addArray, CT::Kernel::basic(A.shape().items(), "graphStream"), A.view(),
+                       B.view());
+    A.updateHost("graphStream");
+    B.updateHost("graphStream");
+    gm->launchHostFunction("graphStream", addNum, A.view(), 5);
 }
 
 uint32_t doGraphTest() {
@@ -523,8 +522,8 @@ uint32_t doGraphTest() {
     CT::Manager::get()->addStream("graphStream");
     CT::Manager::get()->addStream("graphStreamBranch");
 
-    CT::GraphTools tools;
-    CT::Graph graph("graphStream", myBasicGraph, &tools, &A, &B);
+    CT::GraphManager gm;
+    CT::Graph graph("graphStream", myGraph, &gm, A.view(), B.view());
     graph.execute().wait();
 
     uint32_t errors = 0;