18 changed files with 172 additions and 679 deletions
--- a/Array.h
+++ b/Array.h
@ -7,7 +7,6 @@
 #include <Eigen/Dense>
 #include <cmath>
 #include <complex>
 #include <cstdlib>
 #include <iomanip>
 #include <random>
 #include <type_traits>
@ -556,7 +555,7 @@ template <typename T> class Array {
        }
 #ifndef DEVICE
        if (pDevice != nullptr) {
-            CudaTools::copy(pDevice, arr.dataDevice(), mShape.items() * sizeof(T)).wait();
+            CudaTools::deviceCopy(pDevice, arr.dataDevice(), mShape.items() * sizeof(T)).wait();
        }
 #endif
        return arr;
@ -739,8 +738,9 @@ template <typename T> class Array {
     * \brief Host only
     */
    StreamID updateHost(const StreamID& stream = DEF_MEM_STREAM) const {
-        CT_ERROR(mIsSlice, "Cannot update host copy on a slice");
+        CT_ERROR(mIsView, "Cannot update host on a view");
-        return CudaTools::copy(pDevice, pHost, mShape.items() * sizeof(T), stream);
+        CudaTools::pull(pHost, pDevice, mShape.items() * sizeof(T), stream);
        return stream;
    };
    /**
@ -748,8 +748,9 @@ template <typename T> class Array {
     * \brief Host only
     */
    StreamID updateDevice(const StreamID& stream = DEF_MEM_STREAM) const {
-        CT_ERROR(mIsSlice, "Cannot update device copy on a slice");
+        CT_ERROR(mIsView, "Cannot update device on a view");
-        return CudaTools::copy(pHost, pDevice, mShape.items() * sizeof(T), stream);
+        CudaTools::push(pHost, pDevice, mShape.items() * sizeof(T), stream);
        return stream;
    };
 };
@ -787,16 +788,12 @@ void printAxis(std::ostream& out, const Array<T>& arr, const uint32_t axis, size
 template <typename T> std::ostream& operator<<(std::ostream& out, const Array<T>& arr) {
    size_t width = 0;
-    if constexpr (is_int<T>) {
+    if constexpr (is_num<T>) {
        T max_val = 0;
        bool negative = false;
        for (auto it = arr.begin(); it != arr.end(); ++it) {
-            T val = *it;
+            if (*it < 0) negative = true;
-            if (*it < 0) {
+            max_val = (abs(*it) > max_val) ? abs(*it) : max_val;
                negative = true;
                val *= -1;
            }
            max_val = (val > max_val) ? val : max_val;
        }
        width = std::to_string(max_val).size() + 1;
        width += (negative) ? 1 : 0;
--- a/BLAS.h
+++ b/BLAS.h
@ -235,7 +235,8 @@ StreamID GEMV(const T alpha, const Array<T>& A, const Array<T>& x, const T beta,
    uint32_t cols = A.shape().cols();
    T a = alpha, b = beta;
 #ifdef CUDA
-    CUBLAS_CHECK(cublasSetStream(Manager::get()->cublasHandle(), Manager::get()->stream(stream)));
+    CUBLAS_CHECK(
        cublasSetStream(Manager::get()->cublasHandle(), Manager::get()->stream(stream.id)));
    if (bi.size == 1) {
        invoke<T>(cublasSgemv, cublasDgemv, cublasCgemv, cublasZgemv,
                  Manager::get()->cublasHandle(), CUBLAS_OP_N, rows, cols, CAST(&a),
@ -281,7 +282,8 @@ StreamID GEMM(const T alpha, const Array<T>& A, const Array<T>& B, const T beta,
    T a = alpha, b = beta;
 #ifdef CUDA
-    CUBLAS_CHECK(cublasSetStream(Manager::get()->cublasHandle(), Manager::get()->stream(stream)));
+    CUBLAS_CHECK(
        cublasSetStream(Manager::get()->cublasHandle(), Manager::get()->stream(stream.id)));
    if (bi.size == 1) {
        invoke<T>(cublasSgemm, cublasDgemm, cublasCgemm, cublasZgemm,
                  Manager::get()->cublasHandle(), CUBLAS_OP_N, CUBLAS_OP_N, m, n, k, CAST(&a),
@ -336,7 +338,8 @@ StreamID DGMM(const Array<T>& A, const Array<T>& X, const Array<T>& C, const boo
    uint32_t m = C.shape().rows();
    uint32_t n = C.shape().cols();
    auto mode = (left) ? CUBLAS_SIDE_LEFT : CUBLAS_SIDE_RIGHT;
-    CUBLAS_CHECK(cublasSetStream(Manager::get()->cublasHandle(), Manager::get()->stream(stream)));
+    CUBLAS_CHECK(
        cublasSetStream(Manager::get()->cublasHandle(), Manager::get()->stream(stream.id)));
    invoke<T>(cublasSdgmm, cublasDdgmm, cublasCdgmm, cublasZdgmm, Manager::get()->cublasHandle(), m,
              n, CAST(A.dataDevice()), A.shape().rows(), CAST(X.dataDevice()), 1,
              CAST(C.dataDevice()), m);
@ -511,7 +514,7 @@ class PLUBatch : public Batch<T> {
 #ifdef CUDA
        uint32_t n = this->mShape.rows();
        CUBLAS_CHECK(
-            cublasSetStream(Manager::get()->cublasHandle(), Manager::get()->stream(stream)));
+            cublasSetStream(Manager::get()->cublasHandle(), Manager::get()->stream(stream.id)));
        invoke<T>(cublasSgetrfBatched, cublasDgetrfBatched, cublasCgetrfBatched,
                  cublasZgetrfBatched, Manager::get()->cublasHandle(), n,
                  DCAST(this->mBatch.dataDevice()), n, mPivotsBatch.dataDevice(),
@ -543,7 +546,7 @@ class PLUBatch : public Batch<T> {
        uint32_t n = b.shape().rows();
        uint32_t nrhs = b.shape().cols();
        CUBLAS_CHECK(
-            cublasSetStream(Manager::get()->cublasHandle(), Manager::get()->stream(stream)));
+            cublasSetStream(Manager::get()->cublasHandle(), Manager::get()->stream(stream.id)));
        invoke<T>(cublasSgetrsBatched, cublasDgetrsBatched, cublasCgetrsBatched,
                  cublasZgetrsBatched, Manager::get()->cublasHandle(), CUBLAS_OP_N, n, nrhs,
                  DCAST(this->mBatch.dataDevice()), n, mPivotsBatch.dataDevice(),
--- a/Core.h
+++ b/Core.h
@ -2,16 +2,13 @@
 #define CUDATOOLS_H
 #include "Macros.h"
 #include <functional>
 #include <iostream>
 #include <string>
 #include <tuple>
 #include <unordered_map>
 #include <vector>
 namespace CudaTools {
 struct Event;
 /**
 * Simple wrapper for the name of a stream. Its purposes is to allow for
 * 'streams' to be passed on host code, and allowing for simple syntax
@ -19,19 +16,18 @@ struct Event;
 */
 struct StreamID {
  public:
-    std::string mId;
+    std::string id;
-    StreamID() : mId(""){};
+    StreamID() : id(""){};
    /**
     * The constructor for a StreamID.
     */
-    StreamID(const std::string& id_) : mId(id_){};
+    StreamID(const std::string& id_) : id(id_){};
-    StreamID(const char* id_) : mId(id_){};
+    StreamID(const char* id_) : id(id_){};
    void wait() const; /**< Makes host wait for this stream. */
    /**
-     * Makes this stream wait for this event. Does not block the host.
+     * Waits for the stream with this stream ID.
     */
-    void wait(const Event& event) const;
+    void wait() const;
 };
 static const StreamID DEF_MEM_STREAM = StreamID{"defaultMemory"};
@ -44,24 +40,30 @@ static const StreamID DEF_KERNEL_STREAM = StreamID{"defaultKernel"};
 void* malloc(const size_t size);
 /**
- * Frees memory on the device.
+ * Pins memory on the host.
 */
-void free(void* const pDevice);
+void pin(void* const pHost, const size_t size);
 /**
- * Copies memory from the source pointer to the dest pointer.
+ * Pushes memory from the device to the host.
 */
-StreamID copy(void* const source, void* const dest, const size_t size,
+StreamID push(void* const pHost, void* const pDevice, const size_t size,
              const StreamID& stream = DEF_MEM_STREAM);
 /**
- * Initializes or sets device memory to a value.
+ * Pulls memory from the device back to the host.
 */
-StreamID memset(void* const pDevice, int value, const size_t size,
+StreamID pull(void* const pHost, void* const pDevice, const size_t size,
              const StreamID& stream = DEF_MEM_STREAM);
 /**
- * Pins memory on the host.
+ * Copies memory on the device to another location on the device.
 */
-void pin(void* const pHost, const size_t size);
+StreamID deviceCopy(void* const pSrc, void* const pDest, const size_t size,
                    const StreamID& stream = DEF_MEM_STREAM);
 /**
 * Frees memory on the device.
 */
 void free(void* const pDevice);
 #ifdef CUDACC
 cudaDeviceProp getDeviceProp();
@ -135,20 +137,6 @@ struct Settings {
 */
 Settings basic(const size_t threads, const StreamID& stream = DEF_KERNEL_STREAM);
 /**
 * Launches a kernel with the provided function, settings and its arguments.
 */
 template <typename F, typename... Args>
 StreamID launch(F func, const Kernel::Settings& sett, Args... args) {
 #ifdef CUDA
    func<<<sett.blockGrid, sett.threadBlock, sett.sharedMemoryBytes,
           Manager::get()->stream(sett.stream.mId)>>>(args...);
 #else
    func(args...);
 #endif
    return sett.stream;
 }
 }; // namespace Kernel
 template <typename T> class Array;
@ -175,12 +163,12 @@ class Shape {
    HD uint32_t axes() const;  /**< Gets the number of axes. */
    HD uint32_t items() const; /**< Gets the total number of items. */
-    HD uint32_t length() const; /**< For 1D shapes, gets the length. In general, gets the
+    HD uint32_t length() const; /**< For 1D shapes, gets the length. In general, gets the dimension
-                                   dimension of the last axis. */
+                                   of the last axis. */
    HD uint32_t rows() const;   /**< For 2D shapes, gets the number of rows. In general, gets the
                                   dimension of the second to last axis. */
-    HD uint32_t cols() const;   /**< For 2D shapes, gets the number of columns. In general, gets
+    HD uint32_t cols() const;   /**< For 2D shapes, gets the number of columns. In general, gets the
-                                   the dimension of the second to last axis. */
+                                   dimension of the second to last axis. */
    HD uint32_t
    dim(const uint32_t axis) const; /**< Gets the dimension size of the specified axis. */
@ -198,143 +186,29 @@ class Shape {
 std::ostream& operator<<(std::ostream& out, const Shape& s);
 /**
 * A simple class that manages a CUDA Event.
 */
 struct Event {
 #ifdef CUDACC
    cudaEvent_t mEvent;
 #endif
    Event();
    ~Event();
    void record(const StreamID& stream); /**< Records a event from a stream. */
 };
 template <typename F, typename... Args> struct FuncHolder {
    F mFunc;
    std::tuple<Args...> mArgs;
    FuncHolder() = delete;
    FuncHolder(F func, Args... args) : mFunc(func), mArgs(std::make_tuple(args...)){};
    static void run(void* data) {
        FuncHolder<F, Args...>* fh = (FuncHolder<F, Args...>*)(data);
        std::apply([fh](auto&&... args) { fh->mFunc(args...); }, fh->mArgs);
    };
 };
 /**
 * A class that manages CUDA Graphs.
 */
 template <typename F, typename... Args> class Graph {
  private:
 #ifdef CUDACC
    cudaGraph_t mGraph;
    cudaGraphExec_t mInstance;
 #endif
    FuncHolder<F, Args...> mFuncHolder;
    StreamID mStream;
  public:
    Graph() = delete;
    /**
     * The constructor for a Graph, which captures the function.
     * \param func the function to capture.
     * \param stream the origin stream to use.
     * \param args the arguments of the function.
     */
    Graph(const StreamID& stream, F func, Args... args)
        : mFuncHolder(func, args...), mStream(stream) {
 #ifdef CUDACC
        CUDA_CHECK(
            cudaStreamBeginCapture(Manager::get()->stream(mStream), cudaStreamCaptureModeGlobal));
        mFuncHolder.run((void*)&mFuncHolder);
        CUDA_CHECK(cudaStreamEndCapture(Manager::get()->stream(mStream), &mGraph));
        CUDA_CHECK(cudaGraphInstantiate(&mInstance, mGraph, NULL, NULL, 0));
 #endif
    };
    ~Graph() {
 #ifdef CUDACC
        CUDA_CHECK(cudaGraphDestroy(mGraph));
        CUDA_CHECK(cudaGraphExecDestroy(mInstance));
 #endif
    };
    /**
     * Executes the instantiated graph, or simply runs the function with provided
     * arguments if compiling for CPU.
     */
    StreamID execute() const {
 #ifdef CUDACC
        cudaGraphLaunch(mInstance, Manager::get()->stream(mStream));
 #else
        mFuncHolder.run((void*)&mFuncHolder);
 #endif
        return mStream;
    }
 };
 /**
 * A struct to facilitate other CUDA Graphs functionality like creating branches and host callbacks.
 */
 struct GraphManager {
    std::vector<void*> mHostData;
    std::vector<Event*> mEvents;
    ~GraphManager();
    /**
     * Within a function that is being stream captured, launch a host function that can
     * be captured into the graph.
     */
    template <typename F, typename... Args>
    void launchHostFunction(const StreamID& stream, F func, Args&&... args) {
 #ifdef CUDACC
        FuncHolder<F, Args...>* fh = new FuncHolder<F, Args...>(func, args...);
        mHostData.push_back((void*)fh);
        cudaHostFn_t run_func = fh->run;
        CUDA_CHECK(cudaLaunchHostFunc(Manager::get()->stream(stream), run_func, fh));
 #else
        func(args...);
 #endif
    }
    /**
     * Makes a new branch in the graph to be run in parallel by a new stream.
     * \param orig_stream the original stream to branch from.
     * \param branch_stream the stream of the new branch.
     */
    void makeBranch(const StreamID& orig_stream, const StreamID& branch_stream);
    /**
     * Joins a existing branch in the graph to collapse a parallel block.
     * \param orig_stream the original stream to join the branch to.
     * \param branch_stream the stream of the branch to join.
     */
    void joinBranch(const StreamID& orig_stream, const StreamID& branch_stream);
 };
 }; // namespace CudaTools
 #ifdef CUDATOOLS_IMPLEMENTATION
 namespace CudaTools {
-//////////////////////
+template <typename T, typename... Args>
-// StreamID Methods //
+StreamID runKernel(T func, const Kernel::Settings& sett, Args... args) {
-//////////////////////
+#ifdef CUDA
-
+    func<<<sett.blockGrid, sett.threadBlock, sett.sharedMemoryBytes,
-void StreamID::wait() const { Manager::get()->waitFor(mId); }
+           Manager::get()->stream(sett.stream.id)>>>(args...);
-
+#else
-void StreamID::wait(const Event& event) const {
+    func(args...);
 #ifdef CUDACC
    CUDA_CHECK(cudaStreamWaitEvent(Manager::get()->stream(mId), event.mEvent, 0));
 #endif
    return sett.stream;
 }
 ////////////////////
 // Memory Methods //
 ////////////////////
 void StreamID::wait() const { Manager::get()->waitFor(id); }
 void* malloc(const size_t size) {
 #ifdef CUDACC
    void* pDevice;
@ -351,17 +225,27 @@ void free(void* const pDevice) {
 #endif
 }
-StreamID copy(void* const source, void* const dest, const size_t size, const StreamID& stream) {
+StreamID push(void* const pHost, void* const pDevice, const size_t size, const StreamID& stream) {
 #ifdef CUDACC
-    CUDA_CHECK(
+    CUDA_CHECK(cudaMemcpyAsync(pDevice, pHost, size, cudaMemcpyHostToDevice,
-        cudaMemcpyAsync(dest, source, size, cudaMemcpyDefault, Manager::get()->stream(stream)));
+                               Manager::get()->stream(stream.id)));
 #endif
    return stream;
 }
-StreamID memset(void* const pDevice, const int value, const size_t size, const StreamID& stream) {
+StreamID pull(void* const pHost, void* const pDevice, const size_t size, const StreamID& stream) {
 #ifdef CUDACC
-    CUDA_CHECK(cudaMemsetAsync(pDevice, value, size, Manager::get()->stream(stream)));
+    CUDA_CHECK(cudaMemcpyAsync(pHost, pDevice, size, cudaMemcpyDeviceToHost,
                               Manager::get()->stream(stream.id)));
 #endif
    return stream;
 }
 StreamID deviceCopy(void* const pSrc, void* const pDest, const size_t size,
                    const StreamID& stream) {
 #ifdef CUDACC
    CUDA_CHECK(cudaMemcpyAsync(pDest, pSrc, size, cudaMemcpyDeviceToDevice,
                               Manager::get()->stream(stream.id)));
 #endif
    return stream;
 }
@ -405,11 +289,11 @@ Manager::~Manager() {
 void Manager::waitFor(const StreamID& stream) const {
 #ifdef CUDACC
-    auto it = mStreams.find(stream.mId);
+    auto it = mStreams.find(stream.id);
    if (it != mStreams.end()) {
        CUDA_CHECK(cudaStreamSynchronize(it->second));
    } else {
-        CT_ERROR(true, ("Invalid stream " + stream.mId).c_str());
+        CT_ERROR(true, ("Invalid stream " + stream.id).c_str());
    }
 #endif
 }
@ -430,11 +314,11 @@ void Manager::addStream(const std::string& name) {
 #ifdef CUDACC
 cudaStream_t Manager::stream(const StreamID& stream) const {
-    auto it = mStreams.find(stream.mId);
+    auto it = mStreams.find(stream.id);
    if (it != mStreams.end()) {
        return it->second;
    } else {
-        CT_ERROR(true, ("Invalid stream " + stream.mId).c_str());
+        CT_ERROR(true, ("Invalid stream " + stream.id).c_str());
    }
 }
@ -523,7 +407,7 @@ void Settings::setSharedMemSize(const size_t bytes) {
 void Settings::setStream(const StreamID& stream_) {
 #ifdef CUDACC
-    stream = stream_;
+    stream.id = stream_.id;
 #endif
 }
@ -541,8 +425,7 @@ Settings basic(const size_t threads, const StreamID& stream) {
 #endif
    return sett;
 }
-
+} // namespace Kernel
 }; // namespace Kernel
 /////////////////////
 // Shape Functions //
@ -623,57 +506,6 @@ std::ostream& operator<<(std::ostream& out, const Shape& s) {
    return out << s.dim(s.axes() - 1) << ")";
 }
 ///////////////////
 // Event Methods //
 ///////////////////
 Event::Event() {
 #ifdef CUDACC
    CUDA_CHECK(cudaEventCreate(&mEvent));
 #endif
 }
 Event::~Event() {
 #ifdef CUDACC
    CUDA_CHECK(cudaEventDestroy(mEvent));
 #endif
 }
 void Event::record(const StreamID& stream) {
 #ifdef CUDACC
    CUDA_CHECK(cudaEventRecord(mEvent, Manager::get()->stream(stream)));
 #endif
 }
 //////////////////////////
 // GraphManager Methods //
 //////////////////////////
 GraphManager::~GraphManager() {
 #ifdef CUDACC
    for (void* func : mHostData) {
        delete func;
    }
    for (Event* event : mEvents) {
        delete event;
    }
 #endif
 }
 void GraphManager::makeBranch(const StreamID& orig_stream, const StreamID& branch_stream) {
    Event* event = new Event();
    event->record(orig_stream);
    mEvents.push_back(event);
    branch_stream.wait(*event);
 }
 void GraphManager::joinBranch(const StreamID& orig_stream, const StreamID& branch_stream) {
    Event* event = new Event();
    event->record(branch_stream);
    mEvents.push_back(event);
    orig_stream.wait(*event);
 }
 #ifdef CUDACC
 const char* cublasGetErrorString(cublasStatus_t error) {
    switch (error) {
@ -705,6 +537,7 @@ const char* cublasGetErrorString(cublasStatus_t error) {
    return "<unknown>";
 }
 #endif
 };     // namespace CudaTools
 #endif // CUDATOOLS_IMPLEMENTATION
--- a/Macros.h
+++ b/Macros.h
@ -46,6 +46,22 @@ using real64 = double; /**< Type alias for 64-bit floating point datatype. */
 */
 #define SHARED
 /**
 * \def DECLARE_KERNEL(call, ...)
 * Used to declare (in header) a CUDA kernel.
 * \param call the name of the kernel
 * \param ... the arguments of the kernel
 */
 #define DECLARE_KERNEL(call, ...)
 /**
 * \def DEFINE_KERNEL(call, ...)
 * Used to define (in implementation) a CUDA kernel.
 * \param call the name of the kernel
 * \param ... the arguments of the kernel
 */
 #define DEFINE_KERNEL(call, ...)
 /**
 * \def KERNEL(call, settings, ...)
 * Used to call a CUDA kernel.
@ -129,17 +145,27 @@ using real64 = double; /**< Type alias for 64-bit floating point datatype. */
 #define HD __host__ __device__
 #define SHARED __shared__
-#define KERNEL(call, ...) __global__ void call(__VA_ARGS__)
+#define DECLARE_KERNEL(call, ...) __global__ void call(__VA_ARGS__)
 #define DEFINE_KERNEL(call, ...)                                                                   \
    template CudaTools::StreamID CudaTools::runKernel(                                             \
        void (*)(__VA_ARGS__), const CudaTools::Kernel::Settings&, __VA_ARGS__);                   \
    __global__ void call(__VA_ARGS__)
 #else
 #define HD
 #define SHARED
-#define KERNEL(call, ...) void call(__VA_ARGS__)
+#define DECLARE_KERNEL(call, ...) void call(__VA_ARGS__)
 #define DEFINE_KERNEL(call, ...)                                                                   \
    template CudaTools::StreamID CudaTools::runKernel(                                             \
        void (*)(__VA_ARGS__), const CudaTools::Kernel::Settings&, __VA_ARGS__);                   \
    void call(__VA_ARGS__)
 #endif // CUDACC
-//#define KERNEL(call, settings, ...) CudaTools::runKernel(call, settings, __VA_ARGS__)
+#define KERNEL(call, settings, ...) CudaTools::runKernel(call, settings, __VA_ARGS__)
 ///////////////////
 // DEVICE MACROS //
@ -162,11 +188,11 @@ using real64 = double; /**< Type alias for 64-bit floating point datatype. */
 #define UPDATE_FUNC(name)                                                                          \
    inline CudaTools::StreamID updateHost(const CudaTools::StreamID& stream =                      \
                                              CudaTools::DEF_MEM_STREAM) {                         \
-        return CudaTools::copy(that(), this, sizeof(name));                                        \
+        return CudaTools::pull(this, that(), sizeof(name));                                        \
    };                                                                                             \
    inline CudaTools::StreamID updateDevice(const CudaTools::StreamID& stream =                    \
                                                CudaTools::DEF_MEM_STREAM) {                       \
-        return CudaTools::copy(this, that(), sizeof(name));                                        \
+        return CudaTools::push(this, that(), sizeof(name));                                        \
    }
 #ifdef CUDA
@ -192,10 +218,8 @@ using real64 = double; /**< Type alias for 64-bit floating point datatype. */
 #ifndef CUDATOOLS_ARRAY_MAX_AXES
 /**
 * \def CUDATOOLS_ARRAY_MAX_AXES
- * The maximum number of axes/dimensions an
+ * The maximum number of axes/dimensions an CudaTools::Array can have. The default is
- * CudaTools::Array can have. The default is set
+ * set to 4, but can be manully set fit the program needs.
 * to 4, but can be manully set fit the program
 * needs.
 */
 #define CUDATOOLS_ARRAY_MAX_AXES 4
 #endif
--- a/2
+++ b/2
@ -1,7 +1,7 @@
 CC := g++-10
 NVCC := nvcc
 CFLAGS := -Wall -std=c++17 -fopenmp -MMD
-NVCC_FLAGS := -MMD -std=c++17 -w -Xcompiler
+NVCC_FLAGS := -MMD -w -Xcompiler
 INCLUDE :=
 LIBS_DIR :=
--- a/README.rst
+++ b/README.rst
@ -31,7 +31,7 @@ After installing the required Python packages
 .. code-block:: bash
-    $ pip install -r requirements.txt
+    $ pip install -r requirements
 you can now run the script
--- a/docs/source/core.rst
+++ b/docs/source/core.rst
@ -2,9 +2,8 @@
 Core.h
 ======
-The ``Core.h`` header file defines some useful types and some macros functions
+The ``Core.h`` header file defines some useful types and some macros along with
-to faciliate the dual CPU-CUDA compilation targets. Additionally, it introduces
+a few core classes.
 several classes to enable the usage of CUDA streams, kernels, and graphs.
 Types
 =====
@ -34,6 +33,10 @@ Compilation Options
 Macro Functions
 ===============
 Kernel
 ------
 .. doxygendefine:: DECLARE_KERNEL
 .. doxygendefine:: DEFINE_KERNEL
 .. doxygendefine:: KERNEL
 Device Helpers
@ -47,21 +50,7 @@ Device Class
 .. doxygendefine:: DEVICE_CLASS
-Memory Functions
+Classes and Structs
 ================
 .. doxygenfunction:: CudaTools::malloc
 .. doxygenfunction:: CudaTools::free
 .. doxygenfunction:: CudaTools::copy
 .. doxygenfunction:: CudaTools::memset
 .. doxygenfunction:: CudaTools::pin
 Streams and Handles
 ===================
 CudaTools::StreamID
@ -75,29 +64,12 @@ CudaTools::Manager
 .. doxygenclass:: CudaTools::Manager
    :members:
 Kernels
 =======
 .. doxygenfunction:: CudaTools::Kernel::launch
 .. doxygenfunction:: CudaTools::Kernel::basic
 CudaTools::Kernel::Settings
 ---------------------------
 .. doxygenstruct:: CudaTools::Kernel::Settings
    :members:
-               
+CudaTools::Kernel::Basic
-Graphs
+------------------------
-======
+.. doxygenfunction:: CudaTools::Kernel::basic
 CudaTools::Graph
 ----------------
 .. doxygenclass:: CudaTools::Graph
    :members:
 CudaTools::GraphManager
 -----------------------
 .. doxygenstruct:: CudaTools::GraphManager
    :members:
--- a/docs/source/usage.rst
+++ b/docs/source/usage.rst
@ -33,12 +33,12 @@ macros provided. For example,
 .. code-block:: cpp
-    KERNEL(add, int x, int y) {
+    DEFINE_KERNEL(add, int x, int y) {
        printf("Kernel: %i\n", x + y);
    }
    int main() {
-        CudaTools::Kernel::launch(add, CudaTools::Kernel::basic(1), 1, 1); // Prints 2.
+        KERNEL(add, CudaTools::Kernel::basic(1), 1, 1); // Prints 2.
        return 0;
    }
@ -79,13 +79,13 @@ being device-compatible. We follow the previous example in a similar fashion.
            };
    };
-    KERNEL(swap, intPair* const pair) { pair->swap(); }
+    DEFINE_KERNEL(swap, intPair* const pair) { pair->swap(); }
    int main() {
        intPair pair(1, 2);
        printf("Before: %u, %u\n", pair.x, pair.y); // Prints 1, 2.
-        CudaTools::Kernel::launch(swap, CudaTools::Kernel::basic(1), pair.that()).wait();
+        KERNEL(swap, CudaTools::Kernel::basic(1), pair.that()).wait();
        pair.updateHost().wait(); // Copies the memory from the device back to the host and waits until finished.
        printf("After: %u, %u\n", pair.x, pair.y); // Prints 2, 1.
@ -129,7 +129,7 @@ compiled for CPU, then everything will run synchronously, as per usual.
 Array Examples
 ==============
-The ``Array.h`` file introduces the ``Array`` class, which is a class that provides automatic
+This file introduces the ``Array`` class, which is a class that provides automatic
 memory management between device and host. In particular, it provides functionality on
 both the host and device while handling proper memory destruction, with many nice
 features. In particular it supports mimics many features of the Python package NumPy.`
@ -137,12 +137,12 @@ We can demonstrate a few here.
 .. code-block:: cpp
-    KERNEL(times2, const CudaTools::Array<int> arr) {
+    DEFINE_KERNEL(times2, const CudaTools::Array<int> arr) {
        CudaTools::Array<int> flat = arr.flattened();
        BASIC_LOOP(arr.shape().items()) { flat[iThread] *= 2; }
    }
-    KERNEL(times2double, const CudaTools::Array<double> arr) {
+    DEFINE_KERNEL(times2double, const CudaTools::Array<double> arr) {
        CudaTools::Array<double> flat = arr.flattened();
        BASIC_LOOP(arr.shape().items()) { flat[iThread] *= 2; }
    }
@ -165,10 +165,10 @@ We can demonstrate a few here.
        // Call the kernel multiple times asynchronously. Note: since they share same
        // stream, they are not run in parallel, just queued on the device.
        // NOTE: Notice that a view is passed into the kernel, not the Array itself.
-        CudaTools::Kernel::launch(times2, CudaTools::Kernel::basic(arrRange.shape().items()), arrRange.view());
+        KERNEL(times2, CudaTools::Kernel::basic(arrRange.shape().items()), arrRange.view());
-        CudaTools::Kernel::launch(times2, CudaTools::Kernel::basic(arrConst.shape().items()), arrConst.view());
+        KERNEL(times2, CudaTools::Kernel::basic(arrConst.shape().items()), arrConst.view());
-        CudaTools::Kernel::launch(times2double, CudaTools::Kernel::basic(arrLinspace.shape().items()), arrLinspace.view());
+        KERNEL(times2double, CudaTools::Kernel::basic(arrLinspace.shape().items()), arrLinspace.view());
-        CudaTools::Kernel::launch(times2, CudaTools::Kernel::basic(arrComma.shape().items()), arrComma.view()).wait();
+        KERNEL(times2, CudaTools::Kernel::basic(arrComma.shape().items()), arrComma.view()).wait();
        arrRange.updateHost();
        arrConst.updateHost();
        arrLinspace.updateHost();
@ -239,80 +239,14 @@ view manually with the ``.view()`` function.
   programmer to manage this.
 Graph Examples
 ==============
 Additionally, there is support for CUDA Graphs, a way of defining a series of kernel
 launches and executing later, potentially reducing overhead and timing, as well as
 control the specific parallel workflow between CPU and GPU. The following
 snippet illustrates this
 .. code-block:: cpp
    void myGraph(CudaTools::GraphManager* gm, const CudaTools::Array<uint32_t> A,
                 const CudaTools::Array<uint32_t> B) {
        A.updateDevice("graphStream");
        gm->makeBranch("graphStream", "graphStreamBranch");
        B.updateDevice("graphStreamBranch");
        for (uint32_t iTimes = 0; iTimes < 30; ++iTimes) {
            CudaTools::Kernel::launch(
                collatz, CudaTools::Kernel::basic(A.shape().items(), "graphStream"), A.view());
            CudaTools::Kernel::launch(
                plusOne, CudaTools::Kernel::basic(A.shape().items(), "graphStreamBranch"), B.view());
        }
        gm->joinBranch("graphStream", "graphStreamBranch");
        CudaTools::Kernel::launch(addArray, CudaTools::Kernel::basic(A.shape().items(), "graphStream"),
                                  A.view(), B.view());
        A.updateHost("graphStream");
        B.updateHost("graphStream");
        gm->launchHostFunction("graphStream", addNum, A.view(), 5);
    }
    int main() {
        CudaTools::Manager::get()->addStream("graphStream");
        CudaTools::Manager::get()->addStream("graphStreamBranch");
        CudaTools::Array<uint32_t> A = CudaTools::Array<uint32_t>::constant({100}, 50);
        CudaTools::Array<uint32_t> B = CudaTools::Array<uint32_t>::constant({100}, 0);
        CudaTools::GraphManager gm;
        CudaTools::Graph graph("graphStream", myGraph, &gm, A.view(), B.view());
        TIME(graph.execute().wait(), ExecuteGraph);
        std::cout << A.slice({{0, 10}}) << "\n";
        return 0;
    }
 We first create two new streams to be used in the graph, which define the different parallel
 streams used. To use CUDA Graphs in CudaTools, we expect the graph to be created from a function, which
 should be written as if it will be executed. Note that we do not need to use ``.wait()`` here, since the function
 is intended to be captured into the graph. The capture process is done on the creation of the graph, with
 the name of the origin stream, the function name, and the arguments of the function. Afterwards,
 simply run ``graph.execute()`` to execute the captured graph. On CPU, it will simply run the function.
 To access the other functionality like graph branching an capturing host functions, it is
 necessary to use the ``CudaTools::GraphManager`` class, which stores a variety of necessary variables
 that need to be kept during the lifetime of the graph execution. **Currently, launching host functions sometimes alters the correct blocking of the stream, in particular with copying. It is not yet known if this is an issue with the library or a technicality within CUDA Graphs itself that needs some special care to resolve.** To read more about the syntax, see :ref:`here <CudaTools::GraphManager>`.
 .. warning::
   A graph capture essentially 'freezes' the variables used in the capture, like
   function arguments. As a result, the programmer must take care that the variables
   are well-defined. This is especially relevant to variables on the heap, where you need
   to make sure the variable is not a copy. Potentially, always using pointers could work,
   but is not always necessary. Likely ``.view()`` should always be used when dealing with
   ``CudaTools::Array`` objects.
 BLAS Examples
 =============
 Compilation and Linking
 =======================
-To compile with this library, there are only a few things necessary. First, this library depends on
+To compile with this library, there are only a few things necessary.
-`Eigen 3.4.0+ <https://eigen.tuxfamily.org/index.php?title=Main_Page>`__, and must be
+First, it is recommended you use the provided template ``Makefile``, which can be
 compiled with C++17. Next, it is recommended you use the provided template ``Makefile``, which can be
 easily modified to suit your project needs. It already default handles the compilation
 and linking with ``nvcc``, so long as you fulfill a few requirements.
@ -349,7 +283,7 @@ file for the first example:
    CC := g++-10
    NVCC := nvcc
    CFLAGS := -Wall -std=c++17 -fopenmp -MMD
-    NVCC_FLAGS := -MMD -std=c++17 -w -Xcompiler
+    NVCC_FLAGS := -MMD -w -Xcompiler
    INCLUDE := ../../
    LIBS_DIR :=
--- a/samples/1_CoreKernel/Makefile
+++ b/samples/1_CoreKernel/Makefile
@ -1,7 +1,7 @@
 CC := g++-10
 NVCC := nvcc
 CFLAGS := -Wall -std=c++17 -fopenmp -MMD
-NVCC_FLAGS := -MMD -std=c++17 -w -Xcompiler
+NVCC_FLAGS := -MMD -w -Xcompiler
 INCLUDE := ../../
 LIBS_DIR :=
--- a/samples/1_CoreKernel/main.cu.cpp
+++ b/samples/1_CoreKernel/main.cu.cpp
@ -1,9 +1,12 @@
 #define CUDATOOLS_IMPLEMENTATION
 #include <Core.h>
-KERNEL(add, int x, int y) { printf("Kernel: %i\n", x + y); }
+DEFINE_KERNEL(add, int x, int y) {
    printf("Kernel: %i\n", x + y);
 }
 int main() {
-    CudaTools::Kernel::launch(add, CudaTools::Kernel::basic(1), 1, 1); // Prints 2.
+    KERNEL(add, CudaTools::Kernel::basic(1), 1, 1); // Prints 2.
    return 0;
 }
--- a/samples/2_CoreClass/Makefile
+++ b/samples/2_CoreClass/Makefile
@ -1,7 +1,7 @@
 CC := g++-10
 NVCC := nvcc
 CFLAGS := -Wall -std=c++17 -fopenmp -MMD
-NVCC_FLAGS := -MMD -std=c++17 -w -Xcompiler
+NVCC_FLAGS := -MMD -w -Xcompiler
 INCLUDE := ../../
 LIBS_DIR :=
--- a/samples/2_CoreClass/main.cu.cpp
+++ b/samples/2_CoreClass/main.cu.cpp
@ -8,8 +8,7 @@ class intPair {
        intPair(const int x_, const int y_) : x(x_), y(y_) {
            allocateDevice(); // Allocates memory for this intPair on the device.
-        updateDevice()
+            updateDevice().wait(); // Copies the memory on the host to the device and waits until finished.
            .wait(); // Copies the memory on the host to the device and waits until finished.
        };
        HD void swap() {
@ -19,16 +18,17 @@ class intPair {
        };
 };
-KERNEL(swap, intPair* const pair) { pair->swap(); }
+DEFINE_KERNEL(swap, intPair* const pair) { pair->swap(); }
 int main() {
    intPair pair(1, 2);
    printf("Before: %u, %u\n", pair.x, pair.y); // Prints 1, 2.
-    CudaTools::Kernel::launch(swap, CudaTools::Kernel::basic(1), pair.that()).wait();
+    KERNEL(swap, CudaTools::Kernel::basic(1), pair.that()).wait();
-    pair.updateHost()
+    pair.updateHost().wait(); // Copies the memory from the device back to the host and waits until finished.
        .wait(); // Copies the memory from the device back to the host and waits until finished.
    printf("After: %u, %u\n", pair.x, pair.y); // Prints 2, 1.
    return 0;
 }
--- a/samples/3_ArrayKernel/Makefile
+++ b/samples/3_ArrayKernel/Makefile
@ -1,7 +1,7 @@
 CC := g++-10
 NVCC := nvcc
 CFLAGS := -Wall -std=c++17 -fopenmp -MMD
-NVCC_FLAGS := -MMD -std=c++17 -w -Xcompiler
+NVCC_FLAGS := -MMD -w -Xcompiler
 INCLUDE := ../../
 LIBS_DIR :=
--- a/samples/3_ArrayKernel/main.cu.cpp
+++ b/samples/3_ArrayKernel/main.cu.cpp
@ -2,12 +2,12 @@
 #include <Array.h>
 #include <Core.h>
-KERNEL(times2, const CudaTools::Array<int> arr) {
+DEFINE_KERNEL(times2, const CudaTools::Array<int> arr) {
    CudaTools::Array<int> flat = arr.flattened();
    BASIC_LOOP(arr.shape().items()) { flat[iThread] *= 2; }
 }
-KERNEL(times2double, const CudaTools::Array<double> arr) {
+DEFINE_KERNEL(times2double, const CudaTools::Array<double> arr) {
    CudaTools::Array<double> flat = arr.flattened();
    BASIC_LOOP(arr.shape().items()) { flat[iThread] *= 2; }
 }
@ -30,15 +30,10 @@ int main() {
    // Call the kernel multiple times asynchronously. Note: since they share same
    // stream, they are not run in parallel, just queued on the device.
    // NOTE: Notice that a view is passed into the kernel, not the Array itself.
-    CudaTools::Kernel::launch(times2, CudaTools::Kernel::basic(arrRange.shape().items()),
+    KERNEL(times2, CudaTools::Kernel::basic(arrRange.shape().items()), arrRange.view());
-                              arrRange.view());
+    KERNEL(times2, CudaTools::Kernel::basic(arrConst.shape().items()), arrConst.view());
-    CudaTools::Kernel::launch(times2, CudaTools::Kernel::basic(arrConst.shape().items()),
+    KERNEL(times2double, CudaTools::Kernel::basic(arrLinspace.shape().items()), arrLinspace.view());
-                              arrConst.view());
+    KERNEL(times2, CudaTools::Kernel::basic(arrComma.shape().items()), arrComma.view()).wait();
    CudaTools::Kernel::launch(times2double, CudaTools::Kernel::basic(arrLinspace.shape().items()),
                              arrLinspace.view());
    CudaTools::Kernel::launch(times2, CudaTools::Kernel::basic(arrComma.shape().items()),
                              arrComma.view())
        .wait();
    arrRange.updateHost();
    arrConst.updateHost();
    arrLinspace.updateHost();
--- a/samples/4_ArrayFunctions/Makefile
+++ b/samples/4_ArrayFunctions/Makefile
@ -1,7 +1,7 @@
 CC := g++-10
 NVCC := nvcc
 CFLAGS := -Wall -std=c++17 -fopenmp -MMD
-NVCC_FLAGS := -MMD -std=c++17 -w -Xcompiler
+NVCC_FLAGS := -MMD -w -Xcompiler
 INCLUDE := ../../
 LIBS_DIR :=
--- a/samples/5_SimpleGraph/Makefile
+++ b/samples/5_SimpleGraph/Makefile
@ -1,95 +0,0 @@
 CC := g++-10
 NVCC := nvcc
 CFLAGS := -Wall -std=c++17 -fopenmp -MMD
 NVCC_FLAGS := -MMD -std=c++17 -w -Xcompiler
 INCLUDE := ../../
 LIBS_DIR :=
 LIBS_DIR_GPU := /usr/local/cuda/lib64
 LIBS :=
 LIBS_GPU := cuda cudart cublas
 TARGET = simpleGraph
 SRC_DIR = .
 BUILD_DIR = build
 # Should not need to modify below.
 CPU_BUILD_DIR = $(BUILD_DIR)/cpu
 GPU_BUILD_DIR = $(BUILD_DIR)/gpu
 SRC = $(wildcard $(SRC_DIR)/*/*.cpp) $(wildcard $(SRC_DIR)/*.cpp)
 # Get source files and object files.
 GCC_SRC = $(filter-out %.cu.cpp ,$(SRC))
 NVCC_SRC = $(filter %.cu.cpp, $(SRC))
 GCC_OBJ = $(GCC_SRC:$(SRC_DIR)/%.cpp=%.o)
 NVCC_OBJ = $(NVCC_SRC:$(SRC_DIR)/%.cpp=%.o)
 # If compiling for CPU, all go to GCC. Otherwise, they are split.
 CPU_OBJ = $(addprefix $(CPU_BUILD_DIR)/,$(GCC_OBJ)) $(addprefix $(CPU_BUILD_DIR)/,$(NVCC_OBJ))
 GPU_GCC_OBJ = $(addprefix $(GPU_BUILD_DIR)/,$(GCC_OBJ))
 GPU_NVCC_OBJ = $(addprefix $(GPU_BUILD_DIR)/,$(NVCC_OBJ))
 # $(info $$GCC_SRC is [${GCC_SRC}])
 # $(info $$NVCC_SRC is [${NVCC_SRC}])
 # $(info $$GCC_OBJ is [${GCC_OBJ}])
 # $(info $$NVCC_OBJ is [${NVCC_OBJ}])
 # $(info $$CPU_OBJ is [${CPU_OBJ}])
 # $(info $$GPU_GCC_OBJ is [${GPU_GCC_OBJ}])
 # $(info $$GPU_NVCC_OBJ is [${GPU_NVCC_OBJ}])
 HEADER = $(wildcard $(SRC_DIR)/*/*.h) $(wildcard $(SRC_DIR)/*.h)
 CPU_DEPS = $(wildcard $(CPU_BUILD_DIR)/*.d)
 GPU_DEPS = $(wildcard $(GPU_BUILD_DIR)/*.d)
 INC := $(INCLUDE:%=-I%)
 LIB := $(LIBS_DIR:%=-L%)
 LIB_GPU := $(LIBS_DIR_GPU:%=-L%)
 LD := $(LIBS:%=-l%)
 LD_GPU := $(LIBS_GPU:%=-l%)
 # Reminder:
 # $< = first prerequisite
 # $@ = the target which matched the rule
 # $^ = all prerequisites
 .PHONY: all clean
 all : cpu gpu
 cpu: $(TARGET)CPU
 gpu: $(TARGET)GPU
 $(TARGET)CPU: $(CPU_OBJ)
 	$(CC) $(CFLAGS) $^ -o $@ $(INC) $(LIB) $(LDFLAGS)
 $(CPU_BUILD_DIR)/%.o $(CPU_BUILD_DIR)/%.cu.o: $(SRC_DIR)/%.cpp | $(CPU_BUILD_DIR)
 	$(CC) $(CFLAGS) -c -o $@ $< $(INC)
 # For GPU, we need to build the NVCC objects, the NVCC linked object, and the
 # regular ones. Then, we link them all together.
 $(TARGET)GPU: $(GPU_BUILD_DIR)/link.o $(GPU_GCC_OBJ) | $(GPU_BUILD_DIR)
 	$(CC) -g -DCUDA $(CFLAGS) $(GPU_NVCC_OBJ) $^ -o $@ $(INC) $(LIB) $(LIB_GPU) $(LD) $(LD_GPU)
 $(GPU_BUILD_DIR)/link.o: $(GPU_NVCC_OBJ) | $(GPU_BUILD_DIR)
 	$(NVCC) --device-link $^ -o $@
 $(GPU_BUILD_DIR)/%.cu.o: $(SRC_DIR)/%.cu.cpp | $(GPU_BUILD_DIR)
 	$(NVCC) $(NVCC_FLAGS) -DCUDA -x cu --device-c -o $@ $< $(INC)
 $(GPU_BUILD_DIR)/%.o: $(SRC_DIR)/%.cpp | $(GPU_BUILD_DIR)
 	$(CC) $(CFLAGS) -g -DCUDA -c -o $@ $< $(INC)
 -include $(CPU_DEPS)
 -include $(GPU_DEPS)
 $(CPU_BUILD_DIR):
 	mkdir -p $@
 $(GPU_BUILD_DIR):
 	mkdir -p $@
 clean:
 	rm -Rf $(BUILD_DIR) $(TARGET)CPU $(TARGET)GPU
--- a/samples/5_SimpleGraph/main.cu.cpp
+++ b/samples/5_SimpleGraph/main.cu.cpp
@ -1,106 +0,0 @@
 #define CUDATOOLS_IMPLEMENTATION
 #include <Array.h>
 #include <Core.h>
 #include <chrono>
 #define TIME_START(name) auto begin_##name = std::chrono::steady_clock::now()
 #define TIME_END(name)                                                                             \
    auto end_##name = std::chrono::steady_clock::now();                                            \
    auto time_ms_##name =                                                                          \
        std::chrono::duration_cast<std::chrono::milliseconds>(end_##name - begin_##name).count();  \
    auto time_mus_##name =                                                                         \
        std::chrono::duration_cast<std::chrono::microseconds>(end_##name - begin_##name).count();  \
    if (time_ms_##name == 0) {                                                                     \
        printf("[%s] Time Elapsed: %ld[µs]\n", #name, time_mus_##name);                            \
    } else {                                                                                       \
        printf("[%s] Time Elapsed: %ld[ms]\n", #name, time_ms_##name);                             \
    }
 #define TIME(call, name)                                                                           \
    TIME_START(name);                                                                              \
    call;                                                                                          \
    TIME_END(name);
 KERNEL(collatz, const CudaTools::Array<uint32_t> arr) {
    BASIC_LOOP(arr.shape().length()) {
        if (arr[iThread] % 2) {
            arr[iThread] = 3 * arr[iThread] + 1;
        } else {
            arr[iThread] = arr[iThread] >> 1;
        }
    }
 }
 KERNEL(plusOne, const CudaTools::Array<uint32_t> arr) {
    BASIC_LOOP(arr.shape().length()) { arr[iThread] += 1; }
 }
 KERNEL(addArray, const CudaTools::Array<uint32_t> a, const CudaTools::Array<uint32_t> b) {
    BASIC_LOOP(a.shape().length()) { a[iThread] += b[iThread]; }
 }
 void addNum(const CudaTools::Array<uint32_t> A, uint32_t num) {
    auto Aeig = A.atLeast2D().eigenMap();
    Aeig = Aeig.array() + num;
 }
 void doFunc(const CudaTools::Array<uint32_t> A, const CudaTools::Array<uint32_t> B) {
    A.updateDevice("graphStream").wait();
    B.updateDevice("graphStreamBranch").wait();
    for (uint32_t iTimes = 0; iTimes < 30; ++iTimes) {
        CudaTools::Kernel::launch(
            collatz, CudaTools::Kernel::basic(A.shape().items(), "graphStream"), A.view());
        CudaTools::Kernel::launch(
            plusOne, CudaTools::Kernel::basic(A.shape().items(), "graphStreamBranch"), B.view());
    }
    CudaTools::Kernel::launch(addArray, CudaTools::Kernel::basic(A.shape().items(), "graphStream"),
                              A.view(), B.view())
        .wait();
    A.updateHost("graphStream");
    B.updateHost("graphStream").wait();
    addNum(A.view(), 5);
 }
 void myGraph(CudaTools::GraphManager* gm, const CudaTools::Array<uint32_t> A,
             const CudaTools::Array<uint32_t> B) {
    A.updateDevice("graphStream");
    gm->makeBranch("graphStream", "graphStreamBranch");
    B.updateDevice("graphStreamBranch");
    for (uint32_t iTimes = 0; iTimes < 30; ++iTimes) {
        CudaTools::Kernel::launch(
            collatz, CudaTools::Kernel::basic(A.shape().items(), "graphStream"), A.view());
        CudaTools::Kernel::launch(
            plusOne, CudaTools::Kernel::basic(A.shape().items(), "graphStreamBranch"), B.view());
    }
    gm->joinBranch("graphStream", "graphStreamBranch");
    CudaTools::Kernel::launch(addArray, CudaTools::Kernel::basic(A.shape().items(), "graphStream"),
                              A.view(), B.view());
    A.updateHost("graphStream");
    B.updateHost("graphStream");
    gm->launchHostFunction("graphStream", addNum, A.view(), 5);
 }
 int main() {
    CudaTools::Manager::get()->addStream("graphStream");
    CudaTools::Manager::get()->addStream("graphStreamBranch");
    CudaTools::Array<uint32_t> A = CudaTools::Array<uint32_t>::constant({100}, 50);
    CudaTools::Array<uint32_t> B = CudaTools::Array<uint32_t>::constant({100}, 0);
    TIME(doFunc(A.view(), B.view()), ExecuteNoGraph);
    std::cout << A.slice({{0, 10}}) << "\n";
    A.setConstant(50);
    B.setConstant(0);
    CudaTools::GraphManager gm;
    CudaTools::Graph graph("graphStream", myGraph, &gm, A.view(), B.view());
    TIME(graph.execute().wait(), ExecuteGraph);
    std::cout << A.slice({{0, 10}}) << "\n";
    return 0;
 }
--- a/tests.cu.cpp
+++ b/tests.cu.cpp
@ -97,36 +97,18 @@ class TestClass {
    };
 };
-KERNEL(times, const CT::Array<int> arr) {
+DEFINE_KERNEL(times, const CT::Array<int> arr) {
    BASIC_LOOP(arr.shape().length()) { arr[iThread] *= 2; }
 }
-KERNEL(classTest, TestClass* const test) { test->x = 100; }
+DEFINE_KERNEL(classTest, TestClass* const test) { test->x = 100; }
 KERNEL(collatz, const CT::Array<uint32_t> arr) {
    BASIC_LOOP(arr.shape().length()) {
        if (arr[iThread] % 2) {
            arr[iThread] = 3 * arr[iThread] + 1;
        } else {
            arr[iThread] = arr[iThread] >> 1;
        }
    }
 }
 KERNEL(plusOne, const CT::Array<uint32_t> arr) {
    BASIC_LOOP(arr.shape().length()) { arr[iThread] += 1; }
 }
 KERNEL(addArray, const CT::Array<uint32_t> a, const CT::Array<uint32_t> b) {
    BASIC_LOOP(a.shape().length()) { a[iThread] += b[iThread]; }
 }
 struct MacroTests {
    static uint32_t Kernel() {
        uint32_t failed = 0;
        CT::Array<int> A = CT::Array<int>::constant({10}, 1);
        A.updateDevice().wait();
-        CT::Kernel::launch(times, CT::Kernel::basic(A.shape().items()), A.view()).wait();
+        KERNEL(times, CT::Kernel::basic(A.shape().items()), A.view()).wait();
        A.updateHost().wait();
        uint32_t errors = 0;
@ -143,7 +125,7 @@ struct MacroTests {
    static uint32_t Class() {
        uint32_t failed = 0;
        TestClass test(1);
-        CT::Kernel::launch(classTest, CT::Kernel::basic(1), test.that()).wait();
+        KERNEL(classTest, CT::Kernel::basic(1), test.that()).wait();
        test.updateHost().wait();
        TEST(test.x == 100, "Class", "Errors: 0");
@ -491,52 +473,6 @@ template <typename T> uint32_t doBLASTests() {
    return failed;
 }
 void addNum(const CT::Array<uint32_t> A, uint32_t num) {
    auto Aeig = A.atLeast2D().eigenMap();
    Aeig = Aeig.array() + num;
 }
 void myGraph(CT::GraphManager* gm, const CT::Array<uint32_t> A, const CT::Array<uint32_t> B) {
    // tools->launchHostFunction("graphStream", myHostFunc, A->view(), 5);
    A.updateDevice("graphStream");
    gm->makeBranch("graphStream", "graphStreamBranch");
    B.updateDevice("graphStreamBranch");
    for (uint32_t iTimes = 0; iTimes < 30; ++iTimes) {
        CT::Kernel::launch(collatz, CT::Kernel::basic(A.shape().items(), "graphStream"), A.view());
        CT::Kernel::launch(plusOne, CT::Kernel::basic(A.shape().items(), "graphStreamBranch"),
                           B.view());
    }
    gm->joinBranch("graphStream", "graphStreamBranch");
    CT::Kernel::launch(addArray, CT::Kernel::basic(A.shape().items(), "graphStream"), A.view(),
                       B.view());
    A.updateHost("graphStream");
    B.updateHost("graphStream");
    gm->launchHostFunction("graphStream", addNum, A.view(), 5);
 }
 uint32_t doGraphTest() {
    uint32_t failed = 0;
    CT::Array<uint32_t> A = CT::Array<uint32_t>::constant({1000000}, 50);
    CT::Array<uint32_t> B = CT::Array<uint32_t>::constant({1000000}, 0);
    CT::Manager::get()->addStream("graphStream");
    CT::Manager::get()->addStream("graphStreamBranch");
    CT::GraphManager gm;
    CT::Graph graph("graphStream", myGraph, &gm, A.view(), B.view());
    graph.execute().wait();
    uint32_t errors = 0;
    for (auto it = A.begin(); it != A.end(); ++it) {
        if (*it != 36) ++errors;
    }
    std::ostringstream msg;
    msg << "Errors: " << errors;
    TEST(errors == 0, "Graph", msg.str().c_str());
    return failed;
 }
 int main() {
    uint32_t failed = 0;
    std::cout << box("Macro Tests") << "\n";
@ -555,10 +491,7 @@ int main() {
    failed += doBLASTests<complex64>();
    failed += doBLASTests<complex128>();
-    std::cout << box("Stream/Graph Tests") << "\n";
+    constexpr uint32_t tests = 2 + 4 * 5 + 13 * 4;
    failed += doGraphTest();
    constexpr uint32_t tests = 2 + 4 * 5 + 13 * 4 + 1;
    std::ostringstream msg;
    msg << ((failed == 0) ? "\033[1;32mPASS \033[0m(" : "\033[1;31mFAIL \033[0m(")
        << (tests - failed) << "/" << tests << ")";