From 2b6753cc7c42657468ac138a237aeabdda35557e Mon Sep 17 00:00:00 2001
From: Kenneth Jao <ksjao2@illinois.edu>
Date: Fri, 21 Apr 2023 15:39:14 -0500
Subject: [PATCH] Corrected examples for Array

---
 Array.h                              | 26 ++++++++----
 docs/source/usage.rst                | 61 ++++++++++++++++++----------
 samples/3_ArrayKernel/main.cu.cpp    | 45 ++++++++++++--------
 samples/4_ArrayFunctions/Makefile    | 25 +-----------
 samples/4_ArrayFunctions/main.cu.cpp | 17 ++++----
 5 files changed, 95 insertions(+), 79 deletions(-)
diff --git a/Array.h b/Array.h
index 69b9a06..4efe07e 100644
--- a/Array.h
+++ b/Array.h
@@ -450,10 +450,7 @@ template <typename T> class Array {
     HD Array reshaped(const Shape& new_shape) const {
         CT_ERROR_IF(shape().items(), !=, new_shape.items(),
                     "New shape cannot have a different number of terms");
-        if (mIsSlice) {
-            Array<T> arr = this->copy();
-            return arr.reshaped(new_shape);
-        }
+        CT_ERROR(mIsSlice, "Cannot reshape slice, a new array must be made. (Try copy first)")
         Array<T> arr = view();
         arr.mShape = new_shape;
         return arr;
@@ -462,7 +459,7 @@ template <typename T> class Array {
     HD void reshape(const Shape& new_shape) {
         CT_ERROR_IF(shape().items(), !=, new_shape.items(),
                     "New shape cannot have a different number of terms");
-        CT_ERROR(mIsSlice, "Cannot reshape slice, a new array must be made. (Try reshaped instead)")
+        CT_ERROR(mIsSlice, "Cannot reshape slice, a new array must be made. (Try copy first)")
         mShape = new_shape;
     };
 
@@ -471,13 +468,26 @@ template <typename T> class Array {
      * single vectors to their 2D counterparts.
      */
     HD Array atLeast2D() const {
-        return (shape().axes() == 1) ? Array(*this, {shape().length(), 1}) : view();
+        return (shape().axes() == 1) ? reshaped({shape().length(), 1}) : view();
     };
 
     /**
-     * Flattens the Array into one dimension.
+     * Reshapes this array, making it at least 2D. Useful for promoting
+     * single vectors to their 2D counterparts.
+     */
+    HD void asAtLeast2D() {
+        if (shape().axes() == 1) reshape({shape().length(), 1});
+    };
+
+    /**
+     * Returns a view of this Array that has been flattened into one dimension.
+     */
+    HD Array flattened() const { return reshaped({mShape.mItems}); };
+
+    /**
+     * Flattens this Array into one dimension.
      */
-    HD Array flatten() const { return reshape({mShape.mItems}); };
+    HD void flatten() { reshape({mShape.mItems}); };
 
     /**
      * Returns the Eigen::Map of this Array.
diff --git a/docs/source/usage.rst b/docs/source/usage.rst
index 13cb6cc..009bba5 100644
--- a/docs/source/usage.rst
+++ b/docs/source/usage.rst
@@ -136,38 +136,53 @@ We can demonstrate a few here.
 
 .. code-block:: cpp
 
-    DEFINE_KERNEL(times2, const CudaTools::Array<int>& arr) {
-        BASIC_LOOP(arr.shape().items()) {
-            arr[iThread] *= 2;
-        }
+    DEFINE_KERNEL(times2, const CudaTools::Array<int> arr) {
+        CudaTools::Array<int> flat = arr.flattened();
+        BASIC_LOOP(arr.shape().items()) { flat[iThread] *= 2; }
+    }
+
+    DEFINE_KERNEL(times2double, const CudaTools::Array<double> arr) {
+        CudaTools::Array<double> flat = arr.flattened();
+        BASIC_LOOP(arr.shape().items()) { flat[iThread] *= 2; }
     }
 
     int main() {
         CudaTools::Array<int> arrRange = CudaTools::Array<int>::range(0, 10);
-        CudaTools::Array<int> arrConst = CudaTools::Array<int>::constant(1);
-        CudaTools::Array<double> arrLinspace = CudaTools::Array<int>::linspace(0, 5, 10);
+        CudaTools::Array<int> arrConst = CudaTools::Array<int>::constant({10}, 1);
+        CudaTools::Array<double> arrLinspace = CudaTools::Array<double>::linspace(0, 5, 10);
         CudaTools::Array<int> arrComma({2, 2}); // 2x2 array.
-        arrComma << 1, 2, 3, 4; // Comma initializer if needed.
-        std::cout << arrRange << "\n" << arrConst << "\n" << arrLinspace << "\n" << arrComma "\n";
+        arrComma << 1, 2, 3, 4;                 // Comma initializer if needed.
+
+        arrRange.updateDevice();
+        arrConst.updateDevice();
+        arrLinspace.updateDevice();
+        arrComma.updateDevice().wait();
+
+        std::cout << "Before Kernel:\n";
+        std::cout << arrRange << "\n" << arrConst << "\n" << arrLinspace << "\n" << arrComma << "\n";
 
         // Call the kernel multiple times asynchronously. Note: since they share same
         // stream, they are not run in parallel, just queued on the device.
-        KERNEL(times2, CudaTools::Kernel::basic(arrRange.shape().items()), arrRange);
-        KERNEL(times2, CudaTools::Kernel::basic(arrConst.shape().items()), arrRange);
-        KERNEL(times2, CudaTools::Kernel::basic(arrLinspace.shape().items()), arrRange).wait();
-        KERNEL(times2, CudaTools::Kernel::basic(arrComma.shape().items()), arrRange).wait();
+        // NOTE: A view is passed into the kernel, not the Array itself.
+        KERNEL(times2, CudaTools::Kernel::basic(arrRange.shape().items()), arrRange.view());
+        KERNEL(times2, CudaTools::Kernel::basic(arrConst.shape().items()), arrConst.view());
+        KERNEL(times2double, CudaTools::Kernel::basic(arrLinspace.shape().items()), arrLinspace.view());
+        KERNEL(times2, CudaTools::Kernel::basic(arrComma.shape().items()), arrComma.view()).wait();
         arrRange.updateHost();
         arrConst.updateHost();
         arrLinspace.updateHost();
-        arrComma.updateHost().wait(); // Only need to wait for the last one, since they have the same stream.
+        arrComma.updateHost().wait(); // Same stream, so you should wait for the last call.
 
-        std::cout << arrRange << "\n" << arrConst << "\n" << arrLinspace << "\n" << arrComma "\n";
+        std::cout << "After Kernel:\n";
+        std::cout << arrRange << "\n" << arrConst << "\n" << arrLinspace << "\n" << arrComma << "\n";
         return 0;
     }
 
 In this example, we show a few ways to initialize an ``Array`` through some static functions.
 It is templated, so it can (theoretically) support any type. Additionally, you can initialize an
-empty ``Array`` by providing its ``Shape`` with an initializer list (ex: ``{2, 2}``). For more details,
+empty ``Array`` by providing its ``Shape`` with an initializer list (ex: ``{2, 2}``). Many of these
+array functions and initializers come in two versions: one that returns a view and one that modifies the Array in place.
+For instance, ``.flattened()`` returns a flattened view of an Array without modifying the original, while ``.flatten()`` flattens the Array itself. For more details,
 see :ref:`here <CudaTools::Array<T>>`.
 
 We also note the use of ``BASIC_LOOP(N)``, which is a macro for generating the loop automatically
@@ -175,28 +190,32 @@ on the kernel given the number of threads. It is intended to be used only for "e
 situations and with the ``CudaTools::Kernel::basic()`` launch parameters. If compiling for CPU, it will
 mark the loop with ``#pragma parallel for`` and attempt to use OpenMP for parallelism.
 
+.. warning::
+   Notice that a view must be passed to the kernel, and not the original object.
+
 The Array also supports other helpful functions, such as multi-dimensional indexing, slicing, and
 a few other functions.
 
 .. code-block:: cpp
 
     int main() {
-        CudaTools::Array<int> arr = CudaTools::Array<int>::constant(0);
+        CudaTools::Array<int> arr = CudaTools::Array<int>::constant({100}, 0);
         arr.reshape({4, 5, 5}); // Creates a three dimensional array.
 
-        arr[0][0][0] = 1; // Axis by axis indexing.
+        arr[0][0][0] = 1;     // Axis by axis indexing.
         arr[{1, 0, 0}] = 100; // Specific 'coordinate' indexing.
         std::cout << arr << "\n";
 
-        CudaTools::Array<int> arrRange = CudaTools::Array<int>::range(18);
-        auto arrSlice = arr.slice({{1, 2}, {1, 4}, {1, 4}}). // Takes a slice of the center.
+        CudaTools::Array<int> arrRange = CudaTools::Array<int>::range(0, 18);
+        auto arrSlice = arr.slice({{1, 3}, {1, 4}, {1, 4}}); // Takes a slice of the center.
         std::cout << "Before Copy:\n" << arrSlice << "\n";
         arrSlice = arrRange; // Copies arrRange into arrSlice. (Does NOT replace!)
         std::cout << "After Copy:\n" << arrSlice << "\n";
 
-        std::cout << "Modified: \n" << arr << "\n"; // The original array is modified, since a slice does not copy.
+        std::cout << "Modified: \n"
+                  << arr << "\n"; // The original array is modified, since a slice does not copy.
 
-        CudaTools::Array<int> newArr = arr.copy(); // Copies the original Array.
+        CudaTools::Array<int> newArr = arr.copy();                 // Copies the original Array.
         for (auto it = newArr.begin(); it != newArr.end(); ++it) { // Iterate through the array.
             *it = 1;
         }
diff --git a/samples/3_ArrayKernel/main.cu.cpp b/samples/3_ArrayKernel/main.cu.cpp
index adc4ed0..05f5a3c 100644
--- a/samples/3_ArrayKernel/main.cu.cpp
+++ b/samples/3_ArrayKernel/main.cu.cpp
@@ -1,34 +1,45 @@
 #define CUDATOOLS_IMPLEMENTATION
-#include <Core.h>
 #include <Array.h>
+#include <Core.h>
+
+DEFINE_KERNEL(times2, const CudaTools::Array<int> arr) {
+    CudaTools::Array<int> flat = arr.flattened();
+    BASIC_LOOP(arr.shape().items()) { flat[iThread] *= 2; }
+}
 
-DEFINE_KERNEL(times2, const CudaTools::Array<int>& arr) {
-    BASIC_LOOP(arr.shape().items()) {
-        arr[iThread] *= 2;
-    }
+DEFINE_KERNEL(times2double, const CudaTools::Array<double> arr) {
+    CudaTools::Array<double> flat = arr.flattened();
+    BASIC_LOOP(arr.shape().items()) { flat[iThread] *= 2; }
 }
 
 int main() {
     CudaTools::Array<int> arrRange = CudaTools::Array<int>::range(0, 10);
-    CudaTools::Array<int> arrConst = CudaTools::Array<int>::constant(1);
-    CudaTools::Array<double> arrLinspace = CudaTools::Array<int>::linspace(0, 5, 10);
+    CudaTools::Array<int> arrConst = CudaTools::Array<int>::constant({10}, 1);
+    CudaTools::Array<double> arrLinspace = CudaTools::Array<double>::linspace(0, 5, 10);
     CudaTools::Array<int> arrComma({2, 2}); // 2x2 array.
-    arrComma << 1, 2, 3, 4; // Comma initializer if needed.
-    std::cout << arrRange << "\n" << arrConst << "\n" << arrLinspace << "\n" << arrComma "\n";
+    arrComma << 1, 2, 3, 4;                 // Comma initializer if needed.
+
+    arrRange.updateDevice();
+    arrConst.updateDevice();
+    arrLinspace.updateDevice();
+    arrComma.updateDevice().wait();
+
+    std::cout << "Before Kernel:\n";
+    std::cout << arrRange << "\n" << arrConst << "\n" << arrLinspace << "\n" << arrComma << "\n";
 
     // Call the kernel multiple times asynchronously. Note: since they share same
     // stream, they are not run in parallel, just queued on the device.
-    KERNEL(times2, CudaTools::Kernel::basic(arrRange.shape().items()), arrRange);
-    KERNEL(times2, CudaTools::Kernel::basic(arrConst.shape().items()), arrRange);
-    KERNEL(times2, CudaTools::Kernel::basic(arrLinspace.shape().items()), arrRange).wait();
-    KERNEL(times2, CudaTools::Kernel::basic(arrComma.shape().items()), arrRange).wait();
+    // NOTE: A view is passed into the kernel, not the Array itself.
+    KERNEL(times2, CudaTools::Kernel::basic(arrRange.shape().items()), arrRange.view());
+    KERNEL(times2, CudaTools::Kernel::basic(arrConst.shape().items()), arrConst.view());
+    KERNEL(times2double, CudaTools::Kernel::basic(arrLinspace.shape().items()), arrLinspace.view());
+    KERNEL(times2, CudaTools::Kernel::basic(arrComma.shape().items()), arrComma.view()).wait();
     arrRange.updateHost();
     arrConst.updateHost();
     arrLinspace.updateHost();
-    arrComma.updateHost().wait(); // Only need to wait for the last one, since they have the same stream.
+    arrComma.updateHost().wait(); // Same stream, so you should wait for the last call.
 
-    std::cout << arrRange << "\n" << arrConst << "\n" << arrLinspace << "\n" << arrComma "\n";
+    std::cout << "After Kernel:\n";
+    std::cout << arrRange << "\n" << arrConst << "\n" << arrLinspace << "\n" << arrComma << "\n";
     return 0;
 }
-
-
diff --git a/samples/4_ArrayFunctions/Makefile b/samples/4_ArrayFunctions/Makefile
index d0486ce..501a3ac 100644
--- a/samples/4_ArrayFunctions/Makefile
+++ b/samples/4_ArrayFunctions/Makefile
@@ -14,30 +14,7 @@ SRC_DIR = .
 BUILD_DIR = build
 
 # Should not need to modify below.
-int main() {
-    CudaTools::Array<int> arr = CudaTools::Array<int>::constant(0);
-    arr.reshape({4, 5, 5}); // Creates a three dimensional array.
-
-    arr[0][0][0] = 1; // Axis by axis indexing.
-    arr[{1, 0, 0}] = 100; // Specific 'coordinate' indexing.
-    std::cout << arr << "\n";
-
-    CudaTools::Array<int> arrRange = CudaTools::Array<int>::range(18);
-    auto arrSlice = arr.slice({{1, 2}, {1, 4}, {1, 4}}). // Takes a slice of the center.
-    std::cout << "Before Copy:\n" << arrSlice << "\n";
-    arrSlice = arrRange; // Copies arrRange into arrSlice. (Does NOT replace!)
-    std::cout << "After Copy:\n" << arrSlice << "\n";
-
-    std::cout << "Modified: \n" << arr << "\n"; // The original array is modified, since a slice does not copy.
-
-    CudaTools::Array<int> newArr = arr.copy(); // Copies the original Array.
-    for (auto it = newArr.begin(); it != newArr.end(); ++it) { // Iterate through the array.
-        *it = 1;
-    }
-    std::cout << "Modified New Array:\n" << newArr << "\n";
-    std::cout << "Old Array:\n" << arr << "\n"; // The original array was not modified after a copy.
-    return 0;
-}
+
 CPU_BUILD_DIR = $(BUILD_DIR)/cpu
 GPU_BUILD_DIR = $(BUILD_DIR)/gpu
 
diff --git a/samples/4_ArrayFunctions/main.cu.cpp b/samples/4_ArrayFunctions/main.cu.cpp
index 3979f3c..806329c 100644
--- a/samples/4_ArrayFunctions/main.cu.cpp
+++ b/samples/4_ArrayFunctions/main.cu.cpp
@@ -1,24 +1,25 @@
 #define CUDATOOLS_IMPLEMENTATION
-#include <Core.h>
 #include <Array.h>
+#include <Core.h>
 
 int main() {
-    CudaTools::Array<int> arr = CudaTools::Array<int>::constant(0);
+    CudaTools::Array<int> arr = CudaTools::Array<int>::constant({100}, 0);
     arr.reshape({4, 5, 5}); // Creates a three dimensional array.
 
-    arr[0][0][0] = 1; // Axis by axis indexing.
+    arr[0][0][0] = 1;     // Axis by axis indexing.
     arr[{1, 0, 0}] = 100; // Specific 'coordinate' indexing.
     std::cout << arr << "\n";
 
-    CudaTools::Array<int> arrRange = CudaTools::Array<int>::range(18);
-    auto arrSlice = arr.slice({{1, 2}, {1, 4}, {1, 4}}). // Takes a slice of the center.
+    CudaTools::Array<int> arrRange = CudaTools::Array<int>::range(0, 18);
+    auto arrSlice = arr.slice({{1, 3}, {1, 4}, {1, 4}}); // Takes a slice of the center.
     std::cout << "Before Copy:\n" << arrSlice << "\n";
     arrSlice = arrRange; // Copies arrRange into arrSlice. (Does NOT replace!)
     std::cout << "After Copy:\n" << arrSlice << "\n";
 
-    std::cout << "Modified: \n" << arr << "\n"; // The original array is modified, since a slice does not copy.
+    std::cout << "Modified: \n"
+              << arr << "\n"; // The original array is modified, since a slice does not copy.
 
-    CudaTools::Array<int> newArr = arr.copy(); // Copies the original Array.
+    CudaTools::Array<int> newArr = arr.copy();                 // Copies the original Array.
     for (auto it = newArr.begin(); it != newArr.end(); ++it) { // Iterate through the array.
         *it = 1;
     }
@@ -26,5 +27,3 @@ int main() {
     std::cout << "Old Array:\n" << arr << "\n"; // The original array was not modified after a copy.
     return 0;
 }
-
-