|
|
|
@ -136,38 +136,53 @@ We can demonstrate a few here. |
|
|
|
|
|
|
|
|
|
.. code-block:: cpp |
|
|
|
|
|
|
|
|
|
DEFINE_KERNEL(times2, const CudaTools::Array<int>& arr) { |
|
|
|
|
BASIC_LOOP(arr.shape().items()) { |
|
|
|
|
arr[iThread] *= 2; |
|
|
|
|
} |
|
|
|
|
DEFINE_KERNEL(times2, const CudaTools::Array<int> arr) { |
|
|
|
|
CudaTools::Array<int> flat = arr.flattened(); |
|
|
|
|
BASIC_LOOP(arr.shape().items()) { flat[iThread] *= 2; } |
|
|
|
|
} |
|
|
|
|
|
|
|
|
|
DEFINE_KERNEL(times2double, const CudaTools::Array<double> arr) { |
|
|
|
|
CudaTools::Array<double> flat = arr.flattened(); |
|
|
|
|
BASIC_LOOP(arr.shape().items()) { flat[iThread] *= 2; } |
|
|
|
|
} |
|
|
|
|
|
|
|
|
|
int main() { |
|
|
|
|
CudaTools::Array<int> arrRange = CudaTools::Array<int>::range(0, 10); |
|
|
|
|
CudaTools::Array<int> arrConst = CudaTools::Array<int>::constant(1); |
|
|
|
|
CudaTools::Array<double> arrLinspace = CudaTools::Array<int>::linspace(0, 5, 10); |
|
|
|
|
CudaTools::Array<int> arrConst = CudaTools::Array<int>::constant({10}, 1); |
|
|
|
|
CudaTools::Array<double> arrLinspace = CudaTools::Array<double>::linspace(0, 5, 10); |
|
|
|
|
CudaTools::Array<int> arrComma({2, 2}); // 2x2 array. |
|
|
|
|
arrComma << 1, 2, 3, 4; // Comma initializer if needed. |
|
|
|
|
std::cout << arrRange << "\n" << arrConst << "\n" << arrLinspace << "\n" << arrComma "\n"; |
|
|
|
|
arrComma << 1, 2, 3, 4; // Comma initializer if needed. |
|
|
|
|
|
|
|
|
|
arrRange.updateDevice(); |
|
|
|
|
arrConst.updateDevice(); |
|
|
|
|
arrLinspace.updateDevice(); |
|
|
|
|
arrComma.updateDevice().wait(); |
|
|
|
|
|
|
|
|
|
std::cout << "Before Kernel:\n"; |
|
|
|
|
std::cout << arrRange << "\n" << arrConst << "\n" << arrLinspace << "\n" << arrComma << "\n"; |
|
|
|
|
|
|
|
|
|
// Call the kernel multiple times asynchronously. Note: since they share the same |
|
|
|
|
// stream, they are not run in parallel, just queued on the device. |
|
|
|
|
KERNEL(times2, CudaTools::Kernel::basic(arrRange.shape().items()), arrRange); |
|
|
|
|
KERNEL(times2, CudaTools::Kernel::basic(arrConst.shape().items()), arrRange); |
|
|
|
|
KERNEL(times2, CudaTools::Kernel::basic(arrLinspace.shape().items()), arrRange).wait(); |
|
|
|
|
KERNEL(times2, CudaTools::Kernel::basic(arrComma.shape().items()), arrRange).wait(); |
|
|
|
|
// NOTE: Notice that a view is passed into the kernel, not the Array itself. |
|
|
|
|
KERNEL(times2, CudaTools::Kernel::basic(arrRange.shape().items()), arrRange.view()); |
|
|
|
|
KERNEL(times2, CudaTools::Kernel::basic(arrConst.shape().items()), arrConst.view()); |
|
|
|
|
KERNEL(times2double, CudaTools::Kernel::basic(arrLinspace.shape().items()), arrLinspace.view()); |
|
|
|
|
KERNEL(times2, CudaTools::Kernel::basic(arrComma.shape().items()), arrComma.view()).wait(); |
|
|
|
|
arrRange.updateHost(); |
|
|
|
|
arrConst.updateHost(); |
|
|
|
|
arrLinspace.updateHost(); |
|
|
|
|
arrComma.updateHost().wait(); // Only need to wait for the last one, since they have the same stream. |
|
|
|
|
arrComma.updateHost().wait(); // Same stream, so you should wait for the last call. |
|
|
|
|
|
|
|
|
|
std::cout << arrRange << "\n" << arrConst << "\n" << arrLinspace << "\n" << arrComma "\n"; |
|
|
|
|
std::cout << "After Kernel:\n"; |
|
|
|
|
std::cout << arrRange << "\n" << arrConst << "\n" << arrLinspace << "\n" << arrComma << "\n"; |
|
|
|
|
return 0; |
|
|
|
|
} |
|
|
|
|
|
|
|
|
|
In this example, we show a few ways to initialize an ``Array`` through some static functions. |
|
|
|
|
It is templated, so it can (theoretically) support any type. Additionally, you can initialize an |
|
|
|
|
empty ``Array`` by providing its ``Shape`` with an initializer list (ex: ``{2, 2}``). For more details, |
|
|
|
|
empty ``Array`` by providing its ``Shape`` with an initializer list (ex: ``{2, 2}``). Many of these |
|
|
|
|
array functions and initializers have view-returning and self-assigning versions. For instance, |
|
|
|
|
``.flattened()`` returns a flattened view of an Array, and does not modify the original. For more details, |
|
|
|
|
see :ref:`here <CudaTools::Array<T>>`. |
|
|
|
|
|
|
|
|
|
We also note the use of ``BASIC_LOOP(N)``, which is a macro for generating the loop automatically |
|
|
|
@ -175,28 +190,32 @@ on the kernel given the number of threads. It is intended to be used only for "e |
|
|
|
|
situations and with the ``CudaTools::Kernel::basic()`` launch parameters. If compiling for CPU, it will |
|
|
|
|
mark the loop with ``#pragma omp parallel for`` and attempt to use OpenMP for parallelism. |
|
|
|
|
|
|
|
|
|
.. warning:: |
|
|
|
|
Notice that a view must be passed to the kernel, and not the original object. This |
|
|
|
|
|
|
|
|
|
The Array also supports other helpful functions, such as multi-dimensional indexing, slicing, and |
|
|
|
|
a few other functions. |
|
|
|
|
|
|
|
|
|
.. code-block:: cpp |
|
|
|
|
|
|
|
|
|
int main() { |
|
|
|
|
CudaTools::Array<int> arr = CudaTools::Array<int>::constant(0); |
|
|
|
|
CudaTools::Array<int> arr = CudaTools::Array<int>::constant({100}, 0); |
|
|
|
|
arr.reshape({4, 5, 5}); // Creates a three dimensional array. |
|
|
|
|
|
|
|
|
|
arr[0][0][0] = 1; // Axis by axis indexing. |
|
|
|
|
arr[0][0][0] = 1; // Axis by axis indexing. |
|
|
|
|
arr[{1, 0, 0}] = 100; // Specific 'coordinate' indexing. |
|
|
|
|
std::cout << arr << "\n"; |
|
|
|
|
|
|
|
|
|
CudaTools::Array<int> arrRange = CudaTools::Array<int>::range(18); |
|
|
|
|
auto arrSlice = arr.slice({{1, 2}, {1, 4}, {1, 4}}). // Takes a slice of the center. |
|
|
|
|
CudaTools::Array<int> arrRange = CudaTools::Array<int>::range(0, 18); |
|
|
|
|
auto arrSlice = arr.slice({{1, 3}, {1, 4}, {1, 4}}); // Takes a slice of the center. |
|
|
|
|
std::cout << "Before Copy:\n" << arrSlice << "\n"; |
|
|
|
|
arrSlice = arrRange; // Copies arrRange into arrSlice. (Does NOT replace!) |
|
|
|
|
std::cout << "After Copy:\n" << arrSlice << "\n"; |
|
|
|
|
|
|
|
|
|
std::cout << "Modified: \n" << arr << "\n"; // The original array is modified, since a slice does not copy. |
|
|
|
|
std::cout << "Modified: \n" |
|
|
|
|
<< arr << "\n"; // The original array is modified, since a slice does not copy. |
|
|
|
|
|
|
|
|
|
CudaTools::Array<int> newArr = arr.copy(); // Copies the original Array. |
|
|
|
|
CudaTools::Array<int> newArr = arr.copy(); // Copies the original Array. |
|
|
|
|
for (auto it = newArr.begin(); it != newArr.end(); ++it) { // Iterate through the array. |
|
|
|
|
*it = 1; |
|
|
|
|
} |
|
|
|
|