CudaTools/include/Array.h

#ifndef CUDATOOLS_ARRAY_H
#define CUDATOOLS_ARRAY_H

#include "Core.h"
#include "Macros.h"
#include "Types.h"

#include <cmath>
#include <complex>
#include <cstdlib>
#include <iomanip>
#include <random>
#include <type_traits>

#ifdef CUDATOOLS_USE_EIGEN
#include <Eigen/Dense>
#endif

#ifdef CUDATOOLS_USE_PYTHON
#include <pybind11/numpy.h>
#include <pybind11/pybind11.h>
namespace py = pybind11;
#endif

#ifdef DEVICE
#define POINTER pDevice
#else
#define POINTER pHost
#endif

using namespace CudaTools::Types;

namespace CudaTools {

#ifdef CUDATOOLS_USE_EIGEN
template <typename T>
using EigenMat = Eigen::Matrix<T, Eigen::Dynamic, Eigen::Dynamic, Eigen::ColMajor>;
template <typename T> using EigenMapMat = Eigen::Map<EigenMat<T>>;
template <typename T> using ConstEigenMapMat = Eigen::Map<const EigenMat<T>>;

template <typename T> struct EigenAdaptConst_S { typedef EigenMapMat<T> type; };
template <typename T> struct EigenAdaptConst_S<const T> { typedef ConstEigenMapMat<T> type; };
template <typename T> using EigenAdaptConst = typename EigenAdaptConst_S<T>::type;
#endif

template <typename T> class Array;

struct Slice {
    uint32_t first;
    uint32_t second;
    HD Slice(const std::initializer_list<uint32_t> i)
        : first(*i.begin()), second(*(i.begin() + 1)) {}
};

template <typename T> class ArrayIterator {
  private:
    template <typename U>
    friend std::ostream& operator<<(std::ostream& out, const ArrayIterator<U>& it);
    T* pData;
    Shape mShape;
    uint32_t mIndices[CUDATOOLS_ARRAY_MAX_AXES] = {0};

  public:
    HD ArrayIterator(T* p, const Shape& shape) : pData(p), mShape(shape){};

    /**
     * Moves the iterator to the next value.
     */
    HD void next() {
        bool carry = false;
        uint32_t offset = 0;
        for (uint32_t iAxis = mShape.axes() - 1; iAxis < mShape.axes(); --iAxis) {
            if (mIndices[iAxis] == mShape.dim(iAxis) - 1) {
                mIndices[iAxis] = 0;
                offset += mShape.stride(iAxis) * (mShape.dim(iAxis) - 1);
                carry = true;
            } else {
                pData += mShape.stride(iAxis);
                mIndices[iAxis] += 1;
                carry = false;
            }

            if (not carry) {
                pData -= offset;
                return;
            }
        }
        pData += 1; // "Overflow" occured, so we reached end of array.
    }

    /**
     * Moves the iterator to the previous value.
     */
    HD void prev() {
        bool carry = false;
        uint32_t offset = 0;
        for (uint32_t iAxis = mShape.axes() - 1; iAxis < mShape.axes(); --iAxis) {
            if (mIndices[iAxis] == 0) {
                mIndices[iAxis] = mShape.dim(iAxis) - 1;
                offset += mShape.stride(iAxis) * (mShape.dim(iAxis) - 1);
                carry = true;
            } else {
                pData -= mShape.stride(iAxis);
                mIndices[iAxis] += 1;
                carry = false;
            }
            if (not carry) {
                pData += offset;
                return;
            }
        }
        pData -= 1;
    }

    /**
     * Moves the iterator a specified value away.
     * \param amount the amount to advance by
     */
    HD void advance(const int32_t amount) {
        if (amount < 0) {
            for (uint32_t i = 0; i < std::abs(amount); ++i) {
                prev();
            }
        } else {
            for (uint32_t i = 0; i < std::abs(amount); ++i) {
                next();
            }
        }
    }

    HD void operator++() { next(); }; /**< Prefix increment operator. */
    HD void operator--() { prev(); }; /**< Prefix decrement operator. */

    /**< Addition operator. */
    HD ArrayIterator<T> operator+(const int32_t v) const {
        ArrayIterator<T> it = *this;
        it.advance(v);
        return it;
    };

    /** Subtraction operator.*/
    HD ArrayIterator<T> operator-(const int32_t v) const {
        ArrayIterator<T> it = *this;
        it.advance(-v);
        return it;
    };
    HD void operator+=(const int32_t v) { advance(v); };
    HD void operator-=(const int32_t v) { advance(-v); };

    HD T& operator*() { return *pData; };             /**< Dereference operator. */
    HD const T& operator*() const { return *pData; }; /**< Const dereference operator. */

    /**
     * Equals operator.
     */
    HD bool operator==(const ArrayIterator<T>& it) { return pData == it.pData; }

    /**
     * Not equals operator.
     */
    HD bool operator!=(const ArrayIterator<T>& it) { return pData != it.pData; }
};

template <typename T> std::ostream& operator<<(std::ostream& out, const ArrayIterator<T>& it) {
    return out << it.pData;
}

template <typename T> class ArrayLoader {
  private:
    ArrayIterator<T> mIterator;
    ArrayIterator<T> mIteratorEnd;

  public:
    HD ArrayLoader(const ArrayIterator<T>& it, const ArrayIterator<T>& it_end)
        : mIterator(it), mIteratorEnd(it_end){};
    HD ArrayLoader &operator,(const T value) {
        CT_ERROR_IF(mIterator, ==, mIteratorEnd, "Cannot assign more values than Array size");
        *mIterator = value;
        ++mIterator;
        return *this;
    }
};

/**
 * A container that holds a N-dimensional array, stored column major. To set the
 * maximum N, there is a compiler macro CUDATOOLS_ARRAY_MAX_DIM whose default value is 4.
 * It adapts to operations between host and device to ease memory management.
 */
template <typename T> class Array {
  private:
    template <typename U> friend std::ostream& operator<<(std::ostream&, const Array<U>&);

    Shape mShape;
    T* pHost = nullptr;
    T* pDevice = nullptr;

    bool mIsView = false;
    bool mIsSlice = false;

    uint32_t mEndOffset = 0;

    HD void freeArrays() {
#ifndef DEVICE
        if (not mIsView) {
            if (pDevice != nullptr) CudaTools::free(pDevice);
            if (pHost != nullptr) delete[] pHost;
        }
#endif
    };

    HD void calcEnd() {
        uint32_t offset = 0;
        for (uint32_t i = 0; i < shape().axes(); ++i) {
            offset += (shape().dim(i) - 1) * shape().stride(i);
        }
        mEndOffset = offset + 1;
    };

  public:
    HD Array() = default;

    /**
     * Constructor for an Array that creates an allocates an array with
     * the specified Shape. Construction in this format is disabled on the device.
     * \brief Host only
     * \param shape the shape of the array
     * \param noDevice whether to initialize the array on the device
     */
    Array(const Shape& shape, const bool noDevice = false) : mShape(shape), mIsView(false) {
        pHost = new T[shape.items()];
        calcEnd();
        if (noDevice) return;
        pDevice = reinterpret_cast<T*>(CudaTools::malloc(shape.items() * sizeof(T)));
    };

    /**
     * Constructor for an Array from an existing (preallocated) pointer.
     * \param pointer the pointer to use
     * \param shape the shape of the array
     * \param noDevice whether to initialize the array on the device
     */
    HD Array(T* const pointer, const Shape& shape, const bool noDevice = false)
        : mShape(shape), mIsView(true), mIsSlice(false) {
        POINTER = pointer;
        calcEnd();
#ifndef DEVICE
        if (noDevice) return;
        pDevice = reinterpret_cast<T*>(CudaTools::malloc(shape.items() * sizeof(T)));
#endif
    };

    /**
     * Constructor for making a Array view from another Array,
     * given an offset and shape.
     * \param arr the original Array
     * \param shape the shape of the new array
     * \param offset the index where to start the a view of the array
     */
    HD Array(const Array& arr, const Shape& shape, const uint32_t offset = 0)
        : mShape(shape), pHost(arr.pHost), pDevice(arr.pDevice), mIsView(true),
          mIsSlice(arr.mIsSlice) {
        calcEnd();
        if (pHost != nullptr) pHost += offset;
        if (pDevice != nullptr) pDevice += offset;
    };

    /**
     * The copy-constructor for a Array. If this is not a view, a deep copy
     * of the data will be performed on both host and device. On the device, it is always
     * treated like a view.
     */
    HD Array(const Array& arr) : mShape(arr.mShape), mIsView(arr.mIsView), mIsSlice(arr.mIsSlice) {
        calcEnd();
        if (mIsView) { // If the other array was a view (and now this one), just assign.
            pHost = arr.pHost;
            pDevice = arr.pDevice;
            return;
        }

        // Otherwise, we assume this is needs to own data.
        pHost = new T[mShape.items()];
        auto arr_it = arr.begin();
        for (auto it = begin(); it != end(); ++it) {
            *it = *arr_it;
            ++arr_it;
        }

#ifndef DEVICE
        if (arr.pDevice != nullptr) {
            pDevice = (T*)CudaTools::malloc(mShape.items() * sizeof(T));
        }
#endif
    };

    /**
     *  The move-constructor for a Array.
     */
    HD Array(Array&& arr)
        : mShape(arr.mShape), pHost(arr.pHost), pDevice(arr.pDevice), mIsView(arr.mIsView),
          mIsSlice(arr.mIsSlice) {
        calcEnd();
        // Make other object empty.
        arr.pHost = nullptr;
        arr.pDevice = nullptr;
        arr.mIsView = true;
    };

    HD ~Array() { freeArrays(); };

    /**
     * The copy-assignment operator for a Array. If this is not a view,
     * then the currently owned data will be freed, and a deep copy of the data will
     * be performed on both host and device. On the device, it is always treated like a view.
     */
    HD Array& operator=(const Array& arr) {
        if (this == &arr) return *this;

        if (mIsView) { // If this array is a view, we assign data from the right-hand side.
            auto arr_it = arr.begin();
            for (auto it = begin(); it != end() and arr_it != arr.end(); ++it) {
                *it = *arr_it;
                ++arr_it;
            }
            return *this;
        }

        // Otherwise, it is implied to be object reassignment.
        mShape = arr.mShape;
        mIsView = arr.mIsView;
        mIsSlice = arr.mIsSlice;
        calcEnd();

        // Regardless if the right-hand side is a view, we create a new copy.
        // In case that the right-hand side is a view of this array, we
        // allocate memory to copy first. Keep in mind that the right-hand side
        // array will then become undefined.

        // We can only do this on the host.
#ifndef DEVICE
        T* new_pDevice = nullptr;
        if (pDevice != nullptr) {
            new_pDevice = (T*)CudaTools::malloc(mShape.items() * sizeof(T));
        }

        T* new_pHost = new T[mShape.items()];
        memcpy(new_pHost, arr.pHost, mShape.items() * sizeof(T));

        freeArrays();
        pHost = new_pHost;
        pDevice = new_pDevice;
#else
        pHost = arr.pHost;
        pDevice = arr.pDevice;
#endif
        return *this;
    };

    /**
     * The move-assignment operator for a Array.
     */
    HD Array& operator=(Array&& arr) {
        if (this == &arr) return *this;

        if (mIsView) { // If this array is a view, we assign data from the right-hand side.
            auto arr_it = arr.begin();
            for (auto it = begin(); it != end() and arr_it != arr.end(); ++it) {
                *it = *arr_it;
                ++arr_it;
            }
            return *this;
        }

        CT_ERROR(arr.mIsView,
                 "Cannot move-assign view to a non-view (owner). This would lead to undefined "
                 "behavior.");

        // Otherwise, it is implied to be object reassignment.
        freeArrays();
        mShape = arr.mShape;
        pHost = arr.pHost;
        pDevice = arr.pDevice;
        mIsView = arr.mIsView;
        mIsSlice = arr.mIsSlice;
        calcEnd();

        // Make other array empty.
        arr.pHost = nullptr;
        arr.pDevice = nullptr;
        arr.mIsView = true;
        return *this;
    };

    /**
     * Used for indexing the Array.
     * \param index index of the first dimension
     */
    HD Array operator[](const uint32_t index) const {
        CT_ERROR_IF(index, >=, shape().dim(0), "Index exceeds axis size");
        return Array(*this, shape().subshape(1), index * shape().stride(0));
    };

    /**
     * Used for indexing the Array.
     * \param indices a list of indices to index the Array
     */
    HD Array operator[](const std::initializer_list<uint32_t> indices) const {
        CT_ERROR_IF(indices.size(), >, shape().axes(),
                    "Number of indices cannot exceed number of axes");
        auto it = indices.begin();
        uint offset = 0;
        for (uint32_t i = 0; i < indices.size(); ++i) {
            uint32_t index = *it;
            CT_ERROR_IF(index, >=, shape().dim(i), "Index exceeds axis size");
            offset += index * shape().stride(i);
            ++it;
        }
        return Array(*this, shape().subshape(indices.size()), offset);
    };

    HD ArrayLoader<T> operator<<(const T value) {
        auto it = begin();
        *it = value;
        ++it;
        return ArrayLoader<T>(it, end());
    };

    HD T operator=(const T& value) { return POINTER[0] = value; };
    HD operator T&() { return POINTER[0]; };
    HD operator const T&() const { return POINTER[0]; };

    /**
     * Used to create slices of the Array.
     * \param slices a list of slices to slice the Array
     */
    HD Array slice(const std::initializer_list<Slice> slices) const {
        CT_ERROR_IF(slices.size(), >, shape().axes(),
                    "Number of slices cannot exceed number of axes");

        uint offset = 0;
        Shape new_shape = mShape;
        auto it = slices.begin();
        for (uint32_t i = 0; i < slices.size(); ++i) {
            uint32_t from_index = it->first;
            uint32_t to_index = it->second;
            CT_ERROR_IF(from_index, >, to_index,
                        "Slice start cannot be greater than than slice end");
            CT_ERROR_IF(from_index, >=, shape().dim(i), "Slice start exceeds axis size");
            CT_ERROR_IF(to_index - 1, >=, shape().dim(i), "Slice end exceeds axis size");

            offset += from_index * shape().stride(i);
            new_shape.mAxisDim[i] = to_index - from_index;
            ++it;
        }
        new_shape.mItems = 1;
        for (uint32_t i = 0; i < shape().axes(); ++i) {
            new_shape.mItems *= new_shape.dim(i);
        }

        Array<T> arr(*this, new_shape, offset);
        arr.mIsSlice = true;
        return arr;
    };

    /**
     * Returns this Array with a different Shape. Its self assigning version is reshape.
     * If this Array is a slice of another, then it will perform a deep copy, and return
     * a new non-view array.
     */
    HD Array reshaped(const Shape& new_shape) const {
        CT_ERROR_IF(shape().items(), !=, new_shape.items(),
                    "New shape cannot have a different number of terms");
        CT_ERROR(mIsSlice, "Cannot reshape slice, a new array must be made. (Try copy first)")
        Array<T> arr = view();
        arr.mShape = new_shape;
        return arr;
    };

    HD void reshape(const Shape& new_shape) {
        CT_ERROR_IF(shape().items(), !=, new_shape.items(),
                    "New shape cannot have a different number of terms");
        CT_ERROR(mIsSlice, "Cannot reshape slice, a new array must be made. (Try copy first)")
        mShape = new_shape;
    };

    /**
     * Gets a view that is has at least two dimensions. Useful for promoting
     * single vectors to their 2D counterparts.
     */
    HD Array atLeast2D() const {
        return (shape().axes() == 1) ? reshaped({shape().length(), 1}) : view();
    };

    /**
     * Reshapes this array, making it at least 2D. Useful for promoting
     * single vectors to their 2D counterparts.
     */
    HD void asAtLeast2D() {
        if (shape().axes() == 1) reshape({shape().length(), 1});
    };

    /**
     * Returns a view of this Array that has been flattened into one dimension.
     */
    HD Array flattened() const { return reshaped({mShape.mItems}); };

    /**
     * Flattens this Array into one dimension.
     */
    HD void flatten() { reshape({mShape.mItems}); };

#ifdef CUDATOOLS_USE_EIGEN
    /**
     * Returns the Eigen::Map of this Array.
     */
    EigenAdaptConst<ComplexConversion<T>> eigenMap() const {
        uint32_t total_dim = mShape.mAxes;
        CT_ERROR(mIsSlice, "Mapping to an Eigen array cannot occur on slices")
        CT_ERROR_IF(total_dim, !=, 2,
                    "Mapping to an Eigen array can only occur on two-dimensional arrays");
        return EigenAdaptConst<ComplexConversion<T>>((ComplexConversion<T>*)POINTER, mShape.rows(),
                                                     mShape.cols());
    };
#endif

    /**
     * Gets the Shape of the Array.
     */
    HD Shape shape() const { return mShape; };

    /**
     * Gets the pointer to this array, depending on host or device.
     */
    HD ComplexConversion<T>* data() const { return (ComplexConversion<T>*)POINTER; };

    /**
     * Returns the device pointer regardless of host or device.
     */
    HD T* dataDevice() const { return pDevice; };

    HD bool isView() const { return mIsView; };   /**< Gets whether this Array is a view. */
    HD bool isSlice() const { return mIsSlice; }; /**< Gets whether this Array is a slice. */

    /**
     * Gets a view of this Array.
     */
    HD Array view() const { return Array(*this, mShape); }

    /**
     * Copies this Array and returns a new Array with the same memory.
     */
    Array copy() const {
        Array<T> arr(mShape, (pDevice == nullptr));

        auto arr_it = arr.begin();
        for (auto it = begin(); it != end(); ++it) {
            *arr_it = *it;
            ++arr_it;
        }
#ifndef DEVICE
        if (pDevice != nullptr) {
            CudaTools::copy(pDevice, arr.dataDevice(), mShape.items() * sizeof(T)).wait();
        }
#endif
        return arr;
    };

    /**
     * Gets the iterator to the beginning of this Array.
     */
    HD ArrayIterator<T> begin() const { return ArrayIterator<T>(POINTER, mShape); };

    /**
     * Gets the iterator to the end of this Array.
     */
    HD ArrayIterator<T> end() const { return ArrayIterator<T>(POINTER + mEndOffset, mShape); };

    /**
     * Sets the values of the entire Array to a constant. This is restricted to numerical types.
     */
    HD void setConstant(const T value) const {
        static_assert(is_host_num<T>, "Function only available on host-compatible numeric types.");
        for (auto it = begin(); it != end(); ++it) {
            *it = value;
        }
    };

    /**
     * Sets the Array values with uniform random values in a specified range. This is restricted to
     * numerical types.
     * \brief Host only
     */
    void setRandom(const T min, const T max) const {
        static_assert(is_host_num<T>, "Function only available on host-compatible numeric types.");
        if constexpr (is_complex<T>) {
            CT_ERROR_IF(max.real(), <, min.real(),
                        "Upper bound of range cannot be larger than lower bound");
            CT_ERROR_IF(max.imag(), <, min.imag(),
                        "Upper bound of range cannot be larger than lower bound");
        } else {
            CT_ERROR_IF(max, <, min, "Upper bound of range cannot be larger than lower bound");
        }
        std::random_device rd;
        std::mt19937 mt(rd());
        if constexpr (is_int<T>) {
            std::uniform_int_distribution<T> dist(min, max);
            for (auto it = begin(); it != end(); ++it) {
                *it = dist(mt);
            }
        } else if constexpr (is_float<T>) {
            std::uniform_real_distribution<T> dist(min, max);
            for (auto it = begin(); it != end(); ++it) {
                *it = dist(mt);
            }
        } else if constexpr (is_complex<T>) {
            std::uniform_real_distribution<ComplexUnderlying<T>> distr(min.real(), max.real());
            std::uniform_real_distribution<ComplexUnderlying<T>> disti(min.imag(), max.imag());
            for (auto it = begin(); it != end(); ++it) {
                *it = T(distr(mt), disti(mt));
            }
        }
    };

    /**
     * Sets the Array values to start from a value and increment by a specified step. This is
     * restricted to numerical types.
     */
    HD void setRange(T min, const T step = 1) const {
        static_assert(is_host_num<T>, "Function only available on host-compatible numeric types.");
        for (auto it = begin(); it != end(); ++it) {
            *it = min;
            min += step;
        }
    }
    /**
     * Sets the Array values to be evenly spaced numbers over a given interval. This is restricted
     * to floating point types.
     */
    HD void setLinspace(const T min, const T max) const {
        static_assert(is_float<T>, "Function only available on numeric floating types.");
        CT_ERROR_IF(max, <, min, "Upper bound of range cannot be larger than lower bound");
        T i = 0;
        T d = max - min;
        T items = (T)(shape().items() - 1);
        for (auto it = begin(); it != end(); ++it) {
            *it = min + d * (i / items);
            i += 1;
        }
    };

    /**
     * Returns array of given shape with constant values. This is restricted to numerical types.
     * \brief Host only
     */
    static Array constant(const Shape& shape, const T value) {
        static_assert(is_host_num<T>, "Function only available on host-compatible numeric types.");
        Array<T> arr(shape);
        arr.setConstant(value);
        return arr;
    };

    /**
     * Returns array of given shape with random values in given interval. This is restricted to
     * numerical types.
     * \brief Host only
     */
    static Array random(const Shape& shape, const T min, const T max) {
        static_assert(is_host_num<T>, "Function only available on host-compatible numeric types.");
        Array<T> arr(shape);
        arr.setRandom(min, max);
        return arr;
    };

    /**
     * Returns evenly spaced values within a given interval. This is restricted to numerical types.
     * \brief Host only
     */
    static Array range(const T min, const T max, const T step = 1) {
        static_assert(is_host_num<T>, "Function only available on host-compatible numeric types.");
        CT_ERROR_IF(max, <, min, "Upper bound of range cannot be larger than lower bound");
        Array<T> arr({(uint32_t)((max - min) / step)});
        arr.setRange(min, step);
        return arr;
    }

    /**
     * Returns evenly spaced values within a given interval. This is restricted to floating point
     * types.
     * \brief Host only
     */
    static Array linspace(const T min, const T max, const uint32_t size) {
        static_assert(is_float<T>, "Function only available on numeric floating types.");
        Array<T> arr({size});
        arr.setLinspace(min, max);
        return arr;
    }

    /**
     * Transposes the internal data and returns the corresponding new Array.
     * Its self assigning version is transpose. This is restricted to numerical types.
     * \brief Host only
     */

#ifdef CUDATOOLS_USE_EIGEN
    Array transposed() const {
        static_assert(is_host_num<T>, "Function only available on host-compatible numeric types.");
        CT_ERROR_IF(shape().axes(), !=, 2, "Tranpose can only occur on two-dimensional arrays");
        Array<T> new_arr({mShape.rows(), mShape.cols()});
        new_arr.eigenMap() = this->eigenMap().transpose().eval();
        return new_arr;
    };

    /**
     * Transposes the intenal data. Its self assigning version is transpose.
     * This is restricted to numerical types.
     * \brief Host only
     */
    void transpose() {
        static_assert(is_host_num<T>, "Function only available on host-compatible numeric types.");
        CT_ERROR_IF(shape().axes(), !=, 2, "Tranpose can only occur on two-dimensional arrays");
        Array<T> new_arr(*this, {mShape.cols(), mShape.rows()});
        new_arr.eigenMap() = this->eigenMap().transpose().eval();
        mShape = Shape({mShape.cols(), mShape.rows()});
    };

    void inverse() const {
        static_assert(is_float<T>, "Function only available on floating numeric types.");
        CT_ERROR_IF(shape().axes(), !=, 2, "Inverse can only occur on two-dimensional arrays");
        CT_ERROR_IF(shape().rows(), !=, shape().cols(),
                    "Inverse can only occur on square matrices");
        Array<T> inv(shape());
        inv.eigenMap() = this->eigenMap().inverse();
    };
#endif

    /**
     * Pins the memory (page locks) for faster memory transfer in concurrent
     * transfers.
     * \brief Host only
     */
    void pinMemory() const { CudaTools::pin(pHost, mShape.items() * sizeof(T)); };

    /**
     * Updates the host copy by copying the device data back to the host.
     * \brief Host only
     */
    StreamID updateHost(const StreamID& stream = DEF_MEM_STREAM) const {
        CT_ERROR(mIsSlice, "Cannot update host copy on a slice");
        return CudaTools::copy(pDevice, pHost, mShape.items() * sizeof(T), stream);
    };

    /**
     * Updates the device copy by copying the host data to the device.
     * \brief Host only
     */
    StreamID updateDevice(const StreamID& stream = DEF_MEM_STREAM) const {
        CT_ERROR(mIsSlice, "Cannot update device copy on a slice");
        return CudaTools::copy(pHost, pDevice, mShape.items() * sizeof(T), stream);
    };

#ifdef CUDATOOLS_USE_PYTHON
    /**
     * Returns a py::array for making an Array available as a Python numpy array.
     */
    py::array pyArray() const {
        std::vector<py::ssize_t> dims, strides;
        for (uint iAxis = 0; iAxis < mShape.axes(); ++iAxis) {
            dims.push_back(static_cast<py::ssize_t>(mShape.dim(iAxis)));
            strides.push_back(sizeof(T) * static_cast<py::ssize_t>(mShape.stride(iAxis)));
        }
        return py::array_t<T, py::array::f_style>(
            py::buffer_info((void*)pHost, sizeof(T), py::format_descriptor<T>::format(),
                            static_cast<py::ssize_t>(mShape.axes()), dims, strides));
    };
#endif
};

template <typename T>
void printAxis(std::ostream& out, const Array<T>& arr, const uint32_t axis, size_t width) {
    std::string space = std::string(2 * axis, ' ');
    if (arr.shape().axes() == 1) {
        out << "[";
        for (uint32_t i = 0; i < arr.shape().items(); ++i) {
            if constexpr (std::is_floating_point<T>::value) {
                out << std::scientific << std::setprecision(6);
            }
            if (width == 0) {
                out << ((i == 0) ? "" : " ");
            } else {
                out << std::setw((i == 0) ? width - 1 : width);
            }
            out << static_cast<T>(arr[i]) << ((i == arr.shape().items() - 1) ? "]" : ",");
        }
    } else if (arr.shape().axes() == 2) {
        for (uint32_t i = 0; i < arr.shape().dim(0); ++i) {
            out << space << ((i == 0) ? "[" : " ");
            printAxis(out, arr[i], axis + 1, width);
            out << ((i == arr.shape().dim(0) - 1) ? "]" : ",\n");
        }
    } else {
        out << space << "[\n";
        for (uint32_t i = 0; i < arr.shape().dim(0); ++i) {
            printAxis(out, arr[i], axis + 1, width);
            out << ((i == arr.shape().dim(0) - 1) ? "\n" : ",\n\n");
        }
        out << space << "]";
    }
}

template <typename T> std::ostream& operator<<(std::ostream& out, const Array<T>& arr) {
    size_t width = 0;
    if constexpr (is_int<T>) {
        T max_val = 0;
        bool negative = false;
        for (auto it = arr.begin(); it != arr.end(); ++it) {
            T val = *it;
            if constexpr (not std::is_unsigned<T>::value) {
                if (*it < 0) {
                    negative = true;
                    val *= -1;
                }
            }
            max_val = (val > max_val) ? val : max_val;
        }
        width = std::to_string(max_val).size() + 1;
        width += (negative) ? 1 : 0;
    } else if constexpr (is_float<T>) {
        T max_val = 0;
        bool negative = false;
        for (auto it = arr.begin(); it != arr.end(); ++it) {
            if (*it < 0) negative = true;
            int exp = 0;
            frexp(*it, &exp);
            max_val = (exp > max_val) ? exp : max_val;
        }
        width = std::to_string(max_val).size() + 5;
        width += (negative) ? 1 : 0;
    }

    printAxis<T>(out, arr, 0, (arr.shape().axes() == 1) ? 0 : width);
    return out;
}

}; // namespace CudaTools

#endif // ARRAY_H