// This file is part of Eigen, a lightweight C++ template library
// for linear algebra.
//
// Copyright (C) 2015
// Mehdi Goli    Codeplay Software Ltd.
// Ralph Potter  Codeplay Software Ltd.
// Luke Iwanski  Codeplay Software Ltd.
// Contact: <eigen@codeplay.com>
//
// This Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.

#define EIGEN_TEST_NO_LONGDOUBLE
#define EIGEN_TEST_NO_COMPLEX

#define EIGEN_DEFAULT_DENSE_INDEX_TYPE int64_t
#define EIGEN_USE_SYCL
#define EIGEN_HAS_CONSTEXPR 1

#include "main.h"

#include <unsupported/Eigen/CXX11/Tensor>

template <typename DataType, int DataLayout, typename IndexType>
static void test_full_reductions_sum_sycl(
    const Eigen::SyclDevice& sycl_device) {
  const IndexType num_rows = 753;
  const IndexType num_cols = 537;
  array<IndexType, 2> tensorRange = {{num_rows, num_cols}};

  array<IndexType, 2> outRange = {{1, 1}};

  Tensor<DataType, 2, DataLayout, IndexType> in(tensorRange);
  Tensor<DataType, 2, DataLayout, IndexType> full_redux(outRange);
  Tensor<DataType, 2, DataLayout, IndexType> full_redux_gpu(outRange);

  in.setRandom();
  auto dim = DSizes<IndexType, 2>(1, 1);
  full_redux = in.sum().reshape(dim);

  DataType* gpu_in_data = static_cast<DataType*>(
      sycl_device.allocate(in.dimensions().TotalSize() * sizeof(DataType)));
  DataType* gpu_out_data = (DataType*)sycl_device.allocate(
      sizeof(DataType) * (full_redux_gpu.dimensions().TotalSize()));

  TensorMap<Tensor<DataType, 2, DataLayout, IndexType>> in_gpu(gpu_in_data,
                                                               tensorRange);
  TensorMap<Tensor<DataType, 2, DataLayout, IndexType>> out_gpu(gpu_out_data,
                                                                outRange);

  sycl_device.memcpyHostToDevice(
      gpu_in_data, in.data(), (in.dimensions().TotalSize()) * sizeof(DataType));
  out_gpu.device(sycl_device) = in_gpu.sum().reshape(dim);
  sycl_device.memcpyDeviceToHost(
      full_redux_gpu.data(), gpu_out_data,
      (full_redux_gpu.dimensions().TotalSize()) * sizeof(DataType));
  // Check that the CPU and GPU reductions return the same result.
  std::cout << "SYCL FULL :" << full_redux_gpu(0, 0)
            << ", CPU FULL: " << full_redux(0, 0) << "\n";
  VERIFY_IS_APPROX(full_redux_gpu(0, 0), full_redux(0, 0));
  sycl_device.deallocate(gpu_in_data);
  sycl_device.deallocate(gpu_out_data);
}

template <typename DataType, int DataLayout, typename IndexType>
static void test_full_reductions_sum_with_offset_sycl(
    const Eigen::SyclDevice& sycl_device) {
  using data_tensor = Tensor<DataType, 2, DataLayout, IndexType>;
  using scalar_tensor = Tensor<DataType, 0, DataLayout, IndexType>;
  const IndexType num_rows = 64;
  const IndexType num_cols = 64;
  array<IndexType, 2> tensor_range = {{num_rows, num_cols}};
  const IndexType n_elems = internal::array_prod(tensor_range);

  data_tensor in(tensor_range);
  scalar_tensor full_redux;
  scalar_tensor full_redux_gpu;

  in.setRandom();
  array<IndexType, 2> tensor_offset_range(tensor_range);
  tensor_offset_range[0] -= 1;

  const IndexType offset = 64;
  TensorMap<data_tensor> in_offset(in.data() + offset, tensor_offset_range);
  full_redux = in_offset.sum();

  DataType* gpu_in_data =
      static_cast<DataType*>(sycl_device.allocate(n_elems * sizeof(DataType)));
  DataType* gpu_out_data =
      static_cast<DataType*>(sycl_device.allocate(sizeof(DataType)));

  TensorMap<data_tensor> in_gpu(gpu_in_data + offset, tensor_offset_range);
  TensorMap<scalar_tensor> out_gpu(gpu_out_data);
  sycl_device.memcpyHostToDevice(gpu_in_data, in.data(),
                                 n_elems * sizeof(DataType));

  out_gpu.device(sycl_device) = in_gpu.sum();
  sycl_device.memcpyDeviceToHost(full_redux_gpu.data(), gpu_out_data,
                                 sizeof(DataType));

  // Check that the CPU and GPU reductions return the same result.
  VERIFY_IS_APPROX(full_redux_gpu(), full_redux());

  sycl_device.deallocate(gpu_in_data);
  sycl_device.deallocate(gpu_out_data);
}

template <typename DataType, int DataLayout, typename IndexType>
static void test_full_reductions_max_sycl(
    const Eigen::SyclDevice& sycl_device) {
  const IndexType num_rows = 4096;
  const IndexType num_cols = 4096;
  array<IndexType, 2> tensorRange = {{num_rows, num_cols}};

  Tensor<DataType, 2, DataLayout, IndexType> in(tensorRange);
  Tensor<DataType, 0, DataLayout, IndexType> full_redux;
  Tensor<DataType, 0, DataLayout, IndexType> full_redux_gpu;

  in.setRandom();

  full_redux = in.maximum();

  DataType* gpu_in_data = static_cast<DataType*>(
      sycl_device.allocate(in.dimensions().TotalSize() * sizeof(DataType)));
  DataType* gpu_out_data = (DataType*)sycl_device.allocate(sizeof(DataType));

  TensorMap<Tensor<DataType, 2, DataLayout, IndexType>> in_gpu(gpu_in_data,
                                                               tensorRange);
  TensorMap<Tensor<DataType, 0, DataLayout, IndexType>> out_gpu(gpu_out_data);

  sycl_device.memcpyHostToDevice(
      gpu_in_data, in.data(), (in.dimensions().TotalSize()) * sizeof(DataType));
  out_gpu.device(sycl_device) = in_gpu.maximum();
  sycl_device.memcpyDeviceToHost(full_redux_gpu.data(), gpu_out_data,
                                 sizeof(DataType));
  VERIFY_IS_APPROX(full_redux_gpu(), full_redux());
  sycl_device.deallocate(gpu_in_data);
  sycl_device.deallocate(gpu_out_data);
}

template <typename DataType, int DataLayout, typename IndexType>
static void test_full_reductions_max_with_offset_sycl(
    const Eigen::SyclDevice& sycl_device) {
  using data_tensor = Tensor<DataType, 2, DataLayout, IndexType>;
  using scalar_tensor = Tensor<DataType, 0, DataLayout, IndexType>;
  const IndexType num_rows = 64;
  const IndexType num_cols = 64;
  array<IndexType, 2> tensor_range = {{num_rows, num_cols}};
  const IndexType n_elems = internal::array_prod(tensor_range);

  data_tensor in(tensor_range);
  scalar_tensor full_redux;
  scalar_tensor full_redux_gpu;

  in.setRandom();
  array<IndexType, 2>
      tensor_offset_range(tensor_range);
  tensor_offset_range[0] -= 1;
  // Set the initial value to be the max.
  // As we don't include this in the reduction the result should not be 2.
  in(0) = static_cast<DataType>(2);

  const IndexType offset = 64;
  TensorMap<data_tensor> in_offset(in.data() + offset, tensor_offset_range);
  full_redux = in_offset.maximum();
  VERIFY_IS_NOT_EQUAL(full_redux(), in(0));

  DataType* gpu_in_data =
      static_cast<DataType*>(sycl_device.allocate(n_elems * sizeof(DataType)));
  DataType* gpu_out_data =
      static_cast<DataType*>(sycl_device.allocate(sizeof(DataType)));

  TensorMap<data_tensor> in_gpu(gpu_in_data + offset, tensor_offset_range);
  TensorMap<scalar_tensor> out_gpu(gpu_out_data);
  sycl_device.memcpyHostToDevice(gpu_in_data, in.data(),
                                 n_elems * sizeof(DataType));

  out_gpu.device(sycl_device) = in_gpu.maximum();
  sycl_device.memcpyDeviceToHost(full_redux_gpu.data(), gpu_out_data,
                                 sizeof(DataType));

  // Check that the CPU and GPU reductions return the same result.
  VERIFY_IS_APPROX(full_redux_gpu(), full_redux());

  sycl_device.deallocate(gpu_in_data);
  sycl_device.deallocate(gpu_out_data);
}

template <typename DataType, int DataLayout, typename IndexType>
static void test_full_reductions_mean_sycl(
    const Eigen::SyclDevice& sycl_device) {
  const IndexType num_rows = 4096;
  const IndexType num_cols = 4096;
  array<IndexType, 2> tensorRange = {{num_rows, num_cols}};
  array<IndexType, 1> argRange = {{num_cols}};
  Eigen::array<IndexType, 1> red_axis;
  red_axis[0] = 0;
  // red_axis[1]=1;
  Tensor<DataType, 2, DataLayout, IndexType> in(tensorRange);
  Tensor<DataType, 2, DataLayout, IndexType> in_arg1(tensorRange);
  Tensor<DataType, 2, DataLayout, IndexType> in_arg2(tensorRange);
  Tensor<bool, 1, DataLayout, IndexType> out_arg_cpu(argRange);
  Tensor<bool, 1, DataLayout, IndexType> out_arg_gpu(argRange);
  Tensor<bool, 1, DataLayout, IndexType> out_arg_gpu_helper(argRange);
  Tensor<DataType, 0, DataLayout, IndexType> full_redux;
  Tensor<DataType, 0, DataLayout, IndexType> full_redux_gpu;

  in.setRandom();
  in_arg1.setRandom();
  in_arg2.setRandom();

  DataType* gpu_in_data = static_cast<DataType*>(
      sycl_device.allocate(in.dimensions().TotalSize() * sizeof(DataType)));
  DataType* gpu_in_arg1_data = static_cast<DataType*>(sycl_device.allocate(
      in_arg1.dimensions().TotalSize() * sizeof(DataType)));
  DataType* gpu_in_arg2_data = static_cast<DataType*>(sycl_device.allocate(
      in_arg2.dimensions().TotalSize() * sizeof(DataType)));
  bool* gpu_out_arg__gpu_helper_data = static_cast<bool*>(sycl_device.allocate(
      out_arg_gpu.dimensions().TotalSize() * sizeof(DataType)));
  bool* gpu_out_arg_data = static_cast<bool*>(sycl_device.allocate(
      out_arg_gpu.dimensions().TotalSize() * sizeof(DataType)));
  DataType* gpu_out_data = (DataType*)sycl_device.allocate(sizeof(DataType));

  TensorMap<Tensor<DataType, 2, DataLayout, IndexType>> in_gpu(gpu_in_data,
                                                               tensorRange);
  TensorMap<Tensor<DataType, 2, DataLayout, IndexType>> in_Arg1_gpu(
      gpu_in_arg1_data, tensorRange);
  TensorMap<Tensor<DataType, 2, DataLayout, IndexType>> in_Arg2_gpu(
      gpu_in_arg2_data, tensorRange);
  TensorMap<Tensor<bool, 1, DataLayout, IndexType>> out_Argout_gpu(
      gpu_out_arg_data, argRange);
  TensorMap<Tensor<bool, 1, DataLayout, IndexType>> out_Argout_gpu_helper(
      gpu_out_arg__gpu_helper_data, argRange);
  TensorMap<Tensor<DataType, 0, DataLayout, IndexType>> out_gpu(gpu_out_data);

  // CPU VERSION
  out_arg_cpu =
      (in_arg1.argmax(1) == in_arg2.argmax(1))
          .select(out_arg_cpu.constant(true), out_arg_cpu.constant(false));
  full_redux = (out_arg_cpu.template cast<float>())
                   .reduce(red_axis,
                           Eigen::internal::MeanReducer<DataType>());

  // GPU VERSION
  sycl_device.memcpyHostToDevice(
      gpu_in_data, in.data(), (in.dimensions().TotalSize()) * sizeof(DataType));
  sycl_device.memcpyHostToDevice(
      gpu_in_arg1_data, in_arg1.data(),
      (in_arg1.dimensions().TotalSize()) * sizeof(DataType));
  sycl_device.memcpyHostToDevice(
      gpu_in_arg2_data, in_arg2.data(),
      (in_arg2.dimensions().TotalSize()) * sizeof(DataType));
  out_Argout_gpu_helper.device(sycl_device) =
      (in_Arg1_gpu.argmax(1) == in_Arg2_gpu.argmax(1));
  out_Argout_gpu.device(sycl_device) =
      (out_Argout_gpu_helper)
          .select(out_Argout_gpu.constant(true),
                  out_Argout_gpu.constant(false));
  out_gpu.device(sycl_device) =
      (out_Argout_gpu.template cast<float>())
          .reduce(red_axis, Eigen::internal::MeanReducer<DataType>());
  sycl_device.memcpyDeviceToHost(full_redux_gpu.data(), gpu_out_data,
                                 sizeof(DataType));
  // Check that the CPU and GPU reductions return the same result.
  std::cout << "SYCL : " << full_redux_gpu() << " , CPU : " << full_redux()
            << '\n';
  VERIFY_IS_EQUAL(full_redux_gpu(), full_redux());
  sycl_device.deallocate(gpu_in_data);
  sycl_device.deallocate(gpu_in_arg1_data);
  sycl_device.deallocate(gpu_in_arg2_data);
  sycl_device.deallocate(gpu_out_arg__gpu_helper_data);
  sycl_device.deallocate(gpu_out_arg_data);
  sycl_device.deallocate(gpu_out_data);
}

template <typename DataType, int DataLayout, typename IndexType>
static void test_full_reductions_mean_with_offset_sycl(
    const Eigen::SyclDevice& sycl_device) {
  using data_tensor = Tensor<DataType, 2, DataLayout, IndexType>;
  using scalar_tensor = Tensor<DataType, 0, DataLayout, IndexType>;
  const IndexType num_rows = 64;
  const IndexType num_cols = 64;
  array<IndexType, 2> tensor_range = {{num_rows, num_cols}};
  const IndexType n_elems = internal::array_prod(tensor_range);

  data_tensor in(tensor_range);
  scalar_tensor full_redux;
  scalar_tensor full_redux_gpu;

  in.setRandom();
  array<IndexType, 2> tensor_offset_range(tensor_range);
  tensor_offset_range[0] -= 1;

  const IndexType offset = 64;
  TensorMap<data_tensor> in_offset(in.data() + offset, tensor_offset_range);
  full_redux = in_offset.mean();
  VERIFY_IS_NOT_EQUAL(full_redux(), in(0));

  DataType* gpu_in_data =
      static_cast<DataType*>(sycl_device.allocate(n_elems * sizeof(DataType)));
  DataType* gpu_out_data =
      static_cast<DataType*>(sycl_device.allocate(sizeof(DataType)));

  TensorMap<data_tensor> in_gpu(gpu_in_data + offset, tensor_offset_range);
  TensorMap<scalar_tensor> out_gpu(gpu_out_data);
  sycl_device.memcpyHostToDevice(gpu_in_data, in.data(),
                                 n_elems * sizeof(DataType));

  out_gpu.device(sycl_device) = in_gpu.mean();
  sycl_device.memcpyDeviceToHost(full_redux_gpu.data(), gpu_out_data,
                                 sizeof(DataType));

  // Check that the CPU and GPU reductions return the same result.
  VERIFY_IS_APPROX(full_redux_gpu(), full_redux());

  sycl_device.deallocate(gpu_in_data);
  sycl_device.deallocate(gpu_out_data);
}

template <typename DataType, int DataLayout, typename IndexType>
static void test_full_reductions_mean_with_odd_offset_sycl(
    const Eigen::SyclDevice& sycl_device) {
  // This is a particular case which illustrates a possible problem when the
  // number of local threads in a workgroup is even, but is not a power of two.
  using data_tensor = Tensor<DataType, 1, DataLayout, IndexType>;
  using scalar_tensor = Tensor<DataType, 0, DataLayout, IndexType>;
  // 2177 = (17 * 128) + 1 gives rise to 18 local threads.
  // 8708 = 4 * 2177 = 4 * (17 * 128) + 4 uses 18 vectorised local threads.
  const IndexType n_elems = 8707;
  array<IndexType, 1> tensor_range = {{n_elems}};

  data_tensor in(tensor_range);
  DataType full_redux;
  DataType full_redux_gpu;
  TensorMap<scalar_tensor> red_cpu(&full_redux);
  TensorMap<scalar_tensor> red_gpu(&full_redux_gpu);

  const DataType const_val = static_cast<DataType>(0.6391);
  in = in.constant(const_val);

  Eigen::IndexList<Eigen::type2index<0>> red_axis;
  red_cpu = in.reduce(red_axis, Eigen::internal::MeanReducer<DataType>());
  VERIFY_IS_APPROX(const_val, red_cpu());

  DataType* gpu_in_data =
      static_cast<DataType*>(sycl_device.allocate(n_elems * sizeof(DataType)));
  DataType* gpu_out_data =
      static_cast<DataType*>(sycl_device.allocate(sizeof(DataType)));

  TensorMap<data_tensor> in_gpu(gpu_in_data, tensor_range);
  TensorMap<scalar_tensor> out_gpu(gpu_out_data);
  sycl_device.memcpyHostToDevice(gpu_in_data, in.data(),
                                 n_elems * sizeof(DataType));

  out_gpu.device(sycl_device) =
      in_gpu.reduce(red_axis, Eigen::internal::MeanReducer<DataType>());
  sycl_device.memcpyDeviceToHost(red_gpu.data(), gpu_out_data,
                                 sizeof(DataType));

  // Check that the CPU and GPU reductions return the same result.
  VERIFY_IS_APPROX(full_redux_gpu, full_redux);

  sycl_device.deallocate(gpu_in_data);
  sycl_device.deallocate(gpu_out_data);
}

template <typename DataType, int DataLayout, typename IndexType>
static void test_full_reductions_min_sycl(
    const Eigen::SyclDevice& sycl_device) {
  const IndexType num_rows = 876;
  const IndexType num_cols = 953;
  array<IndexType, 2> tensorRange = {{num_rows, num_cols}};

  Tensor<DataType, 2, DataLayout, IndexType> in(tensorRange);
  Tensor<DataType, 0, DataLayout, IndexType> full_redux;
  Tensor<DataType, 0, DataLayout, IndexType> full_redux_gpu;

  in.setRandom();

  full_redux = in.minimum();

  DataType* gpu_in_data = static_cast<DataType*>(
      sycl_device.allocate(in.dimensions().TotalSize() * sizeof(DataType)));
  DataType* gpu_out_data = (DataType*)sycl_device.allocate(sizeof(DataType));

  TensorMap<Tensor<DataType, 2, DataLayout, IndexType>> in_gpu(gpu_in_data,
                                                               tensorRange);
  TensorMap<Tensor<DataType, 0, DataLayout, IndexType>> out_gpu(gpu_out_data);

  sycl_device.memcpyHostToDevice(
      gpu_in_data, in.data(), (in.dimensions().TotalSize()) * sizeof(DataType));
  out_gpu.device(sycl_device) = in_gpu.minimum();
  sycl_device.memcpyDeviceToHost(full_redux_gpu.data(), gpu_out_data,
                                 sizeof(DataType));
  // Check that the CPU and GPU reductions return the same result.
  VERIFY_IS_APPROX(full_redux_gpu(), full_redux());
  sycl_device.deallocate(gpu_in_data);
  sycl_device.deallocate(gpu_out_data);
}

template <typename DataType, int DataLayout, typename IndexType>
static void test_full_reductions_min_with_offset_sycl(
    const Eigen::SyclDevice& sycl_device) {
  using data_tensor = Tensor<DataType, 2, DataLayout, IndexType>;
  using scalar_tensor = Tensor<DataType, 0, DataLayout, IndexType>;
  const IndexType num_rows = 64;
  const IndexType num_cols = 64;
  array<IndexType, 2> tensor_range = {{num_rows, num_cols}};
  const IndexType n_elems = internal::array_prod(tensor_range);

  data_tensor in(tensor_range);
  scalar_tensor full_redux;
  scalar_tensor full_redux_gpu;

  in.setRandom();
  array<IndexType, 2> tensor_offset_range(tensor_range);
  tensor_offset_range[0] -= 1;
  // Set the initial value to be the min.
  // As we don't include this in the reduction the result should not be -2.
  in(0) = static_cast<DataType>(-2);

  const IndexType offset = 64;
  TensorMap<data_tensor> in_offset(in.data() + offset, tensor_offset_range);
  full_redux = in_offset.minimum();
  VERIFY_IS_NOT_EQUAL(full_redux(), in(0));

  DataType* gpu_in_data =
      static_cast<DataType*>(sycl_device.allocate(n_elems * sizeof(DataType)));
  DataType* gpu_out_data =
      static_cast<DataType*>(sycl_device.allocate(sizeof(DataType)));

  TensorMap<data_tensor> in_gpu(gpu_in_data + offset, tensor_offset_range);
  TensorMap<scalar_tensor> out_gpu(gpu_out_data);
  sycl_device.memcpyHostToDevice(gpu_in_data, in.data(),
                                 n_elems * sizeof(DataType));

  out_gpu.device(sycl_device) = in_gpu.minimum();
  sycl_device.memcpyDeviceToHost(full_redux_gpu.data(), gpu_out_data,
                                 sizeof(DataType));

  // Check that the CPU and GPU reductions return the same result.
  VERIFY_IS_APPROX(full_redux_gpu(), full_redux());

  sycl_device.deallocate(gpu_in_data);
  sycl_device.deallocate(gpu_out_data);
}

template <typename DataType, int DataLayout, typename IndexType>
static void test_first_dim_reductions_max_sycl(
    const Eigen::SyclDevice& sycl_device) {
  IndexType dim_x = 145;
  IndexType dim_y = 1;
  IndexType dim_z = 67;

  array<IndexType, 3> tensorRange = {{dim_x, dim_y, dim_z}};
  Eigen::array<IndexType, 1> red_axis;
  red_axis[0] = 0;
  array<IndexType, 2> reduced_tensorRange = {{dim_y, dim_z}};

  Tensor<DataType, 3, DataLayout, IndexType> in(tensorRange);
  Tensor<DataType, 2, DataLayout, IndexType> redux(reduced_tensorRange);
  Tensor<DataType, 2, DataLayout, IndexType> redux_gpu(reduced_tensorRange);

  in.setRandom();

  redux = in.maximum(red_axis);

  DataType* gpu_in_data = static_cast<DataType*>(
      sycl_device.allocate(in.dimensions().TotalSize() * sizeof(DataType)));
  DataType* gpu_out_data = static_cast<DataType*>(sycl_device.allocate(
      redux_gpu.dimensions().TotalSize() * sizeof(DataType)));

  TensorMap<Tensor<DataType, 3, DataLayout, IndexType>> in_gpu(gpu_in_data,
                                                               tensorRange);
  TensorMap<Tensor<DataType, 2, DataLayout, IndexType>> out_gpu(
      gpu_out_data, reduced_tensorRange);

  sycl_device.memcpyHostToDevice(
      gpu_in_data, in.data(), (in.dimensions().TotalSize()) * sizeof(DataType));
  out_gpu.device(sycl_device) = in_gpu.maximum(red_axis);
  sycl_device.memcpyDeviceToHost(
      redux_gpu.data(), gpu_out_data,
      redux_gpu.dimensions().TotalSize() * sizeof(DataType));

  // Check that the CPU and GPU reductions return the same result.
  for (IndexType j = 0; j < reduced_tensorRange[0]; j++)
    for (IndexType k = 0; k < reduced_tensorRange[1]; k++)
      VERIFY_IS_APPROX(redux_gpu(j, k), redux(j, k));

  sycl_device.deallocate(gpu_in_data);
  sycl_device.deallocate(gpu_out_data);
}

template <typename DataType, int DataLayout, typename IndexType>
static void test_first_dim_reductions_max_with_offset_sycl(
    const Eigen::SyclDevice& sycl_device) {
  using data_tensor = Tensor<DataType, 2, DataLayout, IndexType>;
  using reduced_tensor = Tensor<DataType, 1, DataLayout, IndexType>;

  const IndexType num_rows = 64;
  const IndexType num_cols = 64;
  array<IndexType, 2> tensor_range = {{num_rows, num_cols}};
  array<IndexType, 1> reduced_range = {{num_cols}};
  const IndexType n_elems = internal::array_prod(tensor_range);
  const IndexType n_reduced = num_cols;

  data_tensor in(tensor_range);
  reduced_tensor redux;
  reduced_tensor redux_gpu(reduced_range);

  in.setRandom();
  array<IndexType, 2> tensor_offset_range(tensor_range);
  tensor_offset_range[0] -= 1;
  // Set maximum value outside of the considered range.
  for (IndexType i = 0; i < n_reduced; i++) {
    in(i) = static_cast<DataType>(2);
  }

  Eigen::array<IndexType, 1> red_axis;
  red_axis[0] = 0;

  const IndexType offset = 64;
  TensorMap<data_tensor> in_offset(in.data() + offset, tensor_offset_range);
  redux = in_offset.maximum(red_axis);
  for (IndexType i = 0; i < n_reduced; i++) {
    VERIFY_IS_NOT_EQUAL(redux(i), in(i));
  }

  DataType* gpu_in_data =
      static_cast<DataType*>(sycl_device.allocate(n_elems * sizeof(DataType)));
  DataType* gpu_out_data = static_cast<DataType*>(
      sycl_device.allocate(n_reduced * sizeof(DataType)));

  TensorMap<data_tensor> in_gpu(gpu_in_data + offset, tensor_offset_range);
  TensorMap<reduced_tensor> out_gpu(gpu_out_data, reduced_range);
  sycl_device.memcpyHostToDevice(gpu_in_data, in.data(),
                                 n_elems * sizeof(DataType));

  out_gpu.device(sycl_device) = in_gpu.maximum(red_axis);
  sycl_device.memcpyDeviceToHost(redux_gpu.data(), gpu_out_data,
                                 n_reduced * sizeof(DataType));

  // Check that the CPU and GPU reductions return the same result.
  for (IndexType i = 0; i < n_reduced; i++) {
    VERIFY_IS_APPROX(redux_gpu(i), redux(i));
  }

  sycl_device.deallocate(gpu_in_data);
  sycl_device.deallocate(gpu_out_data);
}

template <typename DataType, int DataLayout, typename IndexType>
static void test_last_dim_reductions_max_with_offset_sycl(
    const Eigen::SyclDevice& sycl_device) {
  using data_tensor = Tensor<DataType, 2, DataLayout, IndexType>;
  using reduced_tensor = Tensor<DataType, 1, DataLayout, IndexType>;

  const IndexType num_rows = 64;
  const IndexType num_cols = 64;
  array<IndexType, 2> tensor_range = {{num_rows, num_cols}};
  array<IndexType, 1> full_reduced_range = {{num_rows}};
  array<IndexType, 1> reduced_range = {{num_rows - 1}};
  const IndexType n_elems = internal::array_prod(tensor_range);
  const IndexType n_reduced = reduced_range[0];

  data_tensor in(tensor_range);
  reduced_tensor redux(full_reduced_range);
  reduced_tensor redux_gpu(reduced_range);

  in.setRandom();
  redux.setZero();
  array<IndexType, 2> tensor_offset_range(tensor_range);
  tensor_offset_range[0] -= 1;
  // Set maximum value outside of the considered range.
  for (IndexType i = 0; i < n_reduced; i++) {
    in(i) = static_cast<DataType>(2);
  }

  Eigen::array<IndexType, 1> red_axis;
  red_axis[0] = 1;

  const IndexType offset = 64;
  // Introduce an offset in both the input and the output.
  TensorMap<data_tensor> in_offset(in.data() + offset, tensor_offset_range);
  TensorMap<reduced_tensor> red_offset(redux.data() + 1, reduced_range);
  red_offset = in_offset.maximum(red_axis);

  // Check that the first value hasn't been changed and that the reduced values
  // are not equal to the previously set maximum in the input outside the range.
  VERIFY_IS_EQUAL(redux(0), static_cast<DataType>(0));
  for (IndexType i = 0; i < n_reduced; i++) {
    VERIFY_IS_NOT_EQUAL(red_offset(i), in(i));
  }

  DataType* gpu_in_data =
      static_cast<DataType*>(sycl_device.allocate(n_elems * sizeof(DataType)));
  DataType* gpu_out_data = static_cast<DataType*>(
      sycl_device.allocate((n_reduced + 1) * sizeof(DataType)));

  TensorMap<data_tensor> in_gpu(gpu_in_data + offset, tensor_offset_range);
  TensorMap<reduced_tensor> out_gpu(gpu_out_data + 1, reduced_range);
  sycl_device.memcpyHostToDevice(gpu_in_data, in.data(),
                                 n_elems * sizeof(DataType));

  out_gpu.device(sycl_device) = in_gpu.maximum(red_axis);
  sycl_device.memcpyDeviceToHost(redux_gpu.data(), out_gpu.data(),
                                 n_reduced * sizeof(DataType));

  // Check that the CPU and GPU reductions return the same result.
  for (IndexType i = 0; i < n_reduced; i++) {
    VERIFY_IS_APPROX(redux_gpu(i), red_offset(i));
  }

  sycl_device.deallocate(gpu_in_data);
  sycl_device.deallocate(gpu_out_data);
}

template <typename DataType, int DataLayout, typename IndexType>
static void test_first_dim_reductions_sum_sycl(
    const Eigen::SyclDevice& sycl_device, IndexType dim_x, IndexType dim_y) {
  array<IndexType, 2> tensorRange = {{dim_x, dim_y}};
  Eigen::array<IndexType, 1> red_axis;
  red_axis[0] = 0;
  array<IndexType, 1> reduced_tensorRange = {{dim_y}};

  Tensor<DataType, 2, DataLayout, IndexType> in(tensorRange);
  Tensor<DataType, 1, DataLayout, IndexType> redux(reduced_tensorRange);
  Tensor<DataType, 1, DataLayout, IndexType> redux_gpu(reduced_tensorRange);

  in.setRandom();
  redux = in.sum(red_axis);

  DataType* gpu_in_data = static_cast<DataType*>(
      sycl_device.allocate(in.dimensions().TotalSize() * sizeof(DataType)));
  DataType* gpu_out_data = static_cast<DataType*>(sycl_device.allocate(
      redux_gpu.dimensions().TotalSize() * sizeof(DataType)));

  TensorMap<Tensor<DataType, 2, DataLayout, IndexType>> in_gpu(gpu_in_data,
                                                               tensorRange);
  TensorMap<Tensor<DataType, 1, DataLayout, IndexType>> out_gpu(
      gpu_out_data, reduced_tensorRange);

  sycl_device.memcpyHostToDevice(
      gpu_in_data, in.data(), (in.dimensions().TotalSize()) * sizeof(DataType));
  out_gpu.device(sycl_device) = in_gpu.sum(red_axis);
  sycl_device.memcpyDeviceToHost(
      redux_gpu.data(), gpu_out_data,
      redux_gpu.dimensions().TotalSize() * sizeof(DataType));

  // Check that the CPU and GPU reductions return the same result.
  for (IndexType i = 0; i < redux.size(); i++) {
    VERIFY_IS_APPROX(redux_gpu.data()[i], redux.data()[i]);
  }

  sycl_device.deallocate(gpu_in_data);
  sycl_device.deallocate(gpu_out_data);
}

template <typename DataType, int DataLayout, typename IndexType>
static void test_first_dim_reductions_mean_sycl(
    const Eigen::SyclDevice& sycl_device) {
  IndexType dim_x = 145;
  IndexType dim_y = 1;
  IndexType dim_z = 67;

  array<IndexType, 3> tensorRange = {{dim_x, dim_y, dim_z}};
  Eigen::array<IndexType, 1> red_axis;
  red_axis[0] = 0;
  array<IndexType, 2> reduced_tensorRange = {{dim_y, dim_z}};

  Tensor<DataType, 3, DataLayout, IndexType> in(tensorRange);
  Tensor<DataType, 2, DataLayout, IndexType> redux(reduced_tensorRange);
  Tensor<DataType, 2, DataLayout, IndexType> redux_gpu(reduced_tensorRange);

  in.setRandom();

  redux = in.mean(red_axis);

  DataType* gpu_in_data = static_cast<DataType*>(
      sycl_device.allocate(in.dimensions().TotalSize() * sizeof(DataType)));
  DataType* gpu_out_data = static_cast<DataType*>(sycl_device.allocate(
      redux_gpu.dimensions().TotalSize() * sizeof(DataType)));

  TensorMap<Tensor<DataType, 3, DataLayout, IndexType>> in_gpu(gpu_in_data,
                                                               tensorRange);
  TensorMap<Tensor<DataType, 2, DataLayout, IndexType>> out_gpu(
      gpu_out_data, reduced_tensorRange);

  sycl_device.memcpyHostToDevice(
      gpu_in_data, in.data(), (in.dimensions().TotalSize()) * sizeof(DataType));
  out_gpu.device(sycl_device) = in_gpu.mean(red_axis);
  sycl_device.memcpyDeviceToHost(
      redux_gpu.data(), gpu_out_data,
      redux_gpu.dimensions().TotalSize() * sizeof(DataType));

  // Check that the CPU and GPU reductions return the same result.
  for (IndexType j = 0; j < reduced_tensorRange[0]; j++)
    for (IndexType k = 0; k < reduced_tensorRange[1]; k++)
      VERIFY_IS_APPROX(redux_gpu(j, k), redux(j, k));

  sycl_device.deallocate(gpu_in_data);
  sycl_device.deallocate(gpu_out_data);
}

template <typename DataType, int DataLayout, typename IndexType>
static void test_last_dim_reductions_mean_sycl(
    const Eigen::SyclDevice& sycl_device) {
  IndexType dim_x = 64;
  IndexType dim_y = 1;
  IndexType dim_z = 32;

  array<IndexType, 3> tensorRange = {{dim_x, dim_y, dim_z}};
  Eigen::array<IndexType, 1> red_axis;
  red_axis[0] = 2;
  array<IndexType, 2> reduced_tensorRange = {{dim_x, dim_y}};

  Tensor<DataType, 3, DataLayout, IndexType> in(tensorRange);
  Tensor<DataType, 2, DataLayout, IndexType> redux(reduced_tensorRange);
  Tensor<DataType, 2, DataLayout, IndexType> redux_gpu(reduced_tensorRange);

  in.setRandom();

  redux = in.mean(red_axis);

  DataType* gpu_in_data = static_cast<DataType*>(
      sycl_device.allocate(in.dimensions().TotalSize() * sizeof(DataType)));
  DataType* gpu_out_data = static_cast<DataType*>(sycl_device.allocate(
      redux_gpu.dimensions().TotalSize() * sizeof(DataType)));

  TensorMap<Tensor<DataType, 3, DataLayout, IndexType>> in_gpu(gpu_in_data,
                                                               tensorRange);
  TensorMap<Tensor<DataType, 2, DataLayout, IndexType>> out_gpu(
      gpu_out_data, reduced_tensorRange);

  sycl_device.memcpyHostToDevice(
      gpu_in_data, in.data(), (in.dimensions().TotalSize()) * sizeof(DataType));
  out_gpu.device(sycl_device) = in_gpu.mean(red_axis);
  sycl_device.memcpyDeviceToHost(
      redux_gpu.data(), gpu_out_data,
      redux_gpu.dimensions().TotalSize() * sizeof(DataType));
  // Check that the CPU and GPU reductions return the same result.
  for (IndexType j = 0; j < reduced_tensorRange[0]; j++)
    for (IndexType k = 0; k < reduced_tensorRange[1]; k++)
      VERIFY_IS_APPROX(redux_gpu(j, k), redux(j, k));

  sycl_device.deallocate(gpu_in_data);
  sycl_device.deallocate(gpu_out_data);
}

template <typename DataType, int DataLayout, typename IndexType>
static void test_last_dim_reductions_sum_sycl(
    const Eigen::SyclDevice& sycl_device) {
  IndexType dim_x = 64;
  IndexType dim_y = 1;
  IndexType dim_z = 32;

  array<IndexType, 3> tensorRange = {{dim_x, dim_y, dim_z}};
  Eigen::array<IndexType, 1> red_axis;
  red_axis[0] = 2;
  array<IndexType, 2> reduced_tensorRange = {{dim_x, dim_y}};

  Tensor<DataType, 3, DataLayout, IndexType> in(tensorRange);
  Tensor<DataType, 2, DataLayout, IndexType> redux(reduced_tensorRange);
  Tensor<DataType, 2, DataLayout, IndexType> redux_gpu(reduced_tensorRange);

  in.setRandom();

  redux = in.sum(red_axis);

  DataType* gpu_in_data = static_cast<DataType*>(
      sycl_device.allocate(in.dimensions().TotalSize() * sizeof(DataType)));
  DataType* gpu_out_data = static_cast<DataType*>(sycl_device.allocate(
      redux_gpu.dimensions().TotalSize() * sizeof(DataType)));

  TensorMap<Tensor<DataType, 3, DataLayout, IndexType>> in_gpu(gpu_in_data,
                                                               tensorRange);
  TensorMap<Tensor<DataType, 2, DataLayout, IndexType>> out_gpu(
      gpu_out_data, reduced_tensorRange);

  sycl_device.memcpyHostToDevice(
      gpu_in_data, in.data(), (in.dimensions().TotalSize()) * sizeof(DataType));
  out_gpu.device(sycl_device) = in_gpu.sum(red_axis);
  sycl_device.memcpyDeviceToHost(
      redux_gpu.data(), gpu_out_data,
      redux_gpu.dimensions().TotalSize() * sizeof(DataType));
  // Check that the CPU and GPU reductions return the same result.
  for (IndexType j = 0; j < reduced_tensorRange[0]; j++)
    for (IndexType k = 0; k < reduced_tensorRange[1]; k++)
      VERIFY_IS_APPROX(redux_gpu(j, k), redux(j, k));

  sycl_device.deallocate(gpu_in_data);
  sycl_device.deallocate(gpu_out_data);
}

template <typename DataType, int DataLayout, typename IndexType>
static void test_last_reductions_sum_sycl(
    const Eigen::SyclDevice& sycl_device) {
  auto tensorRange = Sizes<64, 32>(64, 32);
  // auto red_axis = Sizes<0,1>(0,1);
  Eigen::IndexList<Eigen::type2index<1>> red_axis;
  auto reduced_tensorRange = Sizes<64>(64);
  TensorFixedSize<DataType, Sizes<64, 32>, DataLayout> in_fix;
  TensorFixedSize<DataType, Sizes<64>, DataLayout> redux_fix;
  TensorFixedSize<DataType, Sizes<64>, DataLayout> redux_gpu_fix;

  in_fix.setRandom();

  redux_fix = in_fix.sum(red_axis);

  DataType* gpu_in_data = static_cast<DataType*>(
      sycl_device.allocate(in_fix.dimensions().TotalSize() * sizeof(DataType)));
  DataType* gpu_out_data = static_cast<DataType*>(sycl_device.allocate(
      redux_gpu_fix.dimensions().TotalSize() * sizeof(DataType)));

  TensorMap<TensorFixedSize<DataType, Sizes<64, 32>, DataLayout>> in_gpu_fix(
      gpu_in_data, tensorRange);
  TensorMap<TensorFixedSize<DataType, Sizes<64>, DataLayout>> out_gpu_fix(
      gpu_out_data, reduced_tensorRange);

  sycl_device.memcpyHostToDevice(
      gpu_in_data, in_fix.data(),
      (in_fix.dimensions().TotalSize()) * sizeof(DataType));
  out_gpu_fix.device(sycl_device) = in_gpu_fix.sum(red_axis);
  sycl_device.memcpyDeviceToHost(
      redux_gpu_fix.data(), gpu_out_data,
      redux_gpu_fix.dimensions().TotalSize() * sizeof(DataType));
  // Check that the CPU and GPU reductions return the same result.
  for (IndexType j = 0; j < reduced_tensorRange[0]; j++) {
    VERIFY_IS_APPROX(redux_gpu_fix(j), redux_fix(j));
  }

  sycl_device.deallocate(gpu_in_data);
  sycl_device.deallocate(gpu_out_data);
}

template <typename DataType, int DataLayout, typename IndexType>
static void test_last_reductions_mean_sycl(
    const Eigen::SyclDevice& sycl_device) {
  auto tensorRange = Sizes<64, 32>(64, 32);
  Eigen::IndexList<Eigen::type2index<1>> red_axis;
  auto reduced_tensorRange = Sizes<64>(64);
  TensorFixedSize<DataType, Sizes<64, 32>, DataLayout> in_fix;
  TensorFixedSize<DataType, Sizes<64>, DataLayout> redux_fix;
  TensorFixedSize<DataType, Sizes<64>, DataLayout> redux_gpu_fix;

  in_fix.setRandom();
  redux_fix = in_fix.mean(red_axis);

  DataType* gpu_in_data = static_cast<DataType*>(
      sycl_device.allocate(in_fix.dimensions().TotalSize() * sizeof(DataType)));
  DataType* gpu_out_data = static_cast<DataType*>(sycl_device.allocate(
      redux_gpu_fix.dimensions().TotalSize() * sizeof(DataType)));

  TensorMap<TensorFixedSize<DataType, Sizes<64, 32>, DataLayout>> in_gpu_fix(
      gpu_in_data, tensorRange);
  TensorMap<TensorFixedSize<DataType, Sizes<64>, DataLayout>> out_gpu_fix(
      gpu_out_data, reduced_tensorRange);

  sycl_device.memcpyHostToDevice(
      gpu_in_data, in_fix.data(),
      (in_fix.dimensions().TotalSize()) * sizeof(DataType));
  out_gpu_fix.device(sycl_device) = in_gpu_fix.mean(red_axis);
  sycl_device.memcpyDeviceToHost(
      redux_gpu_fix.data(), gpu_out_data,
      redux_gpu_fix.dimensions().TotalSize() * sizeof(DataType));
  sycl_device.synchronize();
  // Check that the CPU and GPU reductions return the same result.
  for (IndexType j = 0; j < reduced_tensorRange[0]; j++) {
    VERIFY_IS_APPROX(redux_gpu_fix(j), redux_fix(j));
  }

  sycl_device.deallocate(gpu_in_data);
  sycl_device.deallocate(gpu_out_data);
}

// SYCL supports a generic case of reduction where the accumulator is a
// different type than the input data. This is an example of how to check
// whether a Tensor contains NaN and/or Inf values in a single reduction.
template <typename InT, typename OutT>
struct CustomReducer {
  static const bool PacketAccess = false;
  static const bool IsStateful = false;

  static constexpr OutT InfBit = 1;
  static constexpr OutT NanBit = 2;

  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void reduce(const InT x,
                                                    OutT* accum) const {
    if (Eigen::numext::isinf(x))
      *accum |= InfBit;
    else if (Eigen::numext::isnan(x))
      *accum |= NanBit;
  }
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void reduce(const OutT x,
                                                    OutT* accum) const {
    *accum |= x;
  }
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE OutT initialize() const {
    return OutT(0);
  }
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE OutT finalize(const OutT accum) const {
    return accum;
  }
};

template <typename DataType, typename AccumType, int DataLayout,
          typename IndexType>
static void test_full_reductions_custom_sycl(
    const Eigen::SyclDevice& sycl_device) {
  constexpr IndexType InSize = 64;
  auto tensorRange = Sizes<InSize>(InSize);
  Eigen::IndexList<Eigen::type2index<0>> dims;
  auto reduced_tensorRange = Sizes<>();
  TensorFixedSize<DataType, Sizes<InSize>, DataLayout> in_fix;
  TensorFixedSize<AccumType, Sizes<>, DataLayout> redux_gpu_fix;

  CustomReducer<DataType, AccumType> reducer;

  in_fix.setRandom();

  size_t in_size_bytes = in_fix.dimensions().TotalSize() * sizeof(DataType);
  DataType* gpu_in_data =
      static_cast<DataType*>(sycl_device.allocate(in_size_bytes));
  AccumType* gpu_out_data =
      static_cast<AccumType*>(sycl_device.allocate(sizeof(AccumType)));

  TensorMap<TensorFixedSize<DataType, Sizes<InSize>, DataLayout>> in_gpu_fix(
      gpu_in_data, tensorRange);
  TensorMap<TensorFixedSize<AccumType, Sizes<>, DataLayout>> out_gpu_fix(
      gpu_out_data, reduced_tensorRange);

  sycl_device.memcpyHostToDevice(gpu_in_data, in_fix.data(), in_size_bytes);
  out_gpu_fix.device(sycl_device) = in_gpu_fix.reduce(dims, reducer);
  sycl_device.memcpyDeviceToHost(redux_gpu_fix.data(), gpu_out_data,
                                 sizeof(AccumType));
  // A randomly initialised tensor contains neither NaN nor Inf, so the
  // accumulated bit mask must be zero.
  VERIFY_IS_EQUAL(redux_gpu_fix(0), AccumType(0));

  sycl_device.deallocate(gpu_in_data);
  sycl_device.deallocate(gpu_out_data);
}
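
// A minimal sketch, not called by any test in this file, of the
// allocate / copy-to-device / reduce / copy-back pattern that the tests above
// follow for full reductions. The helper name full_sum_on_device is
// hypothetical; every device call it makes mirrors the usage in the tests.
template <typename DataType, int DataLayout, typename IndexType>
static DataType full_sum_on_device(
    const Eigen::SyclDevice& sycl_device,
    const Tensor<DataType, 2, DataLayout, IndexType>& in) {
  const IndexType n_elems = in.dimensions().TotalSize();
  // Device-side buffers for the input tensor and the scalar result.
  DataType* gpu_in_data = static_cast<DataType*>(
      sycl_device.allocate(n_elems * sizeof(DataType)));
  DataType* gpu_out_data =
      static_cast<DataType*>(sycl_device.allocate(sizeof(DataType)));

  TensorMap<Tensor<DataType, 2, DataLayout, IndexType>> in_gpu(
      gpu_in_data, in.dimension(0), in.dimension(1));
  TensorMap<Tensor<DataType, 0, DataLayout, IndexType>> out_gpu(gpu_out_data);

  // Copy the input to the device, evaluate the reduction there, then copy the
  // scalar result back to the host.
  sycl_device.memcpyHostToDevice(gpu_in_data, in.data(),
                                 n_elems * sizeof(DataType));
  out_gpu.device(sycl_device) = in_gpu.sum();
  DataType result;
  sycl_device.memcpyDeviceToHost(&result, gpu_out_data, sizeof(DataType));

  sycl_device.deallocate(gpu_in_data);
  sycl_device.deallocate(gpu_out_data);
  return result;
}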

template <typename DataType, typename Dev>
void sycl_reduction_test_full_per_device(const Dev& sycl_device) {
  test_full_reductions_sum_sycl<DataType, RowMajor, int64_t>(sycl_device);
  test_full_reductions_sum_sycl<DataType, ColMajor, int64_t>(sycl_device);
  test_full_reductions_min_sycl<DataType, ColMajor, int64_t>(sycl_device);
  test_full_reductions_min_sycl<DataType, RowMajor, int64_t>(sycl_device);
  test_full_reductions_max_sycl<DataType, ColMajor, int64_t>(sycl_device);
  test_full_reductions_max_sycl<DataType, RowMajor, int64_t>(sycl_device);
  test_full_reductions_mean_sycl<DataType, ColMajor, int64_t>(sycl_device);
  test_full_reductions_mean_sycl<DataType, RowMajor, int64_t>(sycl_device);
  test_full_reductions_custom_sycl<DataType, int, RowMajor, int64_t>(
      sycl_device);
  test_full_reductions_custom_sycl<DataType, int, ColMajor, int64_t>(
      sycl_device);
  sycl_device.synchronize();
}

template <typename DataType, typename Dev>
void sycl_reduction_full_offset_per_device(const Dev& sycl_device) {
  test_full_reductions_sum_with_offset_sycl<DataType, RowMajor, int64_t>(
      sycl_device);
  test_full_reductions_sum_with_offset_sycl<DataType, ColMajor, int64_t>(
      sycl_device);
  test_full_reductions_min_with_offset_sycl<DataType, RowMajor, int64_t>(
      sycl_device);
  test_full_reductions_min_with_offset_sycl<DataType, ColMajor, int64_t>(
      sycl_device);
  test_full_reductions_max_with_offset_sycl<DataType, ColMajor, int64_t>(
      sycl_device);
  test_full_reductions_max_with_offset_sycl<DataType, RowMajor, int64_t>(
      sycl_device);
  test_full_reductions_mean_with_offset_sycl<DataType, RowMajor, int64_t>(
      sycl_device);
  test_full_reductions_mean_with_offset_sycl<DataType, ColMajor, int64_t>(
      sycl_device);
  test_full_reductions_mean_with_odd_offset_sycl<DataType, RowMajor, int64_t>(
      sycl_device);
  sycl_device.synchronize();
}

template <typename DataType, typename Dev>
void sycl_reduction_test_first_dim_per_device(const Dev& sycl_device) {
  test_first_dim_reductions_sum_sycl<DataType, ColMajor, int64_t>(sycl_device,
                                                                  4197, 4097);
  test_first_dim_reductions_sum_sycl<DataType, RowMajor, int64_t>(sycl_device,
                                                                  4197, 4097);
  test_first_dim_reductions_sum_sycl<DataType, RowMajor, int64_t>(sycl_device,
                                                                  129, 8);
  test_first_dim_reductions_max_sycl<DataType, RowMajor, int64_t>(sycl_device);
  test_first_dim_reductions_max_with_offset_sycl<DataType, RowMajor, int64_t>(
      sycl_device);
  sycl_device.synchronize();
}

template <typename DataType, typename Dev>
void sycl_reduction_test_last_dim_per_device(const Dev& sycl_device) {
  test_last_dim_reductions_sum_sycl<DataType, RowMajor, int64_t>(sycl_device);
  test_last_dim_reductions_max_with_offset_sycl<DataType, RowMajor, int64_t>(
      sycl_device);
  test_last_reductions_sum_sycl<DataType, ColMajor, int64_t>(sycl_device);
  test_last_reductions_sum_sycl<DataType, RowMajor, int64_t>(sycl_device);
  test_last_reductions_mean_sycl<DataType, ColMajor, int64_t>(sycl_device);
  test_last_reductions_mean_sycl<DataType, RowMajor, int64_t>(sycl_device);
  sycl_device.synchronize();
}

EIGEN_DECLARE_TEST(cxx11_tensor_reduction_sycl) {
  for (const auto& device : Eigen::get_sycl_supported_devices()) {
    std::cout << "Running on "
              << device.template get_info<cl::sycl::info::device::name>()
              << std::endl;
    QueueInterface queueInterface(device);
    auto sycl_device = Eigen::SyclDevice(&queueInterface);
    CALL_SUBTEST_1(sycl_reduction_test_full_per_device<float>(sycl_device));
    CALL_SUBTEST_2(sycl_reduction_full_offset_per_device<float>(sycl_device));
    CALL_SUBTEST_3(
        sycl_reduction_test_first_dim_per_device<float>(sycl_device));
    CALL_SUBTEST_4(sycl_reduction_test_last_dim_per_device<float>(sycl_device));
  }
}