// This file is part of Eigen, a lightweight C++ template library
// for linear algebra.
//
// Copyright (C) 2015
// Mehdi Goli    Codeplay Software Ltd.
// Ralph Potter  Codeplay Software Ltd.
// Luke Iwanski  Codeplay Software Ltd.
// Contact: <eigen@codeplay.com>
//
// This Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.

#define EIGEN_TEST_NO_LONGDOUBLE
#define EIGEN_TEST_NO_COMPLEX

#define EIGEN_DEFAULT_DENSE_INDEX_TYPE int64_t
#define EIGEN_USE_SYCL
#define EIGEN_HAS_CONSTEXPR 1

#include "main.h"

#include <unsupported/Eigen/CXX11/Tensor>

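// Each full-reduction test below evaluates the reduction on the host with a
// plain Tensor, repeats it on the SYCL device through TensorMap views over
// buffers obtained from sycl_device.allocate(), copies the device result back
// with memcpyDeviceToHost and compares the two results.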
template <typename DataType, int DataLayout, typename IndexType>
static void test_full_reductions_sum_sycl(
    const Eigen::SyclDevice& sycl_device) {
  const IndexType num_rows = 753;
  const IndexType num_cols = 537;
  array<IndexType, 2> tensorRange = {{num_rows, num_cols}};

  array<IndexType, 2> outRange = {{1, 1}};

  Tensor<DataType, 2, DataLayout, IndexType> in(tensorRange);
  Tensor<DataType, 2, DataLayout, IndexType> full_redux(outRange);
  Tensor<DataType, 2, DataLayout, IndexType> full_redux_gpu(outRange);

  in.setRandom();
  auto dim = DSizes<IndexType, 2>(1, 1);
  full_redux = in.sum().reshape(dim);

  DataType* gpu_in_data = static_cast<DataType*>(
      sycl_device.allocate(in.dimensions().TotalSize() * sizeof(DataType)));
  DataType* gpu_out_data = static_cast<DataType*>(sycl_device.allocate(
      full_redux_gpu.dimensions().TotalSize() * sizeof(DataType)));

  TensorMap<Tensor<DataType, 2, DataLayout, IndexType>> in_gpu(gpu_in_data,
                                                               tensorRange);
  TensorMap<Tensor<DataType, 2, DataLayout, IndexType>> out_gpu(gpu_out_data,
                                                                outRange);
  sycl_device.memcpyHostToDevice(
      gpu_in_data, in.data(), (in.dimensions().TotalSize()) * sizeof(DataType));
  out_gpu.device(sycl_device) = in_gpu.sum().reshape(dim);
  sycl_device.memcpyDeviceToHost(
      full_redux_gpu.data(), gpu_out_data,
      (full_redux_gpu.dimensions().TotalSize()) * sizeof(DataType));
  // Check that the CPU and GPU reductions return the same result.
  std::cout << "SYCL FULL :" << full_redux_gpu(0, 0)
            << ", CPU FULL: " << full_redux(0, 0) << "\n";
  VERIFY_IS_APPROX(full_redux_gpu(0, 0), full_redux(0, 0));
  sycl_device.deallocate(gpu_in_data);
  sycl_device.deallocate(gpu_out_data);
}

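// The *_with_offset variants map the reduction input at a non-zero offset into
// the allocated buffer (shrinking the mapped range accordingly), verifying
// that reductions honour TensorMaps whose data pointer does not point to the
// start of an allocation.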
template <typename DataType, int DataLayout, typename IndexType>
static void test_full_reductions_sum_with_offset_sycl(
    const Eigen::SyclDevice& sycl_device) {
  using data_tensor = Tensor<DataType, 2, DataLayout, IndexType>;
  using scalar_tensor = Tensor<DataType, 0, DataLayout, IndexType>;
  const IndexType num_rows = 64;
  const IndexType num_cols = 64;
  array<IndexType, 2> tensor_range = {{num_rows, num_cols}};
  const IndexType n_elems = internal::array_prod(tensor_range);

  data_tensor in(tensor_range);
  scalar_tensor full_redux;
  scalar_tensor full_redux_gpu;

  in.setRandom();
  array<IndexType, 2> tensor_offset_range(tensor_range);
  tensor_offset_range[0] -= 1;

  const IndexType offset = 64;
  TensorMap<data_tensor> in_offset(in.data() + offset, tensor_offset_range);
  full_redux = in_offset.sum();

  DataType* gpu_in_data =
      static_cast<DataType*>(sycl_device.allocate(n_elems * sizeof(DataType)));
  DataType* gpu_out_data =
      static_cast<DataType*>(sycl_device.allocate(sizeof(DataType)));

  TensorMap<data_tensor> in_gpu(gpu_in_data + offset, tensor_offset_range);
  TensorMap<scalar_tensor> out_gpu(gpu_out_data);
  sycl_device.memcpyHostToDevice(gpu_in_data, in.data(),
                                 n_elems * sizeof(DataType));
  out_gpu.device(sycl_device) = in_gpu.sum();
  sycl_device.memcpyDeviceToHost(full_redux_gpu.data(), gpu_out_data,
                                 sizeof(DataType));

  // Check that the CPU and GPU reductions return the same result.
  VERIFY_IS_APPROX(full_redux_gpu(), full_redux());

  sycl_device.deallocate(gpu_in_data);
  sycl_device.deallocate(gpu_out_data);
}

template <typename DataType, int DataLayout, typename IndexType>
static void test_full_reductions_max_sycl(
    const Eigen::SyclDevice& sycl_device) {
  const IndexType num_rows = 4096;
  const IndexType num_cols = 4096;
  array<IndexType, 2> tensorRange = {{num_rows, num_cols}};

  Tensor<DataType, 2, DataLayout, IndexType> in(tensorRange);
  Tensor<DataType, 0, DataLayout, IndexType> full_redux;
  Tensor<DataType, 0, DataLayout, IndexType> full_redux_gpu;

  in.setRandom();

  full_redux = in.maximum();

  DataType* gpu_in_data = static_cast<DataType*>(
      sycl_device.allocate(in.dimensions().TotalSize() * sizeof(DataType)));
  DataType* gpu_out_data =
      static_cast<DataType*>(sycl_device.allocate(sizeof(DataType)));

  TensorMap<Tensor<DataType, 2, DataLayout, IndexType>> in_gpu(gpu_in_data,
                                                               tensorRange);
  TensorMap<Tensor<DataType, 0, DataLayout, IndexType>> out_gpu(gpu_out_data);
  sycl_device.memcpyHostToDevice(
      gpu_in_data, in.data(), (in.dimensions().TotalSize()) * sizeof(DataType));
  out_gpu.device(sycl_device) = in_gpu.maximum();
  sycl_device.memcpyDeviceToHost(full_redux_gpu.data(), gpu_out_data,
                                 sizeof(DataType));
  VERIFY_IS_APPROX(full_redux_gpu(), full_redux());
  sycl_device.deallocate(gpu_in_data);
  sycl_device.deallocate(gpu_out_data);
}

template <typename DataType, int DataLayout, typename IndexType>
static void test_full_reductions_max_with_offset_sycl(
    const Eigen::SyclDevice& sycl_device) {
  using data_tensor = Tensor<DataType, 2, DataLayout, IndexType>;
  using scalar_tensor = Tensor<DataType, 0, DataLayout, IndexType>;
  const IndexType num_rows = 64;
  const IndexType num_cols = 64;
  array<IndexType, 2> tensor_range = {{num_rows, num_cols}};
  const IndexType n_elems = internal::array_prod(tensor_range);

  data_tensor in(tensor_range);
  scalar_tensor full_redux;
  scalar_tensor full_redux_gpu;

  in.setRandom();
  array<IndexType, 2> tensor_offset_range(tensor_range);
  tensor_offset_range[0] -= 1;
  // Set the first element to be the maximum of the whole tensor.
  // As it is excluded from the offset reduction, the result should not be 2.
  in(0) = static_cast<DataType>(2);

  const IndexType offset = 64;
  TensorMap<data_tensor> in_offset(in.data() + offset, tensor_offset_range);
  full_redux = in_offset.maximum();
  VERIFY_IS_NOT_EQUAL(full_redux(), in(0));

  DataType* gpu_in_data =
      static_cast<DataType*>(sycl_device.allocate(n_elems * sizeof(DataType)));
  DataType* gpu_out_data =
      static_cast<DataType*>(sycl_device.allocate(sizeof(DataType)));

  TensorMap<data_tensor> in_gpu(gpu_in_data + offset, tensor_offset_range);
  TensorMap<scalar_tensor> out_gpu(gpu_out_data);
  sycl_device.memcpyHostToDevice(gpu_in_data, in.data(),
                                 n_elems * sizeof(DataType));
  out_gpu.device(sycl_device) = in_gpu.maximum();
  sycl_device.memcpyDeviceToHost(full_redux_gpu.data(), gpu_out_data,
                                 sizeof(DataType));

  // Check that the CPU and GPU reductions return the same result.
  VERIFY_IS_APPROX(full_redux_gpu(), full_redux());

  sycl_device.deallocate(gpu_in_data);
  sycl_device.deallocate(gpu_out_data);
}

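// Despite its name, this test compares the argmax (along dimension 1) of two
// random tensors, selects the matches into a boolean tensor, and then takes
// the mean of that mask (cast to DataType) as the full reduction, on both the
// host and the device.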
template <typename DataType, int DataLayout, typename IndexType>
static void test_full_reductions_mean_sycl(
    const Eigen::SyclDevice& sycl_device) {
  const IndexType num_rows = 4096;
  const IndexType num_cols = 4096;
  array<IndexType, 2> tensorRange = {{num_rows, num_cols}};
  array<IndexType, 1> argRange = {{num_cols}};
  Eigen::array<IndexType, 1> red_axis;
  red_axis[0] = 0;
  Tensor<DataType, 2, DataLayout, IndexType> in(tensorRange);
  Tensor<DataType, 2, DataLayout, IndexType> in_arg1(tensorRange);
  Tensor<DataType, 2, DataLayout, IndexType> in_arg2(tensorRange);
  Tensor<bool, 1, DataLayout, IndexType> out_arg_cpu(argRange);
  Tensor<bool, 1, DataLayout, IndexType> out_arg_gpu(argRange);
  Tensor<bool, 1, DataLayout, IndexType> out_arg_gpu_helper(argRange);
  Tensor<DataType, 0, DataLayout, IndexType> full_redux;
  Tensor<DataType, 0, DataLayout, IndexType> full_redux_gpu;

  in.setRandom();
  in_arg1.setRandom();
  in_arg2.setRandom();

  DataType* gpu_in_data = static_cast<DataType*>(
      sycl_device.allocate(in.dimensions().TotalSize() * sizeof(DataType)));
  DataType* gpu_in_arg1_data = static_cast<DataType*>(sycl_device.allocate(
      in_arg1.dimensions().TotalSize() * sizeof(DataType)));
  DataType* gpu_in_arg2_data = static_cast<DataType*>(sycl_device.allocate(
      in_arg2.dimensions().TotalSize() * sizeof(DataType)));
  bool* gpu_out_arg__gpu_helper_data = static_cast<bool*>(sycl_device.allocate(
      out_arg_gpu.dimensions().TotalSize() * sizeof(bool)));
  bool* gpu_out_arg_data = static_cast<bool*>(sycl_device.allocate(
      out_arg_gpu.dimensions().TotalSize() * sizeof(bool)));

  DataType* gpu_out_data =
      static_cast<DataType*>(sycl_device.allocate(sizeof(DataType)));

  TensorMap<Tensor<DataType, 2, DataLayout, IndexType>> in_gpu(gpu_in_data,
                                                               tensorRange);
  TensorMap<Tensor<DataType, 2, DataLayout, IndexType>> in_Arg1_gpu(
      gpu_in_arg1_data, tensorRange);
  TensorMap<Tensor<DataType, 2, DataLayout, IndexType>> in_Arg2_gpu(
      gpu_in_arg2_data, tensorRange);
  TensorMap<Tensor<bool, 1, DataLayout, IndexType>> out_Argout_gpu(
      gpu_out_arg_data, argRange);
  TensorMap<Tensor<bool, 1, DataLayout, IndexType>> out_Argout_gpu_helper(
      gpu_out_arg__gpu_helper_data, argRange);
  TensorMap<Tensor<DataType, 0, DataLayout, IndexType>> out_gpu(gpu_out_data);

  // CPU VERSION
  out_arg_cpu =
      (in_arg1.argmax(1) == in_arg2.argmax(1))
          .select(out_arg_cpu.constant(true), out_arg_cpu.constant(false));
  full_redux = (out_arg_cpu.template cast<DataType>())
                   .reduce(red_axis, Eigen::internal::MeanReducer<DataType>());

  // GPU VERSION
  sycl_device.memcpyHostToDevice(
      gpu_in_data, in.data(), (in.dimensions().TotalSize()) * sizeof(DataType));
  sycl_device.memcpyHostToDevice(
      gpu_in_arg1_data, in_arg1.data(),
      (in_arg1.dimensions().TotalSize()) * sizeof(DataType));
  sycl_device.memcpyHostToDevice(
      gpu_in_arg2_data, in_arg2.data(),
      (in_arg2.dimensions().TotalSize()) * sizeof(DataType));
  out_Argout_gpu_helper.device(sycl_device) =
      (in_Arg1_gpu.argmax(1) == in_Arg2_gpu.argmax(1));
  out_Argout_gpu.device(sycl_device) =
      (out_Argout_gpu_helper)
          .select(out_Argout_gpu.constant(true),
                  out_Argout_gpu.constant(false));
  out_gpu.device(sycl_device) =
      (out_Argout_gpu.template cast<DataType>())
          .reduce(red_axis, Eigen::internal::MeanReducer<DataType>());
  sycl_device.memcpyDeviceToHost(full_redux_gpu.data(), gpu_out_data,
                                 sizeof(DataType));
  // Check that the CPU and GPU reductions return the same result.
  std::cout << "SYCL : " << full_redux_gpu() << " , CPU : " << full_redux()
            << '\n';
  VERIFY_IS_EQUAL(full_redux_gpu(), full_redux());
  sycl_device.deallocate(gpu_in_data);
  sycl_device.deallocate(gpu_in_arg1_data);
  sycl_device.deallocate(gpu_in_arg2_data);
  sycl_device.deallocate(gpu_out_arg__gpu_helper_data);
  sycl_device.deallocate(gpu_out_arg_data);
  sycl_device.deallocate(gpu_out_data);
}

template <typename DataType, int DataLayout, typename IndexType>
static void test_full_reductions_mean_with_offset_sycl(
    const Eigen::SyclDevice& sycl_device) {
  using data_tensor = Tensor<DataType, 2, DataLayout, IndexType>;
  using scalar_tensor = Tensor<DataType, 0, DataLayout, IndexType>;
  const IndexType num_rows = 64;
  const IndexType num_cols = 64;
  array<IndexType, 2> tensor_range = {{num_rows, num_cols}};
  const IndexType n_elems = internal::array_prod(tensor_range);

  data_tensor in(tensor_range);
  scalar_tensor full_redux;
  scalar_tensor full_redux_gpu;

  in.setRandom();
  array<IndexType, 2> tensor_offset_range(tensor_range);
  tensor_offset_range[0] -= 1;

  const IndexType offset = 64;
  TensorMap<data_tensor> in_offset(in.data() + offset, tensor_offset_range);
  full_redux = in_offset.mean();
  VERIFY_IS_NOT_EQUAL(full_redux(), in(0));

  DataType* gpu_in_data =
      static_cast<DataType*>(sycl_device.allocate(n_elems * sizeof(DataType)));
  DataType* gpu_out_data =
      static_cast<DataType*>(sycl_device.allocate(sizeof(DataType)));

  TensorMap<data_tensor> in_gpu(gpu_in_data + offset, tensor_offset_range);
  TensorMap<scalar_tensor> out_gpu(gpu_out_data);
  sycl_device.memcpyHostToDevice(gpu_in_data, in.data(),
                                 n_elems * sizeof(DataType));
  out_gpu.device(sycl_device) = in_gpu.mean();
  sycl_device.memcpyDeviceToHost(full_redux_gpu.data(), gpu_out_data,
                                 sizeof(DataType));

  // Check that the CPU and GPU reductions return the same result.
  VERIFY_IS_APPROX(full_redux_gpu(), full_redux());

  sycl_device.deallocate(gpu_in_data);
  sycl_device.deallocate(gpu_out_data);
}

template <typename DataType, int DataLayout, typename IndexType>
static void test_full_reductions_mean_with_odd_offset_sycl(
    const Eigen::SyclDevice& sycl_device) {
  // This is a particular case which illustrates a possible problem when the
  // number of local threads in a workgroup is even, but is not a power of two.
  using data_tensor = Tensor<DataType, 1, DataLayout, IndexType>;
  using scalar_tensor = Tensor<DataType, 0, DataLayout, IndexType>;
  // 2177 = (17 * 128) + 1 gives rise to 18 local threads.
  // 8708 = 4 * 2177 = 4 * (17 * 128) + 4 uses 18 vectorised local threads.
  const IndexType n_elems = 8707;
  array<IndexType, 1> tensor_range = {{n_elems}};

  data_tensor in(tensor_range);
  DataType full_redux;
  DataType full_redux_gpu;
  TensorMap<scalar_tensor> red_cpu(&full_redux);
  TensorMap<scalar_tensor> red_gpu(&full_redux_gpu);

  const DataType const_val = static_cast<DataType>(0.6391);
  in = in.constant(const_val);

  Eigen::IndexList<Eigen::type2index<0>> red_axis;
  red_cpu = in.reduce(red_axis, Eigen::internal::MeanReducer<DataType>());
  VERIFY_IS_APPROX(const_val, red_cpu());

  DataType* gpu_in_data =
      static_cast<DataType*>(sycl_device.allocate(n_elems * sizeof(DataType)));
  DataType* gpu_out_data =
      static_cast<DataType*>(sycl_device.allocate(sizeof(DataType)));

  TensorMap<data_tensor> in_gpu(gpu_in_data, tensor_range);
  TensorMap<scalar_tensor> out_gpu(gpu_out_data);
  sycl_device.memcpyHostToDevice(gpu_in_data, in.data(),
                                 n_elems * sizeof(DataType));
  out_gpu.device(sycl_device) =
      in_gpu.reduce(red_axis, Eigen::internal::MeanReducer<DataType>());
  sycl_device.memcpyDeviceToHost(red_gpu.data(), gpu_out_data,
                                 sizeof(DataType));

  // Check that the CPU and GPU reductions return the same result.
  VERIFY_IS_APPROX(full_redux_gpu, full_redux);

  sycl_device.deallocate(gpu_in_data);
  sycl_device.deallocate(gpu_out_data);
}

template <typename DataType, int DataLayout, typename IndexType>
static void test_full_reductions_min_sycl(
    const Eigen::SyclDevice& sycl_device) {
  const IndexType num_rows = 876;
  const IndexType num_cols = 953;
  array<IndexType, 2> tensorRange = {{num_rows, num_cols}};

  Tensor<DataType, 2, DataLayout, IndexType> in(tensorRange);
  Tensor<DataType, 0, DataLayout, IndexType> full_redux;
  Tensor<DataType, 0, DataLayout, IndexType> full_redux_gpu;

  in.setRandom();

  full_redux = in.minimum();

  DataType* gpu_in_data = static_cast<DataType*>(
      sycl_device.allocate(in.dimensions().TotalSize() * sizeof(DataType)));
  DataType* gpu_out_data =
      static_cast<DataType*>(sycl_device.allocate(sizeof(DataType)));

  TensorMap<Tensor<DataType, 2, DataLayout, IndexType>> in_gpu(gpu_in_data,
                                                               tensorRange);
  TensorMap<Tensor<DataType, 0, DataLayout, IndexType>> out_gpu(gpu_out_data);

  sycl_device.memcpyHostToDevice(
      gpu_in_data, in.data(), (in.dimensions().TotalSize()) * sizeof(DataType));
  out_gpu.device(sycl_device) = in_gpu.minimum();
  sycl_device.memcpyDeviceToHost(full_redux_gpu.data(), gpu_out_data,
                                 sizeof(DataType));
  // Check that the CPU and GPU reductions return the same result.
  VERIFY_IS_APPROX(full_redux_gpu(), full_redux());
  sycl_device.deallocate(gpu_in_data);
  sycl_device.deallocate(gpu_out_data);
}

template <typename DataType, int DataLayout, typename IndexType>
static void test_full_reductions_min_with_offset_sycl(
    const Eigen::SyclDevice& sycl_device) {
  using data_tensor = Tensor<DataType, 2, DataLayout, IndexType>;
  using scalar_tensor = Tensor<DataType, 0, DataLayout, IndexType>;
  const IndexType num_rows = 64;
  const IndexType num_cols = 64;
  array<IndexType, 2> tensor_range = {{num_rows, num_cols}};
  const IndexType n_elems = internal::array_prod(tensor_range);

  data_tensor in(tensor_range);
  scalar_tensor full_redux;
  scalar_tensor full_redux_gpu;

  in.setRandom();
  array<IndexType, 2> tensor_offset_range(tensor_range);
  tensor_offset_range[0] -= 1;
  // Set the first element to be the minimum of the whole tensor.
  // As it is excluded from the offset reduction, the result should not be -2.
  in(0) = static_cast<DataType>(-2);

  const IndexType offset = 64;
  TensorMap<data_tensor> in_offset(in.data() + offset, tensor_offset_range);
  full_redux = in_offset.minimum();
  VERIFY_IS_NOT_EQUAL(full_redux(), in(0));

  DataType* gpu_in_data =
      static_cast<DataType*>(sycl_device.allocate(n_elems * sizeof(DataType)));
  DataType* gpu_out_data =
      static_cast<DataType*>(sycl_device.allocate(sizeof(DataType)));

  TensorMap<data_tensor> in_gpu(gpu_in_data + offset, tensor_offset_range);
  TensorMap<scalar_tensor> out_gpu(gpu_out_data);
  sycl_device.memcpyHostToDevice(gpu_in_data, in.data(),
                                 n_elems * sizeof(DataType));
  out_gpu.device(sycl_device) = in_gpu.minimum();
  sycl_device.memcpyDeviceToHost(full_redux_gpu.data(), gpu_out_data,
                                 sizeof(DataType));

  // Check that the CPU and GPU reductions return the same result.
  VERIFY_IS_APPROX(full_redux_gpu(), full_redux());

  sycl_device.deallocate(gpu_in_data);
  sycl_device.deallocate(gpu_out_data);
}
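
// The remaining tests perform partial reductions along a single dimension
// (first or last), so the result is a lower-rank tensor rather than a scalar.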
template <typename DataType, int DataLayout, typename IndexType>
static void test_first_dim_reductions_max_sycl(
    const Eigen::SyclDevice& sycl_device) {
  IndexType dim_x = 145;
  IndexType dim_y = 1;
  IndexType dim_z = 67;

  array<IndexType, 3> tensorRange = {{dim_x, dim_y, dim_z}};
  Eigen::array<IndexType, 1> red_axis;
  red_axis[0] = 0;
  array<IndexType, 2> reduced_tensorRange = {{dim_y, dim_z}};

  Tensor<DataType, 3, DataLayout, IndexType> in(tensorRange);
  Tensor<DataType, 2, DataLayout, IndexType> redux(reduced_tensorRange);
  Tensor<DataType, 2, DataLayout, IndexType> redux_gpu(reduced_tensorRange);

  in.setRandom();

  redux = in.maximum(red_axis);

  DataType* gpu_in_data = static_cast<DataType*>(
      sycl_device.allocate(in.dimensions().TotalSize() * sizeof(DataType)));
  DataType* gpu_out_data = static_cast<DataType*>(sycl_device.allocate(
      redux_gpu.dimensions().TotalSize() * sizeof(DataType)));

  TensorMap<Tensor<DataType, 3, DataLayout, IndexType>> in_gpu(gpu_in_data,
                                                               tensorRange);
  TensorMap<Tensor<DataType, 2, DataLayout, IndexType>> out_gpu(
      gpu_out_data, reduced_tensorRange);

  sycl_device.memcpyHostToDevice(
      gpu_in_data, in.data(), (in.dimensions().TotalSize()) * sizeof(DataType));
  out_gpu.device(sycl_device) = in_gpu.maximum(red_axis);
  sycl_device.memcpyDeviceToHost(
      redux_gpu.data(), gpu_out_data,
      redux_gpu.dimensions().TotalSize() * sizeof(DataType));

  // Check that the CPU and GPU reductions return the same result.
  for (IndexType j = 0; j < reduced_tensorRange[0]; j++)
    for (IndexType k = 0; k < reduced_tensorRange[1]; k++)
      VERIFY_IS_APPROX(redux_gpu(j, k), redux(j, k));

  sycl_device.deallocate(gpu_in_data);
  sycl_device.deallocate(gpu_out_data);
}

template <typename DataType, int DataLayout, typename IndexType>
static void test_first_dim_reductions_max_with_offset_sycl(
    const Eigen::SyclDevice& sycl_device) {
  using data_tensor = Tensor<DataType, 2, DataLayout, IndexType>;
  using reduced_tensor = Tensor<DataType, 1, DataLayout, IndexType>;

  const IndexType num_rows = 64;
  const IndexType num_cols = 64;
  array<IndexType, 2> tensor_range = {{num_rows, num_cols}};
  array<IndexType, 1> reduced_range = {{num_cols}};
  const IndexType n_elems = internal::array_prod(tensor_range);
  const IndexType n_reduced = num_cols;

  data_tensor in(tensor_range);
  reduced_tensor redux;
  reduced_tensor redux_gpu(reduced_range);

  in.setRandom();
  array<IndexType, 2> tensor_offset_range(tensor_range);
  tensor_offset_range[0] -= 1;
  // Set maximum value outside of the considered range.
  for (IndexType i = 0; i < n_reduced; i++) {
    in(i) = static_cast<DataType>(2);
  }

  Eigen::array<IndexType, 1> red_axis;
  red_axis[0] = 0;

  const IndexType offset = 64;
  TensorMap<data_tensor> in_offset(in.data() + offset, tensor_offset_range);
  redux = in_offset.maximum(red_axis);
  for (IndexType i = 0; i < n_reduced; i++) {
    VERIFY_IS_NOT_EQUAL(redux(i), in(i));
  }

  DataType* gpu_in_data =
      static_cast<DataType*>(sycl_device.allocate(n_elems * sizeof(DataType)));
  DataType* gpu_out_data = static_cast<DataType*>(
      sycl_device.allocate(n_reduced * sizeof(DataType)));

  TensorMap<data_tensor> in_gpu(gpu_in_data + offset, tensor_offset_range);
  TensorMap<reduced_tensor> out_gpu(gpu_out_data, reduced_range);
  sycl_device.memcpyHostToDevice(gpu_in_data, in.data(),
                                 n_elems * sizeof(DataType));
  out_gpu.device(sycl_device) = in_gpu.maximum(red_axis);
  sycl_device.memcpyDeviceToHost(redux_gpu.data(), gpu_out_data,
                                 n_reduced * sizeof(DataType));

  // Check that the CPU and GPU reductions return the same result.
  for (IndexType i = 0; i < n_reduced; i++) {
    VERIFY_IS_APPROX(redux_gpu(i), redux(i));
  }

  sycl_device.deallocate(gpu_in_data);
  sycl_device.deallocate(gpu_out_data);
}

template <typename DataType, int DataLayout, typename IndexType>
static void test_last_dim_reductions_max_with_offset_sycl(
    const Eigen::SyclDevice& sycl_device) {
  using data_tensor = Tensor<DataType, 2, DataLayout, IndexType>;
  using reduced_tensor = Tensor<DataType, 1, DataLayout, IndexType>;

  const IndexType num_rows = 64;
  const IndexType num_cols = 64;
  array<IndexType, 2> tensor_range = {{num_rows, num_cols}};
  array<IndexType, 1> full_reduced_range = {{num_rows}};
  array<IndexType, 1> reduced_range = {{num_rows - 1}};
  const IndexType n_elems = internal::array_prod(tensor_range);
  const IndexType n_reduced = reduced_range[0];

  data_tensor in(tensor_range);
  reduced_tensor redux(full_reduced_range);
  reduced_tensor redux_gpu(reduced_range);

  in.setRandom();
  redux.setZero();
  array<IndexType, 2> tensor_offset_range(tensor_range);
  tensor_offset_range[0] -= 1;
  // Set maximum value outside of the considered range.
  for (IndexType i = 0; i < n_reduced; i++) {
    in(i) = static_cast<DataType>(2);
  }

  Eigen::array<IndexType, 1> red_axis;
  red_axis[0] = 1;

  const IndexType offset = 64;
  // Introduce an offset in both the input and the output.
  TensorMap<data_tensor> in_offset(in.data() + offset, tensor_offset_range);
  TensorMap<reduced_tensor> red_offset(redux.data() + 1, reduced_range);
  red_offset = in_offset.maximum(red_axis);

  // Check that the first value hasn't been changed and that the reduced values
  // are not equal to the previously set maximum in the input outside the range.
  VERIFY_IS_EQUAL(redux(0), static_cast<DataType>(0));
  for (IndexType i = 0; i < n_reduced; i++) {
    VERIFY_IS_NOT_EQUAL(red_offset(i), in(i));
  }

  DataType* gpu_in_data =
      static_cast<DataType*>(sycl_device.allocate(n_elems * sizeof(DataType)));
  DataType* gpu_out_data = static_cast<DataType*>(
      sycl_device.allocate((n_reduced + 1) * sizeof(DataType)));

  TensorMap<data_tensor> in_gpu(gpu_in_data + offset, tensor_offset_range);
  TensorMap<reduced_tensor> out_gpu(gpu_out_data + 1, reduced_range);
  sycl_device.memcpyHostToDevice(gpu_in_data, in.data(),
                                 n_elems * sizeof(DataType));
  out_gpu.device(sycl_device) = in_gpu.maximum(red_axis);
  sycl_device.memcpyDeviceToHost(redux_gpu.data(), out_gpu.data(),
                                 n_reduced * sizeof(DataType));

  // Check that the CPU and GPU reductions return the same result.
  for (IndexType i = 0; i < n_reduced; i++) {
    VERIFY_IS_APPROX(redux_gpu(i), red_offset(i));
  }

  sycl_device.deallocate(gpu_in_data);
  sycl_device.deallocate(gpu_out_data);
}

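// This variant takes the tensor dimensions as runtime arguments so that the
// callers below can exercise both large (4197 x 4097) and small (129 x 8)
// problem sizes with the same code.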
template <typename DataType, int DataLayout, typename IndexType>
static void test_first_dim_reductions_sum_sycl(
    const Eigen::SyclDevice& sycl_device, IndexType dim_x, IndexType dim_y) {
  array<IndexType, 2> tensorRange = {{dim_x, dim_y}};
  Eigen::array<IndexType, 1> red_axis;
  red_axis[0] = 0;
  array<IndexType, 1> reduced_tensorRange = {{dim_y}};

  Tensor<DataType, 2, DataLayout, IndexType> in(tensorRange);
  Tensor<DataType, 1, DataLayout, IndexType> redux(reduced_tensorRange);
  Tensor<DataType, 1, DataLayout, IndexType> redux_gpu(reduced_tensorRange);

  in.setRandom();
  redux = in.sum(red_axis);

  DataType* gpu_in_data = static_cast<DataType*>(
      sycl_device.allocate(in.dimensions().TotalSize() * sizeof(DataType)));
  DataType* gpu_out_data = static_cast<DataType*>(sycl_device.allocate(
      redux_gpu.dimensions().TotalSize() * sizeof(DataType)));

  TensorMap<Tensor<DataType, 2, DataLayout, IndexType>> in_gpu(gpu_in_data,
                                                               tensorRange);
  TensorMap<Tensor<DataType, 1, DataLayout, IndexType>> out_gpu(
      gpu_out_data, reduced_tensorRange);

  sycl_device.memcpyHostToDevice(
      gpu_in_data, in.data(), (in.dimensions().TotalSize()) * sizeof(DataType));
  out_gpu.device(sycl_device) = in_gpu.sum(red_axis);
  sycl_device.memcpyDeviceToHost(
      redux_gpu.data(), gpu_out_data,
      redux_gpu.dimensions().TotalSize() * sizeof(DataType));

  // Check that the CPU and GPU reductions return the same result.
  for (IndexType i = 0; i < redux.size(); i++) {
    VERIFY_IS_APPROX(redux_gpu.data()[i], redux.data()[i]);
  }
  sycl_device.deallocate(gpu_in_data);
  sycl_device.deallocate(gpu_out_data);
}

template <typename DataType, int DataLayout, typename IndexType>
static void test_first_dim_reductions_mean_sycl(
    const Eigen::SyclDevice& sycl_device) {
  IndexType dim_x = 145;
  IndexType dim_y = 1;
  IndexType dim_z = 67;

  array<IndexType, 3> tensorRange = {{dim_x, dim_y, dim_z}};
  Eigen::array<IndexType, 1> red_axis;
  red_axis[0] = 0;
  array<IndexType, 2> reduced_tensorRange = {{dim_y, dim_z}};

  Tensor<DataType, 3, DataLayout, IndexType> in(tensorRange);
  Tensor<DataType, 2, DataLayout, IndexType> redux(reduced_tensorRange);
  Tensor<DataType, 2, DataLayout, IndexType> redux_gpu(reduced_tensorRange);

  in.setRandom();

  redux = in.mean(red_axis);

  DataType* gpu_in_data = static_cast<DataType*>(
      sycl_device.allocate(in.dimensions().TotalSize() * sizeof(DataType)));
  DataType* gpu_out_data = static_cast<DataType*>(sycl_device.allocate(
      redux_gpu.dimensions().TotalSize() * sizeof(DataType)));

  TensorMap<Tensor<DataType, 3, DataLayout, IndexType>> in_gpu(gpu_in_data,
                                                               tensorRange);
  TensorMap<Tensor<DataType, 2, DataLayout, IndexType>> out_gpu(
      gpu_out_data, reduced_tensorRange);

  sycl_device.memcpyHostToDevice(
      gpu_in_data, in.data(), (in.dimensions().TotalSize()) * sizeof(DataType));
  out_gpu.device(sycl_device) = in_gpu.mean(red_axis);
  sycl_device.memcpyDeviceToHost(
      redux_gpu.data(), gpu_out_data,
      redux_gpu.dimensions().TotalSize() * sizeof(DataType));

  // Check that the CPU and GPU reductions return the same result.
  for (IndexType j = 0; j < reduced_tensorRange[0]; j++)
    for (IndexType k = 0; k < reduced_tensorRange[1]; k++)
      VERIFY_IS_APPROX(redux_gpu(j, k), redux(j, k));

  sycl_device.deallocate(gpu_in_data);
  sycl_device.deallocate(gpu_out_data);
}

template <typename DataType, int DataLayout, typename IndexType>
static void test_last_dim_reductions_mean_sycl(
    const Eigen::SyclDevice& sycl_device) {
  IndexType dim_x = 64;
  IndexType dim_y = 1;
  IndexType dim_z = 32;

  array<IndexType, 3> tensorRange = {{dim_x, dim_y, dim_z}};
  Eigen::array<IndexType, 1> red_axis;
  red_axis[0] = 2;
  array<IndexType, 2> reduced_tensorRange = {{dim_x, dim_y}};

  Tensor<DataType, 3, DataLayout, IndexType> in(tensorRange);
  Tensor<DataType, 2, DataLayout, IndexType> redux(reduced_tensorRange);
  Tensor<DataType, 2, DataLayout, IndexType> redux_gpu(reduced_tensorRange);

  in.setRandom();

  redux = in.mean(red_axis);

  DataType* gpu_in_data = static_cast<DataType*>(
      sycl_device.allocate(in.dimensions().TotalSize() * sizeof(DataType)));
  DataType* gpu_out_data = static_cast<DataType*>(sycl_device.allocate(
      redux_gpu.dimensions().TotalSize() * sizeof(DataType)));

  TensorMap<Tensor<DataType, 3, DataLayout, IndexType>> in_gpu(gpu_in_data,
                                                               tensorRange);
  TensorMap<Tensor<DataType, 2, DataLayout, IndexType>> out_gpu(
      gpu_out_data, reduced_tensorRange);

  sycl_device.memcpyHostToDevice(
      gpu_in_data, in.data(), (in.dimensions().TotalSize()) * sizeof(DataType));
  out_gpu.device(sycl_device) = in_gpu.mean(red_axis);
  sycl_device.memcpyDeviceToHost(
      redux_gpu.data(), gpu_out_data,
      redux_gpu.dimensions().TotalSize() * sizeof(DataType));
  // Check that the CPU and GPU reductions return the same result.
  for (IndexType j = 0; j < reduced_tensorRange[0]; j++)
    for (IndexType k = 0; k < reduced_tensorRange[1]; k++)
      VERIFY_IS_APPROX(redux_gpu(j, k), redux(j, k));

  sycl_device.deallocate(gpu_in_data);
  sycl_device.deallocate(gpu_out_data);
}

template <typename DataType, int DataLayout, typename IndexType>
static void test_last_dim_reductions_sum_sycl(
    const Eigen::SyclDevice& sycl_device) {
  IndexType dim_x = 64;
  IndexType dim_y = 1;
  IndexType dim_z = 32;

  array<IndexType, 3> tensorRange = {{dim_x, dim_y, dim_z}};
  Eigen::array<IndexType, 1> red_axis;
  red_axis[0] = 2;
  array<IndexType, 2> reduced_tensorRange = {{dim_x, dim_y}};

  Tensor<DataType, 3, DataLayout, IndexType> in(tensorRange);
  Tensor<DataType, 2, DataLayout, IndexType> redux(reduced_tensorRange);
  Tensor<DataType, 2, DataLayout, IndexType> redux_gpu(reduced_tensorRange);

  in.setRandom();

  redux = in.sum(red_axis);

  DataType* gpu_in_data = static_cast<DataType*>(
      sycl_device.allocate(in.dimensions().TotalSize() * sizeof(DataType)));
  DataType* gpu_out_data = static_cast<DataType*>(sycl_device.allocate(
      redux_gpu.dimensions().TotalSize() * sizeof(DataType)));

  TensorMap<Tensor<DataType, 3, DataLayout, IndexType>> in_gpu(gpu_in_data,
                                                               tensorRange);
  TensorMap<Tensor<DataType, 2, DataLayout, IndexType>> out_gpu(
      gpu_out_data, reduced_tensorRange);

  sycl_device.memcpyHostToDevice(
      gpu_in_data, in.data(), (in.dimensions().TotalSize()) * sizeof(DataType));
  out_gpu.device(sycl_device) = in_gpu.sum(red_axis);
  sycl_device.memcpyDeviceToHost(
      redux_gpu.data(), gpu_out_data,
      redux_gpu.dimensions().TotalSize() * sizeof(DataType));
  // Check that the CPU and GPU reductions return the same result.
  for (IndexType j = 0; j < reduced_tensorRange[0]; j++)
    for (IndexType k = 0; k < reduced_tensorRange[1]; k++)
      VERIFY_IS_APPROX(redux_gpu(j, k), redux(j, k));

  sycl_device.deallocate(gpu_in_data);
  sycl_device.deallocate(gpu_out_data);
}

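// The fixed-size variants below use TensorFixedSize with compile-time Sizes<>
// dimensions and an Eigen::IndexList reduction axis instead of dynamically
// sized tensors.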
template <typename DataType, int DataLayout, typename IndexType>
static void test_last_reductions_sum_sycl(
    const Eigen::SyclDevice& sycl_device) {
  auto tensorRange = Sizes<64, 32>(64, 32);
  Eigen::IndexList<Eigen::type2index<1>> red_axis;
  auto reduced_tensorRange = Sizes<64>(64);
  TensorFixedSize<DataType, Sizes<64, 32>, DataLayout> in_fix;
  TensorFixedSize<DataType, Sizes<64>, DataLayout> redux_fix;
  TensorFixedSize<DataType, Sizes<64>, DataLayout> redux_gpu_fix;

  in_fix.setRandom();

  redux_fix = in_fix.sum(red_axis);

  DataType* gpu_in_data = static_cast<DataType*>(
      sycl_device.allocate(in_fix.dimensions().TotalSize() * sizeof(DataType)));
  DataType* gpu_out_data = static_cast<DataType*>(sycl_device.allocate(
      redux_gpu_fix.dimensions().TotalSize() * sizeof(DataType)));

  TensorMap<TensorFixedSize<DataType, Sizes<64, 32>, DataLayout>> in_gpu_fix(
      gpu_in_data, tensorRange);
  TensorMap<TensorFixedSize<DataType, Sizes<64>, DataLayout>> out_gpu_fix(
      gpu_out_data, reduced_tensorRange);

  sycl_device.memcpyHostToDevice(
      gpu_in_data, in_fix.data(),
      (in_fix.dimensions().TotalSize()) * sizeof(DataType));
  out_gpu_fix.device(sycl_device) = in_gpu_fix.sum(red_axis);
  sycl_device.memcpyDeviceToHost(
      redux_gpu_fix.data(), gpu_out_data,
      redux_gpu_fix.dimensions().TotalSize() * sizeof(DataType));
  // Check that the CPU and GPU reductions return the same result.
  for (IndexType j = 0; j < reduced_tensorRange[0]; j++) {
    VERIFY_IS_APPROX(redux_gpu_fix(j), redux_fix(j));
  }

  sycl_device.deallocate(gpu_in_data);
  sycl_device.deallocate(gpu_out_data);
}

template <typename DataType, int DataLayout, typename IndexType>
static void test_last_reductions_mean_sycl(
    const Eigen::SyclDevice& sycl_device) {
  auto tensorRange = Sizes<64, 32>(64, 32);
  Eigen::IndexList<Eigen::type2index<1>> red_axis;
  auto reduced_tensorRange = Sizes<64>(64);
  TensorFixedSize<DataType, Sizes<64, 32>, DataLayout> in_fix;
  TensorFixedSize<DataType, Sizes<64>, DataLayout> redux_fix;
  TensorFixedSize<DataType, Sizes<64>, DataLayout> redux_gpu_fix;

  in_fix.setRandom();
  redux_fix = in_fix.mean(red_axis);

  DataType* gpu_in_data = static_cast<DataType*>(
      sycl_device.allocate(in_fix.dimensions().TotalSize() * sizeof(DataType)));
  DataType* gpu_out_data = static_cast<DataType*>(sycl_device.allocate(
      redux_gpu_fix.dimensions().TotalSize() * sizeof(DataType)));

  TensorMap<TensorFixedSize<DataType, Sizes<64, 32>, DataLayout>> in_gpu_fix(
      gpu_in_data, tensorRange);
  TensorMap<TensorFixedSize<DataType, Sizes<64>, DataLayout>> out_gpu_fix(
      gpu_out_data, reduced_tensorRange);

  sycl_device.memcpyHostToDevice(
      gpu_in_data, in_fix.data(),
      (in_fix.dimensions().TotalSize()) * sizeof(DataType));
  out_gpu_fix.device(sycl_device) = in_gpu_fix.mean(red_axis);
  sycl_device.memcpyDeviceToHost(
      redux_gpu_fix.data(), gpu_out_data,
      redux_gpu_fix.dimensions().TotalSize() * sizeof(DataType));
  sycl_device.synchronize();
  // Check that the CPU and GPU reductions return the same result.
  for (IndexType j = 0; j < reduced_tensorRange[0]; j++) {
    VERIFY_IS_APPROX(redux_gpu_fix(j), redux_fix(j));
  }

  sycl_device.deallocate(gpu_in_data);
  sycl_device.deallocate(gpu_out_data);
}

// SYCL supports the generic case of a reduction whose accumulator type differs
// from the input data type. This reducer shows how to detect, in a single
// reduction, whether a Tensor contains NaN and/or Inf values.
template <typename InT, typename OutT>
struct CustomReducer {
  static const bool PacketAccess = false;
  static const bool IsStateful = false;

  static constexpr OutT InfBit = 1;
  static constexpr OutT NanBit = 2;

  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void reduce(const InT x,
                                                    OutT* accum) const {
    if (Eigen::numext::isinf(x))
      *accum |= InfBit;
    else if (Eigen::numext::isnan(x))
      *accum |= NanBit;
  }

  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void reduce(const OutT x,
                                                    OutT* accum) const {
    *accum |= x;
  }

  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE OutT initialize() const {
    return OutT(0);
  }

  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE OutT finalize(const OutT accum) const {
    return accum;
  }
};

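// setRandom() only produces finite values, so neither the Inf nor the NaN bit
// should ever be set and the reduced accumulator is expected to stay 0.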
template <typename DataType, typename AccumType, int DataLayout,
          typename IndexType>
static void test_full_reductions_custom_sycl(
    const Eigen::SyclDevice& sycl_device) {
  constexpr IndexType InSize = 64;
  auto tensorRange = Sizes<InSize>(InSize);
  Eigen::IndexList<Eigen::type2index<0>> dims;
  auto reduced_tensorRange = Sizes<>();
  TensorFixedSize<DataType, Sizes<InSize>, DataLayout> in_fix;
  TensorFixedSize<AccumType, Sizes<>, DataLayout> redux_gpu_fix;

  CustomReducer<DataType, AccumType> reducer;

  in_fix.setRandom();

  size_t in_size_bytes = in_fix.dimensions().TotalSize() * sizeof(DataType);
  DataType* gpu_in_data =
      static_cast<DataType*>(sycl_device.allocate(in_size_bytes));
  AccumType* gpu_out_data =
      static_cast<AccumType*>(sycl_device.allocate(sizeof(AccumType)));

  TensorMap<TensorFixedSize<DataType, Sizes<InSize>, DataLayout>> in_gpu_fix(
      gpu_in_data, tensorRange);
  TensorMap<TensorFixedSize<AccumType, Sizes<>, DataLayout>> out_gpu_fix(
      gpu_out_data, reduced_tensorRange);

  sycl_device.memcpyHostToDevice(gpu_in_data, in_fix.data(), in_size_bytes);
  out_gpu_fix.device(sycl_device) = in_gpu_fix.reduce(dims, reducer);
  sycl_device.memcpyDeviceToHost(redux_gpu_fix.data(), gpu_out_data,
                                 sizeof(AccumType));
  VERIFY_IS_EQUAL(redux_gpu_fix(0), AccumType(0));

  sycl_device.deallocate(gpu_in_data);
  sycl_device.deallocate(gpu_out_data);
}

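// The helpers below instantiate each test for both RowMajor and ColMajor
// layouts on the given device and synchronize once all sub-tests have been
// enqueued.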
template <typename DataType, typename Dev>
void sycl_reduction_test_full_per_device(const Dev& sycl_device) {
  test_full_reductions_sum_sycl<DataType, RowMajor, int64_t>(sycl_device);
  test_full_reductions_sum_sycl<DataType, ColMajor, int64_t>(sycl_device);
  test_full_reductions_min_sycl<DataType, ColMajor, int64_t>(sycl_device);
  test_full_reductions_min_sycl<DataType, RowMajor, int64_t>(sycl_device);
  test_full_reductions_max_sycl<DataType, ColMajor, int64_t>(sycl_device);
  test_full_reductions_max_sycl<DataType, RowMajor, int64_t>(sycl_device);

  test_full_reductions_mean_sycl<DataType, ColMajor, int64_t>(sycl_device);
  test_full_reductions_mean_sycl<DataType, RowMajor, int64_t>(sycl_device);
  test_full_reductions_custom_sycl<DataType, int, RowMajor, int64_t>(
      sycl_device);
  test_full_reductions_custom_sycl<DataType, int, ColMajor, int64_t>(
      sycl_device);
  sycl_device.synchronize();
}

template <typename DataType, typename Dev>
void sycl_reduction_full_offset_per_device(const Dev& sycl_device) {
  test_full_reductions_sum_with_offset_sycl<DataType, RowMajor, int64_t>(
      sycl_device);
  test_full_reductions_sum_with_offset_sycl<DataType, ColMajor, int64_t>(
      sycl_device);
  test_full_reductions_min_with_offset_sycl<DataType, RowMajor, int64_t>(
      sycl_device);
  test_full_reductions_min_with_offset_sycl<DataType, ColMajor, int64_t>(
      sycl_device);
  test_full_reductions_max_with_offset_sycl<DataType, ColMajor, int64_t>(
      sycl_device);
  test_full_reductions_max_with_offset_sycl<DataType, RowMajor, int64_t>(
      sycl_device);
  test_full_reductions_mean_with_offset_sycl<DataType, RowMajor, int64_t>(
      sycl_device);
  test_full_reductions_mean_with_offset_sycl<DataType, ColMajor, int64_t>(
      sycl_device);
  test_full_reductions_mean_with_odd_offset_sycl<DataType, RowMajor, int64_t>(
      sycl_device);
  sycl_device.synchronize();
}

template <typename DataType, typename Dev>
void sycl_reduction_test_first_dim_per_device(const Dev& sycl_device) {
  test_first_dim_reductions_sum_sycl<DataType, ColMajor, int64_t>(sycl_device,
                                                                  4197, 4097);
  test_first_dim_reductions_sum_sycl<DataType, RowMajor, int64_t>(sycl_device,
                                                                  4197, 4097);
  test_first_dim_reductions_sum_sycl<DataType, RowMajor, int64_t>(sycl_device,
                                                                  129, 8);
  test_first_dim_reductions_max_sycl<DataType, RowMajor, int64_t>(sycl_device);
  test_first_dim_reductions_max_with_offset_sycl<DataType, RowMajor, int64_t>(
      sycl_device);
  sycl_device.synchronize();
}

template <typename DataType, typename Dev>
void sycl_reduction_test_last_dim_per_device(const Dev& sycl_device) {
  test_last_dim_reductions_sum_sycl<DataType, RowMajor, int64_t>(sycl_device);
  test_last_dim_reductions_max_with_offset_sycl<DataType, RowMajor, int64_t>(
      sycl_device);
  test_last_reductions_sum_sycl<DataType, ColMajor, int64_t>(sycl_device);
  test_last_reductions_sum_sycl<DataType, RowMajor, int64_t>(sycl_device);
  test_last_reductions_mean_sycl<DataType, ColMajor, int64_t>(sycl_device);
  test_last_reductions_mean_sycl<DataType, RowMajor, int64_t>(sycl_device);
  sycl_device.synchronize();
}

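// The test entry point enumerates every supported SYCL device and runs each
// group of sub-tests on it.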
EIGEN_DECLARE_TEST(cxx11_tensor_reduction_sycl) {
  for (const auto& device : Eigen::get_sycl_supported_devices()) {
    std::cout << "Running on "
              << device.template get_info<cl::sycl::info::device::name>()
              << std::endl;
    QueueInterface queueInterface(device);
    auto sycl_device = Eigen::SyclDevice(&queueInterface);
    CALL_SUBTEST_1(sycl_reduction_test_full_per_device<float>(sycl_device));
    CALL_SUBTEST_2(sycl_reduction_full_offset_per_device<float>(sycl_device));
    CALL_SUBTEST_3(
        sycl_reduction_test_first_dim_per_device<float>(sycl_device));
    CALL_SUBTEST_4(sycl_reduction_test_last_dim_per_device<float>(sycl_device));
  }
}