From 42c89547896b403d5b62046bbe0cdccdea883934 Mon Sep 17 00:00:00 2001 From: Tom Deakin Date: Thu, 21 Jan 2021 10:37:56 +0000 Subject: [PATCH] [SYCL 2020] use new reduction for dot kernel --- SYCLStream.cpp | 43 ++++++++++++------------------------------- 1 file changed, 12 insertions(+), 31 deletions(-) diff --git a/SYCLStream.cpp b/SYCLStream.cpp index 049c26e..109b883 100644 --- a/SYCLStream.cpp +++ b/SYCLStream.cpp @@ -69,7 +69,7 @@ SYCLStream::SYCLStream(const int ARRAY_SIZE, const int device_index) d_a = new sycl::buffer(array_size); d_b = new sycl::buffer(array_size); d_c = new sycl::buffer(array_size); - d_sum = new sycl::buffer(dot_num_groups); + d_sum = new sycl::buffer(1); } template @@ -150,47 +150,28 @@ void SYCLStream::triad() template T SYCLStream::dot() { + queue->submit([&](sycl::handler &cgh) { sycl::accessor ka {*d_a, cgh, sycl::read_only}; sycl::accessor kb {*d_b, cgh, sycl::read_only}; - sycl::accessor ksum {*d_sum, cgh, sycl::write_only}; - //sycl::local_accessor wg_sum {sycl::range<1>(dot_wgsize), cgh}; - auto wg_sum = sycl::accessor(sycl::range<1>(dot_wgsize), cgh); + // Reduction object, to perform summation + // Initialises the result to zero + auto sumReducer = sycl::reduction(*d_sum, cgh, std::plus(), sycl::property::reduction::initialize_to_identity); - size_t N = array_size; - cgh.parallel_for(sycl::nd_range<1>(dot_num_groups*dot_wgsize, dot_wgsize), [=](sycl::nd_item<1> item) + cgh.parallel_for(sycl::range<1>{array_size}, sumReducer, [=](sycl::id<1> idx, auto& sum) { - size_t i = item.get_global_id(0); - size_t li = item.get_local_id(0); - size_t global_size = item.get_global_range()[0]; - - wg_sum[li] = 0.0; - for (; i < N; i += global_size) - wg_sum[li] += ka[i] * kb[i]; - - size_t local_size = item.get_local_range()[0]; - for (int offset = local_size / 2; offset > 0; offset /= 2) - { - item.barrier(sycl::access::fence_space::local_space); - if (li < offset) - wg_sum[li] += wg_sum[li + offset]; - } - - if (li == 0) - ksum[item.get_group(0)] = wg_sum[0]; + sum += ka[idx] * kb[idx]; }); + }); - T sum = 0.0; - auto h_sum = d_sum->template get_access(); - for (int i = 0; i < dot_num_groups; i++) - { - sum += h_sum[i]; - } + // Get access on the host, and return a copy of the data (single number) + // This will block until the result is available, so no need to wait on the queue. + sycl::host_accessor result {*d_sum, sycl::read_only}; + return result[0]; - return sum; } template