From 8c72b52f16a21cd77181ebeaac32f794075377c0 Mon Sep 17 00:00:00 2001 From: Tom Deakin Date: Tue, 12 Jan 2021 11:00:54 +0000 Subject: [PATCH 01/26] [SYCL 2020] Use unnamed lamdas --- SYCLStream.cpp | 12 ++++++------ SYCLStream.h | 18 ------------------ 2 files changed, 6 insertions(+), 24 deletions(-) diff --git a/SYCLStream.cpp b/SYCLStream.cpp index 8ab642f..dfaf13c 100644 --- a/SYCLStream.cpp +++ b/SYCLStream.cpp @@ -91,7 +91,7 @@ void SYCLStream::copy() { auto ka = d_a->template get_access(cgh); auto kc = d_c->template get_access(cgh); - cgh.parallel_for(range<1>{array_size}, [=](id<1> idx) + cgh.parallel_for(range<1>{array_size}, [=](id<1> idx) { kc[idx] = ka[idx]; }); @@ -107,7 +107,7 @@ void SYCLStream::mul() { auto kb = d_b->template get_access(cgh); auto kc = d_c->template get_access(cgh); - cgh.parallel_for(range<1>{array_size}, [=](id<1> idx) + cgh.parallel_for(range<1>{array_size}, [=](id<1> idx) { kb[idx] = scalar * kc[idx]; }); @@ -123,7 +123,7 @@ void SYCLStream::add() auto ka = d_a->template get_access(cgh); auto kb = d_b->template get_access(cgh); auto kc = d_c->template get_access(cgh); - cgh.parallel_for(range<1>{array_size}, [=](id<1> idx) + cgh.parallel_for(range<1>{array_size}, [=](id<1> idx) { kc[idx] = ka[idx] + kb[idx]; }); @@ -140,7 +140,7 @@ void SYCLStream::triad() auto ka = d_a->template get_access(cgh); auto kb = d_b->template get_access(cgh); auto kc = d_c->template get_access(cgh); - cgh.parallel_for(range<1>{array_size}, [=](id<1> idx) + cgh.parallel_for(range<1>{array_size}, [=](id<1> idx) { ka[idx] = kb[idx] + scalar * kc[idx]; }); @@ -160,7 +160,7 @@ T SYCLStream::dot() auto wg_sum = accessor(range<1>(dot_wgsize), cgh); size_t N = array_size; - cgh.parallel_for(nd_range<1>(dot_num_groups*dot_wgsize, dot_wgsize), [=](nd_item<1> item) + cgh.parallel_for(nd_range<1>(dot_num_groups*dot_wgsize, dot_wgsize), [=](nd_item<1> item) { size_t i = item.get_global_id(0); size_t li = item.get_local_id(0); @@ -201,7 +201,7 @@ void SYCLStream::init_arrays(T initA, T initB, T initC) auto ka = d_a->template get_access(cgh); auto kb = d_b->template get_access(cgh); auto kc = d_c->template get_access(cgh); - cgh.parallel_for(range<1>{array_size}, [=](item<1> item) + cgh.parallel_for(range<1>{array_size}, [=](item<1> item) { auto id = item.get_id(0); ka[id] = initA; diff --git a/SYCLStream.h b/SYCLStream.h index df10946..cb1a45a 100644 --- a/SYCLStream.h +++ b/SYCLStream.h @@ -15,16 +15,6 @@ #define IMPLEMENTATION_STRING "SYCL" -namespace sycl_kernels -{ - template class init; - template class copy; - template class mul; - template class add; - template class triad; - template class dot; -} - template class SYCLStream : public Stream { @@ -39,14 +29,6 @@ class SYCLStream : public Stream cl::sycl::buffer *d_c; cl::sycl::buffer *d_sum; - // SYCL kernel names - typedef sycl_kernels::init init_kernel; - typedef sycl_kernels::copy copy_kernel; - typedef sycl_kernels::mul mul_kernel; - typedef sycl_kernels::add add_kernel; - typedef sycl_kernels::triad triad_kernel; - typedef sycl_kernels::dot dot_kernel; - // NDRange configuration for the dot kernel size_t dot_num_groups; size_t dot_wgsize; From e8faf6843d498daff1d37cba235decc2a9fa8366 Mon Sep 17 00:00:00 2001 From: Tom Deakin Date: Tue, 12 Jan 2021 11:01:11 +0000 Subject: [PATCH 02/26] Remove old comment --- SYCLStream.cpp | 1 - 1 file changed, 1 deletion(-) diff --git a/SYCLStream.cpp b/SYCLStream.cpp index dfaf13c..8f47304 100644 --- a/SYCLStream.cpp +++ b/SYCLStream.cpp @@ -292,6 +292,5 @@ std::string getDeviceDriver(const int device) return driver; } -// TODO: Fix kernel names to allow multiple template specializations template class SYCLStream; template class SYCLStream; From 501c61cfbde1fc30d21c5fd05f666860b4fe5485 Mon Sep 17 00:00:00 2001 From: Tom Deakin Date: Tue, 12 Jan 2021 11:14:43 +0000 Subject: [PATCH 03/26] [SYCL 2020] update namespace from cl::sycl to sycl:: Also remove the use namespace to make it clear what comes from SYCL --- SYCLStream.cpp | 101 ++++++++++++++++++++++++------------------------- SYCLStream.h | 10 ++--- 2 files changed, 55 insertions(+), 56 deletions(-) diff --git a/SYCLStream.cpp b/SYCLStream.cpp index 8f47304..984d8d2 100644 --- a/SYCLStream.cpp +++ b/SYCLStream.cpp @@ -9,11 +9,9 @@ #include -using namespace cl::sycl; - // Cache list of devices bool cached = false; -std::vector devices; +std::vector devices; void getDeviceList(void); template @@ -26,18 +24,19 @@ SYCLStream::SYCLStream(const int ARRAY_SIZE, const int device_index) if (device_index >= devices.size()) throw std::runtime_error("Invalid device index"); - device dev = devices[device_index]; + + sycl::device dev = devices[device_index]; // Determine sensible dot kernel NDRange configuration if (dev.is_cpu()) { - dot_num_groups = dev.get_info(); - dot_wgsize = dev.get_info() * 2; + dot_num_groups = dev.get_info(); + dot_wgsize = dev.get_info() * 2; } else { - dot_num_groups = dev.get_info() * 4; - dot_wgsize = dev.get_info(); + dot_num_groups = dev.get_info() * 4; + dot_wgsize = dev.get_info(); } // Print out device information @@ -45,7 +44,7 @@ SYCLStream::SYCLStream(const int ARRAY_SIZE, const int device_index) std::cout << "Driver: " << getDeviceDriver(device_index) << std::endl; std::cout << "Reduction kernel config: " << dot_num_groups << " groups of size " << dot_wgsize << std::endl; - queue = new cl::sycl::queue(dev, cl::sycl::async_handler{[&](cl::sycl::exception_list l) + queue = new sycl::queue(dev, sycl::async_handler{[&](sycl::exception_list l) { bool error = false; for(auto e: l) @@ -54,7 +53,7 @@ SYCLStream::SYCLStream(const int ARRAY_SIZE, const int device_index) { std::rethrow_exception(e); } - catch (cl::sycl::exception e) + catch (sycl::exception e) { std::cout << e.what(); error = true; @@ -67,10 +66,10 @@ SYCLStream::SYCLStream(const int ARRAY_SIZE, const int device_index) }}); // Create buffers - d_a = new buffer(array_size); - d_b = new buffer(array_size); - d_c = new buffer(array_size); - d_sum = new buffer(dot_num_groups); + d_a = new sycl::buffer(array_size); + d_b = new sycl::buffer(array_size); + d_c = new sycl::buffer(array_size); + d_sum = new sycl::buffer(dot_num_groups); } template @@ -87,11 +86,11 @@ SYCLStream::~SYCLStream() template void SYCLStream::copy() { - queue->submit([&](handler &cgh) + queue->submit([&](sycl::handler &cgh) { - auto ka = d_a->template get_access(cgh); - auto kc = d_c->template get_access(cgh); - cgh.parallel_for(range<1>{array_size}, [=](id<1> idx) + auto ka = d_a->template get_access(cgh); + auto kc = d_c->template get_access(cgh); + cgh.parallel_for(sycl::range<1>{array_size}, [=](sycl::id<1> idx) { kc[idx] = ka[idx]; }); @@ -103,11 +102,11 @@ template void SYCLStream::mul() { const T scalar = startScalar; - queue->submit([&](handler &cgh) + queue->submit([&](sycl::handler &cgh) { - auto kb = d_b->template get_access(cgh); - auto kc = d_c->template get_access(cgh); - cgh.parallel_for(range<1>{array_size}, [=](id<1> idx) + auto kb = d_b->template get_access(cgh); + auto kc = d_c->template get_access(cgh); + cgh.parallel_for(sycl::range<1>{array_size}, [=](sycl::id<1> idx) { kb[idx] = scalar * kc[idx]; }); @@ -118,12 +117,12 @@ void SYCLStream::mul() template void SYCLStream::add() { - queue->submit([&](handler &cgh) + queue->submit([&](sycl::handler &cgh) { - auto ka = d_a->template get_access(cgh); - auto kb = d_b->template get_access(cgh); - auto kc = d_c->template get_access(cgh); - cgh.parallel_for(range<1>{array_size}, [=](id<1> idx) + auto ka = d_a->template get_access(cgh); + auto kb = d_b->template get_access(cgh); + auto kc = d_c->template get_access(cgh); + cgh.parallel_for(sycl::range<1>{array_size}, [=](sycl::id<1> idx) { kc[idx] = ka[idx] + kb[idx]; }); @@ -135,12 +134,12 @@ template void SYCLStream::triad() { const T scalar = startScalar; - queue->submit([&](handler &cgh) + queue->submit([&](sycl::handler &cgh) { - auto ka = d_a->template get_access(cgh); - auto kb = d_b->template get_access(cgh); - auto kc = d_c->template get_access(cgh); - cgh.parallel_for(range<1>{array_size}, [=](id<1> idx) + auto ka = d_a->template get_access(cgh); + auto kb = d_b->template get_access(cgh); + auto kc = d_c->template get_access(cgh); + cgh.parallel_for(sycl::range<1>{array_size}, [=](sycl::id<1> idx) { ka[idx] = kb[idx] + scalar * kc[idx]; }); @@ -151,16 +150,16 @@ void SYCLStream::triad() template T SYCLStream::dot() { - queue->submit([&](handler &cgh) + queue->submit([&](sycl::handler &cgh) { - auto ka = d_a->template get_access(cgh); - auto kb = d_b->template get_access(cgh); - auto ksum = d_sum->template get_access(cgh); + auto ka = d_a->template get_access(cgh); + auto kb = d_b->template get_access(cgh); + auto ksum = d_sum->template get_access(cgh); - auto wg_sum = accessor(range<1>(dot_wgsize), cgh); + auto wg_sum = sycl::accessor(sycl::range<1>(dot_wgsize), cgh); size_t N = array_size; - cgh.parallel_for(nd_range<1>(dot_num_groups*dot_wgsize, dot_wgsize), [=](nd_item<1> item) + cgh.parallel_for(sycl::nd_range<1>(dot_num_groups*dot_wgsize, dot_wgsize), [=](sycl::nd_item<1> item) { size_t i = item.get_global_id(0); size_t li = item.get_local_id(0); @@ -173,7 +172,7 @@ T SYCLStream::dot() size_t local_size = item.get_local_range()[0]; for (int offset = local_size / 2; offset > 0; offset /= 2) { - item.barrier(cl::sycl::access::fence_space::local_space); + item.barrier(sycl::access::fence_space::local_space); if (li < offset) wg_sum[li] += wg_sum[li + offset]; } @@ -184,7 +183,7 @@ T SYCLStream::dot() }); T sum = 0.0; - auto h_sum = d_sum->template get_access(); + auto h_sum = d_sum->template get_access(); for (int i = 0; i < dot_num_groups; i++) { sum += h_sum[i]; @@ -196,12 +195,12 @@ T SYCLStream::dot() template void SYCLStream::init_arrays(T initA, T initB, T initC) { - queue->submit([&](handler &cgh) + queue->submit([&](sycl::handler &cgh) { - auto ka = d_a->template get_access(cgh); - auto kb = d_b->template get_access(cgh); - auto kc = d_c->template get_access(cgh); - cgh.parallel_for(range<1>{array_size}, [=](item<1> item) + auto ka = d_a->template get_access(cgh); + auto kb = d_b->template get_access(cgh); + auto kc = d_c->template get_access(cgh); + cgh.parallel_for(sycl::range<1>{array_size}, [=](sycl::item<1> item) { auto id = item.get_id(0); ka[id] = initA; @@ -215,9 +214,9 @@ void SYCLStream::init_arrays(T initA, T initB, T initC) template void SYCLStream::read_arrays(std::vector& a, std::vector& b, std::vector& c) { - auto _a = d_a->template get_access(); - auto _b = d_b->template get_access(); - auto _c = d_c->template get_access(); + auto _a = d_a->template get_access(); + auto _b = d_b->template get_access(); + auto _c = d_c->template get_access(); for (int i = 0; i < array_size; i++) { a[i] = _a[i]; @@ -229,7 +228,7 @@ void SYCLStream::read_arrays(std::vector& a, std::vector& b, std::vecto void getDeviceList(void) { // Ask SYCL runtime for all devices in system - devices = cl::sycl::device::get_devices(); + devices = sycl::device::get_devices(); cached = true; } @@ -263,7 +262,7 @@ std::string getDeviceName(const int device) if (device < devices.size()) { - name = devices[device].get_info(); + name = devices[device].get_info(); } else { @@ -282,7 +281,7 @@ std::string getDeviceDriver(const int device) if (device < devices.size()) { - driver = devices[device].get_info(); + driver = devices[device].get_info(); } else { diff --git a/SYCLStream.h b/SYCLStream.h index cb1a45a..7b79ad2 100644 --- a/SYCLStream.h +++ b/SYCLStream.h @@ -23,11 +23,11 @@ class SYCLStream : public Stream int array_size; // SYCL objects - cl::sycl::queue *queue; - cl::sycl::buffer *d_a; - cl::sycl::buffer *d_b; - cl::sycl::buffer *d_c; - cl::sycl::buffer *d_sum; + sycl::queue *queue; + sycl::buffer *d_a; + sycl::buffer *d_b; + sycl::buffer *d_c; + sycl::buffer *d_sum; // NDRange configuration for the dot kernel size_t dot_num_groups; From 8f5357011a0b64f4ffb435466d67743acdd7016e Mon Sep 17 00:00:00 2001 From: Tom Deakin Date: Tue, 12 Jan 2021 11:16:46 +0000 Subject: [PATCH 04/26] [SYCL 2020] Use sycl::id for init kernel --- SYCLStream.cpp | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/SYCLStream.cpp b/SYCLStream.cpp index 984d8d2..5b9d78d 100644 --- a/SYCLStream.cpp +++ b/SYCLStream.cpp @@ -200,12 +200,11 @@ void SYCLStream::init_arrays(T initA, T initB, T initC) auto ka = d_a->template get_access(cgh); auto kb = d_b->template get_access(cgh); auto kc = d_c->template get_access(cgh); - cgh.parallel_for(sycl::range<1>{array_size}, [=](sycl::item<1> item) + cgh.parallel_for(sycl::range<1>{array_size}, [=](sycl::id<1> idx) { - auto id = item.get_id(0); - ka[id] = initA; - kb[id] = initB; - kc[id] = initC; + ka[idx] = initA; + kb[idx] = initB; + kc[idx] = initC; }); }); queue->wait(); From 282fb1e5e35092ae4f9c0b37c2f701730ad92f1b Mon Sep 17 00:00:00 2001 From: Tom Deakin Date: Tue, 12 Jan 2021 11:54:39 +0000 Subject: [PATCH 05/26] [SYCL 2020] Use accessor constructurs using CTAD and Tags instead of get_access --- SYCLStream.cpp | 34 ++++++++++++++++++---------------- 1 file changed, 18 insertions(+), 16 deletions(-) diff --git a/SYCLStream.cpp b/SYCLStream.cpp index 5b9d78d..e0fce5d 100644 --- a/SYCLStream.cpp +++ b/SYCLStream.cpp @@ -88,8 +88,8 @@ void SYCLStream::copy() { queue->submit([&](sycl::handler &cgh) { - auto ka = d_a->template get_access(cgh); - auto kc = d_c->template get_access(cgh); + sycl::accessor ka {*d_a, cgh, sycl::read_only}; + sycl::accessor kc {*d_c, cgh, sycl::write_only}; cgh.parallel_for(sycl::range<1>{array_size}, [=](sycl::id<1> idx) { kc[idx] = ka[idx]; @@ -104,8 +104,8 @@ void SYCLStream::mul() const T scalar = startScalar; queue->submit([&](sycl::handler &cgh) { - auto kb = d_b->template get_access(cgh); - auto kc = d_c->template get_access(cgh); + sycl::accessor kb {*d_b, cgh, sycl::write_only}; + sycl::accessor kc {*d_c, cgh, sycl::read_only}; cgh.parallel_for(sycl::range<1>{array_size}, [=](sycl::id<1> idx) { kb[idx] = scalar * kc[idx]; @@ -119,9 +119,9 @@ void SYCLStream::add() { queue->submit([&](sycl::handler &cgh) { - auto ka = d_a->template get_access(cgh); - auto kb = d_b->template get_access(cgh); - auto kc = d_c->template get_access(cgh); + sycl::accessor ka {*d_a, cgh, sycl::read_only}; + sycl::accessor kb {*d_b, cgh, sycl::read_only}; + sycl::accessor kc {*d_c, cgh, sycl::write_only}; cgh.parallel_for(sycl::range<1>{array_size}, [=](sycl::id<1> idx) { kc[idx] = ka[idx] + kb[idx]; @@ -136,9 +136,9 @@ void SYCLStream::triad() const T scalar = startScalar; queue->submit([&](sycl::handler &cgh) { - auto ka = d_a->template get_access(cgh); - auto kb = d_b->template get_access(cgh); - auto kc = d_c->template get_access(cgh); + sycl::accessor ka {*d_a, cgh, sycl::write_only}; + sycl::accessor kb {*d_b, cgh, sycl::read_only}; + sycl::accessor kc {*d_c, cgh, sycl::read_only}; cgh.parallel_for(sycl::range<1>{array_size}, [=](sycl::id<1> idx) { ka[idx] = kb[idx] + scalar * kc[idx]; @@ -152,10 +152,11 @@ T SYCLStream::dot() { queue->submit([&](sycl::handler &cgh) { - auto ka = d_a->template get_access(cgh); - auto kb = d_b->template get_access(cgh); - auto ksum = d_sum->template get_access(cgh); + sycl::accessor ka {*d_a, cgh, sycl::read_only}; + sycl::accessor kb {*d_b, cgh, sycl::read_only}; + sycl::accessor ksum {*d_sum, cgh, sycl::write_only}; + //sycl::local_accessor wg_sum {sycl::range<1>(dot_wgsize), cgh}; auto wg_sum = sycl::accessor(sycl::range<1>(dot_wgsize), cgh); size_t N = array_size; @@ -197,9 +198,10 @@ void SYCLStream::init_arrays(T initA, T initB, T initC) { queue->submit([&](sycl::handler &cgh) { - auto ka = d_a->template get_access(cgh); - auto kb = d_b->template get_access(cgh); - auto kc = d_c->template get_access(cgh); + // TODO: could add the sycl::no_init property + sycl::accessor ka {*d_a, cgh, sycl::write_only}; + sycl::accessor kb {*d_b, cgh, sycl::write_only}; + sycl::accessor kc {*d_c, cgh, sycl::write_only}; cgh.parallel_for(sycl::range<1>{array_size}, [=](sycl::id<1> idx) { ka[idx] = initA; From b611db8cabe7bc08528df77baace8948825e28d5 Mon Sep 17 00:00:00 2001 From: Tom Deakin Date: Tue, 12 Jan 2021 11:58:14 +0000 Subject: [PATCH 06/26] [SYCL 2020] Use host accessor constructors --- SYCLStream.cpp | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/SYCLStream.cpp b/SYCLStream.cpp index e0fce5d..049c26e 100644 --- a/SYCLStream.cpp +++ b/SYCLStream.cpp @@ -215,9 +215,9 @@ void SYCLStream::init_arrays(T initA, T initB, T initC) template void SYCLStream::read_arrays(std::vector& a, std::vector& b, std::vector& c) { - auto _a = d_a->template get_access(); - auto _b = d_b->template get_access(); - auto _c = d_c->template get_access(); + sycl::host_accessor _a {*d_a, sycl::read_only}; + sycl::host_accessor _b {*d_b, sycl::read_only}; + sycl::host_accessor _c {*d_c, sycl::read_only}; for (int i = 0; i < array_size; i++) { a[i] = _a[i]; From aa0ab6a8e3a1814d7838de58aa6617a12a5fc3aa Mon Sep 17 00:00:00 2001 From: Tom Deakin Date: Thu, 21 Jan 2021 10:37:11 +0000 Subject: [PATCH 07/26] use new header path --- SYCLStream.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/SYCLStream.h b/SYCLStream.h index 7b79ad2..124ab28 100644 --- a/SYCLStream.h +++ b/SYCLStream.h @@ -11,7 +11,7 @@ #include "Stream.h" -#include "CL/sycl.hpp" +#include #define IMPLEMENTATION_STRING "SYCL" From 42c89547896b403d5b62046bbe0cdccdea883934 Mon Sep 17 00:00:00 2001 From: Tom Deakin Date: Thu, 21 Jan 2021 10:37:56 +0000 Subject: [PATCH 08/26] [SYCL 2020] use new reduction for dot kernel --- SYCLStream.cpp | 43 ++++++++++++------------------------------- 1 file changed, 12 insertions(+), 31 deletions(-) diff --git a/SYCLStream.cpp b/SYCLStream.cpp index 049c26e..109b883 100644 --- a/SYCLStream.cpp +++ b/SYCLStream.cpp @@ -69,7 +69,7 @@ SYCLStream::SYCLStream(const int ARRAY_SIZE, const int device_index) d_a = new sycl::buffer(array_size); d_b = new sycl::buffer(array_size); d_c = new sycl::buffer(array_size); - d_sum = new sycl::buffer(dot_num_groups); + d_sum = new sycl::buffer(1); } template @@ -150,47 +150,28 @@ void SYCLStream::triad() template T SYCLStream::dot() { + queue->submit([&](sycl::handler &cgh) { sycl::accessor ka {*d_a, cgh, sycl::read_only}; sycl::accessor kb {*d_b, cgh, sycl::read_only}; - sycl::accessor ksum {*d_sum, cgh, sycl::write_only}; - //sycl::local_accessor wg_sum {sycl::range<1>(dot_wgsize), cgh}; - auto wg_sum = sycl::accessor(sycl::range<1>(dot_wgsize), cgh); + // Reduction object, to perform summation + // Initialises the result to zero + auto sumReducer = sycl::reduction(*d_sum, cgh, std::plus(), sycl::property::reduction::initialize_to_identity); - size_t N = array_size; - cgh.parallel_for(sycl::nd_range<1>(dot_num_groups*dot_wgsize, dot_wgsize), [=](sycl::nd_item<1> item) + cgh.parallel_for(sycl::range<1>{array_size}, sumReducer, [=](sycl::id<1> idx, auto& sum) { - size_t i = item.get_global_id(0); - size_t li = item.get_local_id(0); - size_t global_size = item.get_global_range()[0]; - - wg_sum[li] = 0.0; - for (; i < N; i += global_size) - wg_sum[li] += ka[i] * kb[i]; - - size_t local_size = item.get_local_range()[0]; - for (int offset = local_size / 2; offset > 0; offset /= 2) - { - item.barrier(sycl::access::fence_space::local_space); - if (li < offset) - wg_sum[li] += wg_sum[li + offset]; - } - - if (li == 0) - ksum[item.get_group(0)] = wg_sum[0]; + sum += ka[idx] * kb[idx]; }); + }); - T sum = 0.0; - auto h_sum = d_sum->template get_access(); - for (int i = 0; i < dot_num_groups; i++) - { - sum += h_sum[i]; - } + // Get access on the host, and return a copy of the data (single number) + // This will block until the result is available, so no need to wait on the queue. + sycl::host_accessor result {*d_sum, sycl::read_only}; + return result[0]; - return sum; } template From 4726f3f0f1bf2f4ab8a17032e1431fff36e6c9de Mon Sep 17 00:00:00 2001 From: Tom Deakin Date: Thu, 21 Jan 2021 10:39:13 +0000 Subject: [PATCH 09/26] [SYCL 2020] Specify no_init property when initalising buffers --- SYCLStream.cpp | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/SYCLStream.cpp b/SYCLStream.cpp index 109b883..1c182cb 100644 --- a/SYCLStream.cpp +++ b/SYCLStream.cpp @@ -179,10 +179,10 @@ void SYCLStream::init_arrays(T initA, T initB, T initC) { queue->submit([&](sycl::handler &cgh) { - // TODO: could add the sycl::no_init property - sycl::accessor ka {*d_a, cgh, sycl::write_only}; - sycl::accessor kb {*d_b, cgh, sycl::write_only}; - sycl::accessor kc {*d_c, cgh, sycl::write_only}; + sycl::accessor ka {*d_a, cgh, sycl::write_only, sycl::no_init}; + sycl::accessor kb {*d_b, cgh, sycl::write_only, sycl::no_init}; + sycl::accessor kc {*d_c, cgh, sycl::write_only, sycl::no_init}; + cgh.parallel_for(sycl::range<1>{array_size}, [=](sycl::id<1> idx) { ka[idx] = initA; @@ -190,6 +190,7 @@ void SYCLStream::init_arrays(T initA, T initB, T initC) kc[idx] = initC; }); }); + queue->wait(); } From b825df00746a13854cfdfd7265cb797fb57395b2 Mon Sep 17 00:00:00 2001 From: Tom Deakin Date: Thu, 21 Jan 2021 18:18:35 +0000 Subject: [PATCH 10/26] [SYCL 2020] Declare reduction inline to reduce one variable name --- SYCLStream.cpp | 15 +++++++-------- 1 file changed, 7 insertions(+), 8 deletions(-) diff --git a/SYCLStream.cpp b/SYCLStream.cpp index 1c182cb..d879c4e 100644 --- a/SYCLStream.cpp +++ b/SYCLStream.cpp @@ -156,14 +156,13 @@ T SYCLStream::dot() sycl::accessor ka {*d_a, cgh, sycl::read_only}; sycl::accessor kb {*d_b, cgh, sycl::read_only}; - // Reduction object, to perform summation - // Initialises the result to zero - auto sumReducer = sycl::reduction(*d_sum, cgh, std::plus(), sycl::property::reduction::initialize_to_identity); - - cgh.parallel_for(sycl::range<1>{array_size}, sumReducer, [=](sycl::id<1> idx, auto& sum) - { - sum += ka[idx] * kb[idx]; - }); + cgh.parallel_for(sycl::range<1>{array_size}, + // Reduction object, to perform summation - initialises the result to zero + sycl::reduction(*d_sum, cgh, std::plus(), sycl::property::reduction::initialize_to_identity); + [=](sycl::id<1> idx, auto& sum) + { + sum += ka[idx] * kb[idx]; + }); }); From 1517101ceb08b13f85b731c26262acc1ee742bae Mon Sep 17 00:00:00 2001 From: Tom Deakin Date: Mon, 8 Feb 2021 11:07:38 +0000 Subject: [PATCH 11/26] [SYCL 2020] Remove work-group heuristic for reduction as unused --- SYCLStream.cpp | 13 ------------- SYCLStream.h | 4 ---- 2 files changed, 17 deletions(-) diff --git a/SYCLStream.cpp b/SYCLStream.cpp index d879c4e..8a1010e 100644 --- a/SYCLStream.cpp +++ b/SYCLStream.cpp @@ -27,22 +27,9 @@ SYCLStream::SYCLStream(const int ARRAY_SIZE, const int device_index) sycl::device dev = devices[device_index]; - // Determine sensible dot kernel NDRange configuration - if (dev.is_cpu()) - { - dot_num_groups = dev.get_info(); - dot_wgsize = dev.get_info() * 2; - } - else - { - dot_num_groups = dev.get_info() * 4; - dot_wgsize = dev.get_info(); - } - // Print out device information std::cout << "Using SYCL device " << getDeviceName(device_index) << std::endl; std::cout << "Driver: " << getDeviceDriver(device_index) << std::endl; - std::cout << "Reduction kernel config: " << dot_num_groups << " groups of size " << dot_wgsize << std::endl; queue = new sycl::queue(dev, sycl::async_handler{[&](sycl::exception_list l) { diff --git a/SYCLStream.h b/SYCLStream.h index 124ab28..3ebea6c 100644 --- a/SYCLStream.h +++ b/SYCLStream.h @@ -29,10 +29,6 @@ class SYCLStream : public Stream sycl::buffer *d_c; sycl::buffer *d_sum; - // NDRange configuration for the dot kernel - size_t dot_num_groups; - size_t dot_wgsize; - public: SYCLStream(const int, const int); From 1db9a6b64848e114794c31b0c5bede911f99be7c Mon Sep 17 00:00:00 2001 From: Tom Deakin Date: Mon, 8 Feb 2021 11:33:03 +0000 Subject: [PATCH 12/26] [SYCL 2020] Use smart pointers instead of raw pointers --- SYCLStream.cpp | 24 ++++++++++-------------- SYCLStream.h | 13 +++++++------ 2 files changed, 17 insertions(+), 20 deletions(-) diff --git a/SYCLStream.cpp b/SYCLStream.cpp index 8a1010e..0f65560 100644 --- a/SYCLStream.cpp +++ b/SYCLStream.cpp @@ -51,24 +51,20 @@ SYCLStream::SYCLStream(const int ARRAY_SIZE, const int device_index) throw std::runtime_error("SYCL errors detected"); } }}); + + // No longer need list of devices + devices.clear(); + cached = true; // Create buffers - d_a = new sycl::buffer(array_size); - d_b = new sycl::buffer(array_size); - d_c = new sycl::buffer(array_size); - d_sum = new sycl::buffer(1); + // Only in the constructor at runtime do we know the size, so need to use (smart) pointers + d_a = std::make_unique>(array_size); + d_b = std::make_unique>(array_size); + d_c = std::make_unique>(array_size); + d_sum = std::make_unique>(1); + } -template -SYCLStream::~SYCLStream() -{ - delete d_a; - delete d_b; - delete d_c; - delete d_sum; - delete queue; - devices.clear(); -} template void SYCLStream::copy() diff --git a/SYCLStream.h b/SYCLStream.h index 3ebea6c..bfe9ae5 100644 --- a/SYCLStream.h +++ b/SYCLStream.h @@ -8,6 +8,7 @@ #pragma once #include +#include #include "Stream.h" @@ -23,16 +24,16 @@ class SYCLStream : public Stream int array_size; // SYCL objects - sycl::queue *queue; - sycl::buffer *d_a; - sycl::buffer *d_b; - sycl::buffer *d_c; - sycl::buffer *d_sum; + std::unique_ptr queue; + std::unique_ptr> d_a; + std::unique_ptr> d_b; + std::unique_ptr> d_c; + std::unique_ptr> d_sum; public: SYCLStream(const int, const int); - ~SYCLStream(); + ~SYCLStream() = default; virtual void copy() override; virtual void add() override; From 1336400311a7633080d3973c79151686c614937a Mon Sep 17 00:00:00 2001 From: Tom Deakin Date: Mon, 8 Feb 2021 11:34:37 +0000 Subject: [PATCH 13/26] [SYCL 2020] Use unique pointer for queue constructor --- SYCLStream.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/SYCLStream.cpp b/SYCLStream.cpp index 0f65560..7765352 100644 --- a/SYCLStream.cpp +++ b/SYCLStream.cpp @@ -31,7 +31,7 @@ SYCLStream::SYCLStream(const int ARRAY_SIZE, const int device_index) std::cout << "Using SYCL device " << getDeviceName(device_index) << std::endl; std::cout << "Driver: " << getDeviceDriver(device_index) << std::endl; - queue = new sycl::queue(dev, sycl::async_handler{[&](sycl::exception_list l) + queue = std::make_unique(dev, sycl::async_handler{[&](sycl::exception_list l) { bool error = false; for(auto e: l) From ae8bd6081be7d8c116e8f345e7f833bd0317b22c Mon Sep 17 00:00:00 2001 From: Tom Deakin Date: Mon, 8 Feb 2021 13:43:35 +0000 Subject: [PATCH 14/26] [SYCL 2020] Use constructor initaliser list to allocate buffers - no need to use pointers --- SYCLStream.cpp | 55 ++++++++++++++++++++++++-------------------------- SYCLStream.h | 11 ++++++---- 2 files changed, 33 insertions(+), 33 deletions(-) diff --git a/SYCLStream.cpp b/SYCLStream.cpp index 7765352..d766996 100644 --- a/SYCLStream.cpp +++ b/SYCLStream.cpp @@ -16,12 +16,15 @@ void getDeviceList(void); template SYCLStream::SYCLStream(const int ARRAY_SIZE, const int device_index) +: array_size {ARRAY_SIZE}, + d_a {ARRAY_SIZE}, + d_b {ARRAY_SIZE}, + d_c {ARRAY_SIZE}, + d_sum {1} { if (!cached) getDeviceList(); - array_size = ARRAY_SIZE; - if (device_index >= devices.size()) throw std::runtime_error("Invalid device index"); @@ -55,13 +58,7 @@ SYCLStream::SYCLStream(const int ARRAY_SIZE, const int device_index) // No longer need list of devices devices.clear(); cached = true; - - // Create buffers - // Only in the constructor at runtime do we know the size, so need to use (smart) pointers - d_a = std::make_unique>(array_size); - d_b = std::make_unique>(array_size); - d_c = std::make_unique>(array_size); - d_sum = std::make_unique>(1); + } @@ -71,8 +68,8 @@ void SYCLStream::copy() { queue->submit([&](sycl::handler &cgh) { - sycl::accessor ka {*d_a, cgh, sycl::read_only}; - sycl::accessor kc {*d_c, cgh, sycl::write_only}; + sycl::accessor ka {d_a, cgh, sycl::read_only}; + sycl::accessor kc {d_c, cgh, sycl::write_only}; cgh.parallel_for(sycl::range<1>{array_size}, [=](sycl::id<1> idx) { kc[idx] = ka[idx]; @@ -87,8 +84,8 @@ void SYCLStream::mul() const T scalar = startScalar; queue->submit([&](sycl::handler &cgh) { - sycl::accessor kb {*d_b, cgh, sycl::write_only}; - sycl::accessor kc {*d_c, cgh, sycl::read_only}; + sycl::accessor kb {d_b, cgh, sycl::write_only}; + sycl::accessor kc {d_c, cgh, sycl::read_only}; cgh.parallel_for(sycl::range<1>{array_size}, [=](sycl::id<1> idx) { kb[idx] = scalar * kc[idx]; @@ -102,9 +99,9 @@ void SYCLStream::add() { queue->submit([&](sycl::handler &cgh) { - sycl::accessor ka {*d_a, cgh, sycl::read_only}; - sycl::accessor kb {*d_b, cgh, sycl::read_only}; - sycl::accessor kc {*d_c, cgh, sycl::write_only}; + sycl::accessor ka {d_a, cgh, sycl::read_only}; + sycl::accessor kb {d_b, cgh, sycl::read_only}; + sycl::accessor kc {d_c, cgh, sycl::write_only}; cgh.parallel_for(sycl::range<1>{array_size}, [=](sycl::id<1> idx) { kc[idx] = ka[idx] + kb[idx]; @@ -119,9 +116,9 @@ void SYCLStream::triad() const T scalar = startScalar; queue->submit([&](sycl::handler &cgh) { - sycl::accessor ka {*d_a, cgh, sycl::write_only}; - sycl::accessor kb {*d_b, cgh, sycl::read_only}; - sycl::accessor kc {*d_c, cgh, sycl::read_only}; + sycl::accessor ka {d_a, cgh, sycl::write_only}; + sycl::accessor kb {d_b, cgh, sycl::read_only}; + sycl::accessor kc {d_c, cgh, sycl::read_only}; cgh.parallel_for(sycl::range<1>{array_size}, [=](sycl::id<1> idx) { ka[idx] = kb[idx] + scalar * kc[idx]; @@ -136,12 +133,12 @@ T SYCLStream::dot() queue->submit([&](sycl::handler &cgh) { - sycl::accessor ka {*d_a, cgh, sycl::read_only}; - sycl::accessor kb {*d_b, cgh, sycl::read_only}; + sycl::accessor ka {d_a, cgh, sycl::read_only}; + sycl::accessor kb {d_b, cgh, sycl::read_only}; cgh.parallel_for(sycl::range<1>{array_size}, // Reduction object, to perform summation - initialises the result to zero - sycl::reduction(*d_sum, cgh, std::plus(), sycl::property::reduction::initialize_to_identity); + sycl::reduction(d_sum, cgh, std::plus(), sycl::property::reduction::initialize_to_identity); [=](sycl::id<1> idx, auto& sum) { sum += ka[idx] * kb[idx]; @@ -151,7 +148,7 @@ T SYCLStream::dot() // Get access on the host, and return a copy of the data (single number) // This will block until the result is available, so no need to wait on the queue. - sycl::host_accessor result {*d_sum, sycl::read_only}; + sycl::host_accessor result {d_sum, sycl::read_only}; return result[0]; } @@ -161,9 +158,9 @@ void SYCLStream::init_arrays(T initA, T initB, T initC) { queue->submit([&](sycl::handler &cgh) { - sycl::accessor ka {*d_a, cgh, sycl::write_only, sycl::no_init}; - sycl::accessor kb {*d_b, cgh, sycl::write_only, sycl::no_init}; - sycl::accessor kc {*d_c, cgh, sycl::write_only, sycl::no_init}; + sycl::accessor ka {d_a, cgh, sycl::write_only, sycl::no_init}; + sycl::accessor kb {d_b, cgh, sycl::write_only, sycl::no_init}; + sycl::accessor kc {d_c, cgh, sycl::write_only, sycl::no_init}; cgh.parallel_for(sycl::range<1>{array_size}, [=](sycl::id<1> idx) { @@ -179,9 +176,9 @@ void SYCLStream::init_arrays(T initA, T initB, T initC) template void SYCLStream::read_arrays(std::vector& a, std::vector& b, std::vector& c) { - sycl::host_accessor _a {*d_a, sycl::read_only}; - sycl::host_accessor _b {*d_b, sycl::read_only}; - sycl::host_accessor _c {*d_c, sycl::read_only}; + sycl::host_accessor _a {d_a, sycl::read_only}; + sycl::host_accessor _b {d_b, sycl::read_only}; + sycl::host_accessor _c {d_c, sycl::read_only}; for (int i = 0; i < array_size; i++) { a[i] = _a[i]; diff --git a/SYCLStream.h b/SYCLStream.h index bfe9ae5..0c73594 100644 --- a/SYCLStream.h +++ b/SYCLStream.h @@ -24,11 +24,14 @@ class SYCLStream : public Stream int array_size; // SYCL objects + // Queue is a pointer because we allow device selection std::unique_ptr queue; - std::unique_ptr> d_a; - std::unique_ptr> d_b; - std::unique_ptr> d_c; - std::unique_ptr> d_sum; + + // Buffers + sycl::buffer d_a; + sycl::buffer d_b; + sycl::buffer d_c; + sycl::buffer d_sum; public: From 707bc5d0bfcc4ee94b5600b1b4c36db8d0268d68 Mon Sep 17 00:00:00 2001 From: Tom Deakin Date: Tue, 9 Feb 2021 10:31:18 +0000 Subject: [PATCH 15/26] Update SYCL version to SYCL 2020 --- CHANGELOG.md | 1 + 1 file changed, 1 insertion(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index a1cfe5e..7d45f82 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -23,6 +23,7 @@ All notable changes to this project will be documented in this file. - Cray compiler OpenMP flags updated. - Clang compiler OpenMP flags corrected for NVIDIA target. - Reorder OpenCL objects in class so destructors are called in safe order. +- Update SYCL version to SYCL 2020. ### Removed - Pre-building of kernels in SYCL version to ensure compatibility with SYCL 1.2.1. From c9247cd27f489683151b351dfb9b5669ccd43fbf Mon Sep 17 00:00:00 2001 From: Tom Deakin Date: Wed, 17 Feb 2021 16:35:41 +0000 Subject: [PATCH 16/26] [SYCL-2020] fix semicolon typo --- SYCLStream.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/SYCLStream.cpp b/SYCLStream.cpp index d766996..028a3df 100644 --- a/SYCLStream.cpp +++ b/SYCLStream.cpp @@ -138,7 +138,7 @@ T SYCLStream::dot() cgh.parallel_for(sycl::range<1>{array_size}, // Reduction object, to perform summation - initialises the result to zero - sycl::reduction(d_sum, cgh, std::plus(), sycl::property::reduction::initialize_to_identity); + sycl::reduction(d_sum, cgh, std::plus(), sycl::property::reduction::initialize_to_identity), [=](sycl::id<1> idx, auto& sum) { sum += ka[idx] * kb[idx]; From 98d2140a88f1862b5ba3d93cf6a96a8b915b00b6 Mon Sep 17 00:00:00 2001 From: Tom Deakin Date: Wed, 17 Feb 2021 16:48:42 +0000 Subject: [PATCH 17/26] [SYCL 2020] Add note about glibc --- SYCL.make | 3 +++ 1 file changed, 3 insertions(+) diff --git a/SYCL.make b/SYCL.make index 4326da5..b251f2e 100644 --- a/SYCL.make +++ b/SYCL.make @@ -7,6 +7,9 @@ Available compilers are: For HIPSYCL and COMPUTECPP, SYCL_SDK_DIR must be specified, the directory should contain [/lib, /bin, ...] For DPCPP, the compiler must be on path + + You may need to use the following if running old glibc: + EXTRA_FLAGS="--gcc-toolchain=$(realpath "$(dirname "$(which gcc)")"/..)" endef $(info $(compiler_help)) COMPILER=HIPSYCL From 67da8f6a8eeb98f62bb7fe5641913b82d909d3d8 Mon Sep 17 00:00:00 2001 From: Tom Deakin Date: Mon, 22 Feb 2021 15:19:56 +0000 Subject: [PATCH 18/26] [SYCL 2020] Add nstream kernel --- SYCLStream.cpp | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/SYCLStream.cpp b/SYCLStream.cpp index 3eab481..f3dd55f 100644 --- a/SYCLStream.cpp +++ b/SYCLStream.cpp @@ -131,12 +131,13 @@ template void SYCLStream::nstream() { const T scalar = startScalar; - queue->submit([&](handler &cgh) + + queue->submit([&](sycl::handler &cgh) { - auto ka = d_a->template get_access(cgh); - auto kb = d_b->template get_access(cgh); - auto kc = d_c->template get_access(cgh); - cgh.parallel_for(range<1>{array_size}, [=](id<1> idx) + sycl::accessor ka {d_a, cgh}; + sycl::accessor kb {d_b, cgh, sycl::read_only}; + sycl::accessor kc {d_c, cgh, sycl::read_only}; + cgh.parallel_for(sycl::range<1>{array_size}, [=](sycl::id<1> idx) { ka[idx] += kb[idx] + scalar * kc[idx]; }); From 9f38177e1b8b5fdf49cc2cdd6eb0ef45e51d057c Mon Sep 17 00:00:00 2001 From: Tom Deakin Date: Mon, 17 May 2021 15:32:42 +0100 Subject: [PATCH 19/26] [SYCL 2020] Add check for FP64 support using device aspects. This will resolve #98 in the future SYCL 2020 version. --- SYCLStream.cpp | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/SYCLStream.cpp b/SYCLStream.cpp index b4fa514..9e23175 100644 --- a/SYCLStream.cpp +++ b/SYCLStream.cpp @@ -34,6 +34,15 @@ SYCLStream::SYCLStream(const int ARRAY_SIZE, const int device_index) std::cout << "Using SYCL device " << getDeviceName(device_index) << std::endl; std::cout << "Driver: " << getDeviceDriver(device_index) << std::endl; + // Check device can support FP64 if needed + if (sizeof(T) == sizeof(double)) + { + if (!dev.has(sycl::aspect::fp64)) + { + throw std::runtime_error("Device does not support double precision, please use --float"); + } + } + queue = std::make_unique(dev, sycl::async_handler{[&](sycl::exception_list l) { bool error = false; From 5d9e408a06b330f550b7359526d90e7b85127477 Mon Sep 17 00:00:00 2001 From: Tom Deakin Date: Fri, 4 Jun 2021 16:42:49 +0000 Subject: [PATCH 20/26] [SYCL 2020] Make array size a size_t --- SYCLStream.cpp | 2 +- SYCLStream.h | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/SYCLStream.cpp b/SYCLStream.cpp index 9e23175..76c0a8b 100644 --- a/SYCLStream.cpp +++ b/SYCLStream.cpp @@ -15,7 +15,7 @@ std::vector devices; void getDeviceList(void); template -SYCLStream::SYCLStream(const int ARRAY_SIZE, const int device_index) +SYCLStream::SYCLStream(const size_t ARRAY_SIZE, const int device_index) : array_size {ARRAY_SIZE}, d_a {ARRAY_SIZE}, d_b {ARRAY_SIZE}, diff --git a/SYCLStream.h b/SYCLStream.h index cd8e39a..e8b9134 100644 --- a/SYCLStream.h +++ b/SYCLStream.h @@ -21,7 +21,7 @@ class SYCLStream : public Stream { protected: // Size of arrays - int array_size; + size_t array_size; // SYCL objects // Queue is a pointer because we allow device selection @@ -35,7 +35,7 @@ class SYCLStream : public Stream public: - SYCLStream(const int, const int); + SYCLStream(const size_t, const int); ~SYCLStream() = default; virtual void copy() override; From edcc3e79cda649e2e3268268fbaf8d3fa9961566 Mon Sep 17 00:00:00 2001 From: Tom Deakin Date: Thu, 9 Dec 2021 11:25:26 +0000 Subject: [PATCH 21/26] fix sycl 2020 after merge from main --- src/sycl/SYCLStream.cpp | 22 ---------------------- src/sycl/SYCLStream.h | 6 +----- 2 files changed, 1 insertion(+), 27 deletions(-) diff --git a/src/sycl/SYCLStream.cpp b/src/sycl/SYCLStream.cpp index 3f44537..76c0a8b 100644 --- a/src/sycl/SYCLStream.cpp +++ b/src/sycl/SYCLStream.cpp @@ -28,29 +28,7 @@ SYCLStream::SYCLStream(const size_t ARRAY_SIZE, const int device_index) if (device_index >= devices.size()) throw std::runtime_error("Invalid device index"); -<<<<<<< HEAD:SYCLStream.cpp sycl::device dev = devices[device_index]; -======= - // Check device can support FP64 if needed - if (sizeof(T) == sizeof(double)) - { - if (dev.get_info().size() == 0) { - throw std::runtime_error("Device does not support double precision, please use --float"); - } - } - - // Determine sensible dot kernel NDRange configuration - if (dev.is_cpu()) - { - dot_num_groups = dev.get_info(); - dot_wgsize = dev.get_info() * 2; - } - else - { - dot_num_groups = dev.get_info() * 4; - dot_wgsize = dev.get_info(); - } ->>>>>>> main:src/sycl/SYCLStream.cpp // Print out device information std::cout << "Using SYCL device " << getDeviceName(device_index) << std::endl; diff --git a/src/sycl/SYCLStream.h b/src/sycl/SYCLStream.h index 2ed95f6..7481d16 100644 --- a/src/sycl/SYCLStream.h +++ b/src/sycl/SYCLStream.h @@ -11,14 +11,10 @@ #include #include "Stream.h" -<<<<<<< HEAD:SYCLStream.h #include -======= -#include "CL/sycl.hpp" ->>>>>>> main:src/sycl/SYCLStream.h -#define IMPLEMENTATION_STRING "SYCL" +#define IMPLEMENTATION_STRING "SYCL 2020" template class SYCLStream : public Stream From e077d149dcabc8ea29012b8190ab8b91173d1da1 Mon Sep 17 00:00:00 2001 From: Tom Deakin Date: Thu, 9 Dec 2021 11:26:17 +0000 Subject: [PATCH 22/26] Retain 1.2.1 and 2020 versions of SYCL --- src/sycl/SYCLStream.cpp | 192 ++++++++++++++---------- src/sycl/SYCLStream.h | 43 ++++-- src/sycl/SYCLStream2020.cpp | 284 ++++++++++++++++++++++++++++++++++++ src/sycl/SYCLStream2020.h | 54 +++++++ 4 files changed, 484 insertions(+), 89 deletions(-) create mode 100644 src/sycl/SYCLStream2020.cpp create mode 100644 src/sycl/SYCLStream2020.h diff --git a/src/sycl/SYCLStream.cpp b/src/sycl/SYCLStream.cpp index 76c0a8b..00c043f 100644 --- a/src/sycl/SYCLStream.cpp +++ b/src/sycl/SYCLStream.cpp @@ -9,41 +9,51 @@ #include +using namespace cl::sycl; + // Cache list of devices bool cached = false; -std::vector devices; +std::vector devices; void getDeviceList(void); template -SYCLStream::SYCLStream(const size_t ARRAY_SIZE, const int device_index) -: array_size {ARRAY_SIZE}, - d_a {ARRAY_SIZE}, - d_b {ARRAY_SIZE}, - d_c {ARRAY_SIZE}, - d_sum {1} +SYCLStream::SYCLStream(const int ARRAY_SIZE, const int device_index) { if (!cached) getDeviceList(); + array_size = ARRAY_SIZE; + if (device_index >= devices.size()) throw std::runtime_error("Invalid device index"); - - sycl::device dev = devices[device_index]; - - // Print out device information - std::cout << "Using SYCL device " << getDeviceName(device_index) << std::endl; - std::cout << "Driver: " << getDeviceDriver(device_index) << std::endl; + device dev = devices[device_index]; // Check device can support FP64 if needed if (sizeof(T) == sizeof(double)) { - if (!dev.has(sycl::aspect::fp64)) - { + if (dev.get_info().size() == 0) { throw std::runtime_error("Device does not support double precision, please use --float"); } } - queue = std::make_unique(dev, sycl::async_handler{[&](sycl::exception_list l) + // Determine sensible dot kernel NDRange configuration + if (dev.is_cpu()) + { + dot_num_groups = dev.get_info(); + dot_wgsize = dev.get_info() * 2; + } + else + { + dot_num_groups = dev.get_info() * 4; + dot_wgsize = dev.get_info(); + } + + // Print out device information + std::cout << "Using SYCL device " << getDeviceName(device_index) << std::endl; + std::cout << "Driver: " << getDeviceDriver(device_index) << std::endl; + std::cout << "Reduction kernel config: " << dot_num_groups << " groups of size " << dot_wgsize << std::endl; + + queue = new cl::sycl::queue(dev, cl::sycl::async_handler{[&](cl::sycl::exception_list l) { bool error = false; for(auto e: l) @@ -52,7 +62,7 @@ SYCLStream::SYCLStream(const size_t ARRAY_SIZE, const int device_index) { std::rethrow_exception(e); } - catch (sycl::exception e) + catch (cl::sycl::exception e) { std::cout << e.what(); error = true; @@ -63,23 +73,33 @@ SYCLStream::SYCLStream(const size_t ARRAY_SIZE, const int device_index) throw std::runtime_error("SYCL errors detected"); } }}); - - // No longer need list of devices - devices.clear(); - cached = true; - - + + // Create buffers + d_a = new buffer(array_size); + d_b = new buffer(array_size); + d_c = new buffer(array_size); + d_sum = new buffer(dot_num_groups); } +template +SYCLStream::~SYCLStream() +{ + delete d_a; + delete d_b; + delete d_c; + delete d_sum; + delete queue; + devices.clear(); +} template void SYCLStream::copy() { - queue->submit([&](sycl::handler &cgh) + queue->submit([&](handler &cgh) { - sycl::accessor ka {d_a, cgh, sycl::read_only}; - sycl::accessor kc {d_c, cgh, sycl::write_only}; - cgh.parallel_for(sycl::range<1>{array_size}, [=](sycl::id<1> idx) + auto ka = d_a->template get_access(cgh); + auto kc = d_c->template get_access(cgh); + cgh.parallel_for(range<1>{array_size}, [=](id<1> idx) { kc[idx] = ka[idx]; }); @@ -91,11 +111,11 @@ template void SYCLStream::mul() { const T scalar = startScalar; - queue->submit([&](sycl::handler &cgh) + queue->submit([&](handler &cgh) { - sycl::accessor kb {d_b, cgh, sycl::write_only}; - sycl::accessor kc {d_c, cgh, sycl::read_only}; - cgh.parallel_for(sycl::range<1>{array_size}, [=](sycl::id<1> idx) + auto kb = d_b->template get_access(cgh); + auto kc = d_c->template get_access(cgh); + cgh.parallel_for(range<1>{array_size}, [=](id<1> idx) { kb[idx] = scalar * kc[idx]; }); @@ -106,12 +126,12 @@ void SYCLStream::mul() template void SYCLStream::add() { - queue->submit([&](sycl::handler &cgh) + queue->submit([&](handler &cgh) { - sycl::accessor ka {d_a, cgh, sycl::read_only}; - sycl::accessor kb {d_b, cgh, sycl::read_only}; - sycl::accessor kc {d_c, cgh, sycl::write_only}; - cgh.parallel_for(sycl::range<1>{array_size}, [=](sycl::id<1> idx) + auto ka = d_a->template get_access(cgh); + auto kb = d_b->template get_access(cgh); + auto kc = d_c->template get_access(cgh); + cgh.parallel_for(range<1>{array_size}, [=](id<1> idx) { kc[idx] = ka[idx] + kb[idx]; }); @@ -123,12 +143,12 @@ template void SYCLStream::triad() { const T scalar = startScalar; - queue->submit([&](sycl::handler &cgh) + queue->submit([&](handler &cgh) { - sycl::accessor ka {d_a, cgh, sycl::write_only}; - sycl::accessor kb {d_b, cgh, sycl::read_only}; - sycl::accessor kc {d_c, cgh, sycl::read_only}; - cgh.parallel_for(sycl::range<1>{array_size}, [=](sycl::id<1> idx) + auto ka = d_a->template get_access(cgh); + auto kb = d_b->template get_access(cgh); + auto kc = d_c->template get_access(cgh); + cgh.parallel_for(range<1>{array_size}, [=](id<1> idx) { ka[idx] = kb[idx] + scalar * kc[idx]; }); @@ -140,13 +160,12 @@ template void SYCLStream::nstream() { const T scalar = startScalar; - - queue->submit([&](sycl::handler &cgh) + queue->submit([&](handler &cgh) { - sycl::accessor ka {d_a, cgh}; - sycl::accessor kb {d_b, cgh, sycl::read_only}; - sycl::accessor kc {d_c, cgh, sycl::read_only}; - cgh.parallel_for(sycl::range<1>{array_size}, [=](sycl::id<1> idx) + auto ka = d_a->template get_access(cgh); + auto kb = d_b->template get_access(cgh); + auto kc = d_c->template get_access(cgh); + cgh.parallel_for(range<1>{array_size}, [=](id<1> idx) { ka[idx] += kb[idx] + scalar * kc[idx]; }); @@ -157,55 +176,73 @@ void SYCLStream::nstream() template T SYCLStream::dot() { - - queue->submit([&](sycl::handler &cgh) + queue->submit([&](handler &cgh) { - sycl::accessor ka {d_a, cgh, sycl::read_only}; - sycl::accessor kb {d_b, cgh, sycl::read_only}; + auto ka = d_a->template get_access(cgh); + auto kb = d_b->template get_access(cgh); + auto ksum = d_sum->template get_access(cgh); - cgh.parallel_for(sycl::range<1>{array_size}, - // Reduction object, to perform summation - initialises the result to zero - sycl::reduction(d_sum, cgh, std::plus(), sycl::property::reduction::initialize_to_identity), - [=](sycl::id<1> idx, auto& sum) + auto wg_sum = accessor(range<1>(dot_wgsize), cgh); + + size_t N = array_size; + cgh.parallel_for(nd_range<1>(dot_num_groups*dot_wgsize, dot_wgsize), [=](nd_item<1> item) + { + size_t i = item.get_global_id(0); + size_t li = item.get_local_id(0); + size_t global_size = item.get_global_range()[0]; + + wg_sum[li] = 0.0; + for (; i < N; i += global_size) + wg_sum[li] += ka[i] * kb[i]; + + size_t local_size = item.get_local_range()[0]; + for (int offset = local_size / 2; offset > 0; offset /= 2) { - sum += ka[idx] * kb[idx]; - }); + item.barrier(cl::sycl::access::fence_space::local_space); + if (li < offset) + wg_sum[li] += wg_sum[li + offset]; + } + if (li == 0) + ksum[item.get_group(0)] = wg_sum[0]; + }); }); - // Get access on the host, and return a copy of the data (single number) - // This will block until the result is available, so no need to wait on the queue. - sycl::host_accessor result {d_sum, sycl::read_only}; - return result[0]; + T sum = 0.0; + auto h_sum = d_sum->template get_access(); + for (int i = 0; i < dot_num_groups; i++) + { + sum += h_sum[i]; + } + return sum; } template void SYCLStream::init_arrays(T initA, T initB, T initC) { - queue->submit([&](sycl::handler &cgh) + queue->submit([&](handler &cgh) { - sycl::accessor ka {d_a, cgh, sycl::write_only, sycl::no_init}; - sycl::accessor kb {d_b, cgh, sycl::write_only, sycl::no_init}; - sycl::accessor kc {d_c, cgh, sycl::write_only, sycl::no_init}; - - cgh.parallel_for(sycl::range<1>{array_size}, [=](sycl::id<1> idx) + auto ka = d_a->template get_access(cgh); + auto kb = d_b->template get_access(cgh); + auto kc = d_c->template get_access(cgh); + cgh.parallel_for(range<1>{array_size}, [=](item<1> item) { - ka[idx] = initA; - kb[idx] = initB; - kc[idx] = initC; + auto id = item.get_id(0); + ka[id] = initA; + kb[id] = initB; + kc[id] = initC; }); }); - queue->wait(); } template void SYCLStream::read_arrays(std::vector& a, std::vector& b, std::vector& c) { - sycl::host_accessor _a {d_a, sycl::read_only}; - sycl::host_accessor _b {d_b, sycl::read_only}; - sycl::host_accessor _c {d_c, sycl::read_only}; + auto _a = d_a->template get_access(); + auto _b = d_b->template get_access(); + auto _c = d_c->template get_access(); for (int i = 0; i < array_size; i++) { a[i] = _a[i]; @@ -217,7 +254,7 @@ void SYCLStream::read_arrays(std::vector& a, std::vector& b, std::vecto void getDeviceList(void) { // Ask SYCL runtime for all devices in system - devices = sycl::device::get_devices(); + devices = cl::sycl::device::get_devices(); cached = true; } @@ -251,7 +288,7 @@ std::string getDeviceName(const int device) if (device < devices.size()) { - name = devices[device].get_info(); + name = devices[device].get_info(); } else { @@ -270,7 +307,7 @@ std::string getDeviceDriver(const int device) if (device < devices.size()) { - driver = devices[device].get_info(); + driver = devices[device].get_info(); } else { @@ -280,5 +317,6 @@ std::string getDeviceDriver(const int device) return driver; } +// TODO: Fix kernel names to allow multiple template specializations template class SYCLStream; template class SYCLStream; diff --git a/src/sycl/SYCLStream.h b/src/sycl/SYCLStream.h index 7481d16..d3fa18d 100644 --- a/src/sycl/SYCLStream.h +++ b/src/sycl/SYCLStream.h @@ -8,13 +8,22 @@ #pragma once #include -#include #include "Stream.h" +#include "CL/sycl.hpp" -#include +#define IMPLEMENTATION_STRING "SYCL" -#define IMPLEMENTATION_STRING "SYCL 2020" +namespace sycl_kernels +{ + template class init; + template class copy; + template class mul; + template class add; + template class triad; + template class nstream; + template class dot; +} template class SYCLStream : public Stream @@ -24,19 +33,29 @@ class SYCLStream : public Stream size_t array_size; // SYCL objects - // Queue is a pointer because we allow device selection - std::unique_ptr queue; + cl::sycl::queue *queue; + cl::sycl::buffer *d_a; + cl::sycl::buffer *d_b; + cl::sycl::buffer *d_c; + cl::sycl::buffer *d_sum; - // Buffers - sycl::buffer d_a; - sycl::buffer d_b; - sycl::buffer d_c; - sycl::buffer d_sum; + // SYCL kernel names + typedef sycl_kernels::init init_kernel; + typedef sycl_kernels::copy copy_kernel; + typedef sycl_kernels::mul mul_kernel; + typedef sycl_kernels::add add_kernel; + typedef sycl_kernels::triad triad_kernel; + typedef sycl_kernels::nstream nstream_kernel; + typedef sycl_kernels::dot dot_kernel; + + // NDRange configuration for the dot kernel + size_t dot_num_groups; + size_t dot_wgsize; public: - SYCLStream(const size_t, const int); - ~SYCLStream() = default; + SYCLStream(const int, const int); + ~SYCLStream(); virtual void copy() override; virtual void add() override; diff --git a/src/sycl/SYCLStream2020.cpp b/src/sycl/SYCLStream2020.cpp new file mode 100644 index 0000000..76c0a8b --- /dev/null +++ b/src/sycl/SYCLStream2020.cpp @@ -0,0 +1,284 @@ + +// Copyright (c) 2015-16 Tom Deakin, Simon McIntosh-Smith, +// University of Bristol HPC +// +// For full license terms please see the LICENSE file distributed with this +// source code + +#include "SYCLStream.h" + +#include + +// Cache list of devices +bool cached = false; +std::vector devices; +void getDeviceList(void); + +template +SYCLStream::SYCLStream(const size_t ARRAY_SIZE, const int device_index) +: array_size {ARRAY_SIZE}, + d_a {ARRAY_SIZE}, + d_b {ARRAY_SIZE}, + d_c {ARRAY_SIZE}, + d_sum {1} +{ + if (!cached) + getDeviceList(); + + if (device_index >= devices.size()) + throw std::runtime_error("Invalid device index"); + + sycl::device dev = devices[device_index]; + + // Print out device information + std::cout << "Using SYCL device " << getDeviceName(device_index) << std::endl; + std::cout << "Driver: " << getDeviceDriver(device_index) << std::endl; + + // Check device can support FP64 if needed + if (sizeof(T) == sizeof(double)) + { + if (!dev.has(sycl::aspect::fp64)) + { + throw std::runtime_error("Device does not support double precision, please use --float"); + } + } + + queue = std::make_unique(dev, sycl::async_handler{[&](sycl::exception_list l) + { + bool error = false; + for(auto e: l) + { + try + { + std::rethrow_exception(e); + } + catch (sycl::exception e) + { + std::cout << e.what(); + error = true; + } + } + if(error) + { + throw std::runtime_error("SYCL errors detected"); + } + }}); + + // No longer need list of devices + devices.clear(); + cached = true; + + +} + + +template +void SYCLStream::copy() +{ + queue->submit([&](sycl::handler &cgh) + { + sycl::accessor ka {d_a, cgh, sycl::read_only}; + sycl::accessor kc {d_c, cgh, sycl::write_only}; + cgh.parallel_for(sycl::range<1>{array_size}, [=](sycl::id<1> idx) + { + kc[idx] = ka[idx]; + }); + }); + queue->wait(); +} + +template +void SYCLStream::mul() +{ + const T scalar = startScalar; + queue->submit([&](sycl::handler &cgh) + { + sycl::accessor kb {d_b, cgh, sycl::write_only}; + sycl::accessor kc {d_c, cgh, sycl::read_only}; + cgh.parallel_for(sycl::range<1>{array_size}, [=](sycl::id<1> idx) + { + kb[idx] = scalar * kc[idx]; + }); + }); + queue->wait(); +} + +template +void SYCLStream::add() +{ + queue->submit([&](sycl::handler &cgh) + { + sycl::accessor ka {d_a, cgh, sycl::read_only}; + sycl::accessor kb {d_b, cgh, sycl::read_only}; + sycl::accessor kc {d_c, cgh, sycl::write_only}; + cgh.parallel_for(sycl::range<1>{array_size}, [=](sycl::id<1> idx) + { + kc[idx] = ka[idx] + kb[idx]; + }); + }); + queue->wait(); +} + +template +void SYCLStream::triad() +{ + const T scalar = startScalar; + queue->submit([&](sycl::handler &cgh) + { + sycl::accessor ka {d_a, cgh, sycl::write_only}; + sycl::accessor kb {d_b, cgh, sycl::read_only}; + sycl::accessor kc {d_c, cgh, sycl::read_only}; + cgh.parallel_for(sycl::range<1>{array_size}, [=](sycl::id<1> idx) + { + ka[idx] = kb[idx] + scalar * kc[idx]; + }); + }); + queue->wait(); +} + +template +void SYCLStream::nstream() +{ + const T scalar = startScalar; + + queue->submit([&](sycl::handler &cgh) + { + sycl::accessor ka {d_a, cgh}; + sycl::accessor kb {d_b, cgh, sycl::read_only}; + sycl::accessor kc {d_c, cgh, sycl::read_only}; + cgh.parallel_for(sycl::range<1>{array_size}, [=](sycl::id<1> idx) + { + ka[idx] += kb[idx] + scalar * kc[idx]; + }); + }); + queue->wait(); +} + +template +T SYCLStream::dot() +{ + + queue->submit([&](sycl::handler &cgh) + { + sycl::accessor ka {d_a, cgh, sycl::read_only}; + sycl::accessor kb {d_b, cgh, sycl::read_only}; + + cgh.parallel_for(sycl::range<1>{array_size}, + // Reduction object, to perform summation - initialises the result to zero + sycl::reduction(d_sum, cgh, std::plus(), sycl::property::reduction::initialize_to_identity), + [=](sycl::id<1> idx, auto& sum) + { + sum += ka[idx] * kb[idx]; + }); + + }); + + // Get access on the host, and return a copy of the data (single number) + // This will block until the result is available, so no need to wait on the queue. + sycl::host_accessor result {d_sum, sycl::read_only}; + return result[0]; + +} + +template +void SYCLStream::init_arrays(T initA, T initB, T initC) +{ + queue->submit([&](sycl::handler &cgh) + { + sycl::accessor ka {d_a, cgh, sycl::write_only, sycl::no_init}; + sycl::accessor kb {d_b, cgh, sycl::write_only, sycl::no_init}; + sycl::accessor kc {d_c, cgh, sycl::write_only, sycl::no_init}; + + cgh.parallel_for(sycl::range<1>{array_size}, [=](sycl::id<1> idx) + { + ka[idx] = initA; + kb[idx] = initB; + kc[idx] = initC; + }); + }); + + queue->wait(); +} + +template +void SYCLStream::read_arrays(std::vector& a, std::vector& b, std::vector& c) +{ + sycl::host_accessor _a {d_a, sycl::read_only}; + sycl::host_accessor _b {d_b, sycl::read_only}; + sycl::host_accessor _c {d_c, sycl::read_only}; + for (int i = 0; i < array_size; i++) + { + a[i] = _a[i]; + b[i] = _b[i]; + c[i] = _c[i]; + } +} + +void getDeviceList(void) +{ + // Ask SYCL runtime for all devices in system + devices = sycl::device::get_devices(); + cached = true; +} + +void listDevices(void) +{ + getDeviceList(); + + // Print device names + if (devices.size() == 0) + { + std::cerr << "No devices found." << std::endl; + } + else + { + std::cout << std::endl; + std::cout << "Devices:" << std::endl; + for (int i = 0; i < devices.size(); i++) + { + std::cout << i << ": " << getDeviceName(i) << std::endl; + } + std::cout << std::endl; + } +} + +std::string getDeviceName(const int device) +{ + if (!cached) + getDeviceList(); + + std::string name; + + if (device < devices.size()) + { + name = devices[device].get_info(); + } + else + { + throw std::runtime_error("Error asking for name for non-existant device"); + } + + return name; +} + +std::string getDeviceDriver(const int device) +{ + if (!cached) + getDeviceList(); + + std::string driver; + + if (device < devices.size()) + { + driver = devices[device].get_info(); + } + else + { + throw std::runtime_error("Error asking for driver for non-existant device"); + } + + return driver; +} + +template class SYCLStream; +template class SYCLStream; diff --git a/src/sycl/SYCLStream2020.h b/src/sycl/SYCLStream2020.h new file mode 100644 index 0000000..7481d16 --- /dev/null +++ b/src/sycl/SYCLStream2020.h @@ -0,0 +1,54 @@ + +// Copyright (c) 2015-16 Tom Deakin, Simon McIntosh-Smith, +// University of Bristol HPC +// +// For full license terms please see the LICENSE file distributed with this +// source code + +#pragma once + +#include +#include + +#include "Stream.h" + +#include + +#define IMPLEMENTATION_STRING "SYCL 2020" + +template +class SYCLStream : public Stream +{ + protected: + // Size of arrays + size_t array_size; + + // SYCL objects + // Queue is a pointer because we allow device selection + std::unique_ptr queue; + + // Buffers + sycl::buffer d_a; + sycl::buffer d_b; + sycl::buffer d_c; + sycl::buffer d_sum; + + public: + + SYCLStream(const size_t, const int); + ~SYCLStream() = default; + + virtual void copy() override; + virtual void add() override; + virtual void mul() override; + virtual void triad() override; + virtual void nstream() override; + virtual T dot() override; + + virtual void init_arrays(T initA, T initB, T initC) override; + virtual void read_arrays(std::vector& a, std::vector& b, std::vector& c) override; + +}; + +// Populate the devices list +void getDeviceList(void); From b70c7f0357afa0fb6fa48d4098dcbcae65c10ced Mon Sep 17 00:00:00 2001 From: Tom Deakin Date: Thu, 9 Dec 2021 11:27:44 +0000 Subject: [PATCH 23/26] update headers --- src/main.cpp | 4 +++- src/sycl/SYCLStream2020.cpp | 2 +- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/src/main.cpp b/src/main.cpp index 2791bdc..5a01b74 100644 --- a/src/main.cpp +++ b/src/main.cpp @@ -43,6 +43,8 @@ #include "ACCStream.h" #elif defined(SYCL) #include "SYCLStream.h" +#elif defined(SYCL2020) +#include "SYCLStream2020.h" #elif defined(OMP) #include "OMPStream.h" #endif @@ -282,7 +284,7 @@ void run() // Use the OpenACC implementation stream = new ACCStream(ARRAY_SIZE, deviceIndex); -#elif defined(SYCL) +#elif defined(SYCL) || defined(SYCL2020) // Use the SYCL implementation stream = new SYCLStream(ARRAY_SIZE, deviceIndex); diff --git a/src/sycl/SYCLStream2020.cpp b/src/sycl/SYCLStream2020.cpp index 76c0a8b..6a0dd96 100644 --- a/src/sycl/SYCLStream2020.cpp +++ b/src/sycl/SYCLStream2020.cpp @@ -5,7 +5,7 @@ // For full license terms please see the LICENSE file distributed with this // source code -#include "SYCLStream.h" +#include "SYCLStream2020.h" #include From 2f1187f0d5290d1f59b0882f40f09895fd5d4198 Mon Sep 17 00:00:00 2001 From: Tom Deakin Date: Thu, 9 Dec 2021 11:40:30 +0000 Subject: [PATCH 24/26] Move SYCL2020 to subdirectory --- CMakeLists.txt | 3 +- src/sycl2020/.SYCLStream2020.h.swp | Bin 0 -> 12288 bytes src/{sycl => sycl2020}/SYCLStream2020.cpp | 0 src/{sycl => sycl2020}/SYCLStream2020.h | 3 +- src/sycl2020/model.cmake | 86 ++++++++++++++++++++++ 5 files changed, 90 insertions(+), 2 deletions(-) create mode 100644 src/sycl2020/.SYCLStream2020.h.swp rename src/{sycl => sycl2020}/SYCLStream2020.cpp (100%) rename src/{sycl => sycl2020}/SYCLStream2020.h (95%) create mode 100644 src/sycl2020/model.cmake diff --git a/CMakeLists.txt b/CMakeLists.txt index 58e0a3b..ad12dbc 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -123,6 +123,7 @@ register_model(hip HIP HIPStream.cpp) register_model(cuda CUDA CUDAStream.cu) register_model(kokkos KOKKOS KokkosStream.cpp) register_model(sycl SYCL SYCLStream.cpp) +register_model(sycl2020 SYCL2020 SYCLStream2020.cpp) register_model(acc ACC ACCStream.cpp) # defining RAJA collides with the RAJA namespace so USE_RAJA register_model(raja USE_RAJA RAJAStream.cpp) @@ -206,4 +207,4 @@ if (COMMAND setup_target) setup_target(${EXE_NAME}) endif () -install(TARGETS ${EXE_NAME} DESTINATION bin) \ No newline at end of file +install(TARGETS ${EXE_NAME} DESTINATION bin) diff --git a/src/sycl2020/.SYCLStream2020.h.swp b/src/sycl2020/.SYCLStream2020.h.swp new file mode 100644 index 0000000000000000000000000000000000000000..4b4ae277bfb5a4610edfce2e422910db10b9e910 GIT binary patch literal 12288 zcmeI2O^+Kj7{^_Xg#zN`z=b}f5^SZs?Y5;T8$!EjU?XLhg`^0|A@a;P8<&~!V0*Hp zg>pjTQ1Kl=AjA;~z5yo$M{aNf4hTL&`FkeY-R-K|9w~C9pC-1Sd2Ije87Yc`=3D&@ zI$uA_@OhlEKmPib|8n!s$|oN)W>w;9@A8=;DXX#_kFv9Vh4 zq+xW}F!L~8X|6OI!*{xaVNvgj_34obeWN$v4cwoB+)ipQonU8HmYeQ|Q#;koi=w}H1Kxl);0<^K z-hemY4R{0IfH&X`cmv+R189IpjIAGM?83tsJpTXR{r!LIF~)uXUxP2f+n^8D!76Bh z7r@itDZs$3M;ZGa{0x2qUx3fSXW%NBf_Fg=)IkkA1AaTf*!SQ&@D;cT47dVb0UdA_ zoB_wdKaVi>H~0~J13m=;TmUD*v*1_E=Rn7Vi@~ql`+C+Cbi9Z zOhq0XhmFQuYd$wi#zw8yOoUeH_Nenb@haU}6&$V#Pm`Jntz{%u59gPuw7dC7YhrAxRF-6Fv9L&!~gCW?G!H#X^_jMyPQfq*+$yAbG)Vzs$yE->bk}*_PUna rQ%g&D1*|le&()S+q>)PKlHl*jv`RynsFXIserlDO)`khvR~h>k;R44k literal 0 HcmV?d00001 diff --git a/src/sycl/SYCLStream2020.cpp b/src/sycl2020/SYCLStream2020.cpp similarity index 100% rename from src/sycl/SYCLStream2020.cpp rename to src/sycl2020/SYCLStream2020.cpp diff --git a/src/sycl/SYCLStream2020.h b/src/sycl2020/SYCLStream2020.h similarity index 95% rename from src/sycl/SYCLStream2020.h rename to src/sycl2020/SYCLStream2020.h index 7481d16..74b4221 100644 --- a/src/sycl/SYCLStream2020.h +++ b/src/sycl2020/SYCLStream2020.h @@ -12,7 +12,8 @@ #include "Stream.h" -#include +//#include +#include #define IMPLEMENTATION_STRING "SYCL 2020" diff --git a/src/sycl2020/model.cmake b/src/sycl2020/model.cmake new file mode 100644 index 0000000..e7b5a1c --- /dev/null +++ b/src/sycl2020/model.cmake @@ -0,0 +1,86 @@ + +register_flag_optional(CMAKE_CXX_COMPILER + "Any CXX compiler that is supported by CMake detection, this is used for host compilation when required by the SYCL compiler" + "c++") + +register_flag_required(SYCL_COMPILER + "Compile using the specified SYCL compiler implementation + Supported values are + ONEAPI-DPCPP - dpc++ that is part of an oneAPI Base Toolkit distribution (https://software.intel.com/content/www/us/en/develop/tools/oneapi/base-toolkit.html) + DPCPP - dpc++ as a standalone compiler (https://github.com/intel/llvm) + HIPSYCL - hipSYCL compiler (https://github.com/illuhad/hipSYCL) + COMPUTECPP - ComputeCpp compiler (https://developer.codeplay.com/products/computecpp/ce/home)") + +register_flag_optional(SYCL_COMPILER_DIR + "Absolute path to the selected SYCL compiler directory, most are packaged differently so set the path according to `SYCL_COMPILER`: + ONEAPI-DPCPP - not required but `dpcpp` must be on PATH, load oneAPI as per documentation (i.e `source /opt/intel/oneapi/setvars.sh` first) + HIPSYCL|DPCPP|COMPUTECPP - set to the root of the binary distribution that contains at least `bin/`, `include/`, and `lib/`" + "") + +register_flag_optional(OpenCL_LIBRARY + "[ComputeCpp only] Path to OpenCL library, usually called libOpenCL.so" + "${OpenCL_LIBRARY}") + +macro(setup) + set(CMAKE_CXX_STANDARD 17) + + + if (${SYCL_COMPILER} STREQUAL "HIPSYCL") + + + set(hipSYCL_DIR ${SYCL_COMPILER_DIR}/lib/cmake/hipSYCL) + + if (NOT EXISTS "${hipSYCL_DIR}") + message(WARNING "Falling back to hipSYCL < 0.9.0 CMake structure") + set(hipSYCL_DIR ${SYCL_COMPILER_DIR}/lib/cmake) + endif () + if (NOT EXISTS "${hipSYCL_DIR}") + message(FATAL_ERROR "Can't find the appropriate CMake definitions for hipSYCL") + endif () + + # register_definitions(_GLIBCXX_USE_CXX11_ABI=0) + find_package(hipSYCL CONFIG REQUIRED) + message(STATUS "ok") + + elseif (${SYCL_COMPILER} STREQUAL "COMPUTECPP") + + list(APPEND CMAKE_MODULE_PATH ${CMAKE_SOURCE_DIR}/cmake/Modules) + set(ComputeCpp_DIR ${SYCL_COMPILER_DIR}) + + setup_opencl_header_includes() + + register_definitions(CL_TARGET_OPENCL_VERSION=220 _GLIBCXX_USE_CXX11_ABI=0) + # ComputeCpp needs OpenCL + find_package(ComputeCpp REQUIRED) + + # this must come after FindComputeCpp (!) + set(COMPUTECPP_USER_FLAGS -O3 -no-serial-memop) + + elseif (${SYCL_COMPILER} STREQUAL "DPCPP") + set(CMAKE_CXX_COMPILER ${SYCL_COMPILER_DIR}/bin/clang++) + include_directories(${SYCL_COMPILER_DIR}/include/sycl) + register_definitions(CL_TARGET_OPENCL_VERSION=220) + register_append_cxx_flags(ANY -fsycl) + register_append_link_flags(-fsycl) + elseif (${SYCL_COMPILER} STREQUAL "ONEAPI-DPCPP") + set(CMAKE_CXX_COMPILER dpcpp) + register_definitions(CL_TARGET_OPENCL_VERSION=220) + else () + message(FATAL_ERROR "SYCL_COMPILER=${SYCL_COMPILER} is unsupported") + endif () + +endmacro() + + +macro(setup_target NAME) + if ( + (${SYCL_COMPILER} STREQUAL "COMPUTECPP") OR + (${SYCL_COMPILER} STREQUAL "HIPSYCL")) + # so ComputeCpp and hipSYCL has this weird (and bad) CMake usage where they append their + # own custom integration header flags AFTER the target has been specified + # hence this macro here + add_sycl_to_target( + TARGET ${NAME} + SOURCES ${IMPL_SOURCES}) + endif () +endmacro() From 9e9cefe8593944cfd6c939a15646f3c4c1ae0ae0 Mon Sep 17 00:00:00 2001 From: Tom Deakin Date: Thu, 9 Dec 2021 11:52:47 +0000 Subject: [PATCH 25/26] remove spurious .swp file --- src/sycl2020/.SYCLStream2020.h.swp | Bin 12288 -> 0 bytes 1 file changed, 0 insertions(+), 0 deletions(-) delete mode 100644 src/sycl2020/.SYCLStream2020.h.swp diff --git a/src/sycl2020/.SYCLStream2020.h.swp b/src/sycl2020/.SYCLStream2020.h.swp deleted file mode 100644 index 4b4ae277bfb5a4610edfce2e422910db10b9e910..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 12288 zcmeI2O^+Kj7{^_Xg#zN`z=b}f5^SZs?Y5;T8$!EjU?XLhg`^0|A@a;P8<&~!V0*Hp zg>pjTQ1Kl=AjA;~z5yo$M{aNf4hTL&`FkeY-R-K|9w~C9pC-1Sd2Ije87Yc`=3D&@ zI$uA_@OhlEKmPib|8n!s$|oN)W>w;9@A8=;DXX#_kFv9Vh4 zq+xW}F!L~8X|6OI!*{xaVNvgj_34obeWN$v4cwoB+)ipQonU8HmYeQ|Q#;koi=w}H1Kxl);0<^K z-hemY4R{0IfH&X`cmv+R189IpjIAGM?83tsJpTXR{r!LIF~)uXUxP2f+n^8D!76Bh z7r@itDZs$3M;ZGa{0x2qUx3fSXW%NBf_Fg=)IkkA1AaTf*!SQ&@D;cT47dVb0UdA_ zoB_wdKaVi>H~0~J13m=;TmUD*v*1_E=Rn7Vi@~ql`+C+Cbi9Z zOhq0XhmFQuYd$wi#zw8yOoUeH_Nenb@haU}6&$V#Pm`Jntz{%u59gPuw7dC7YhrAxRF-6Fv9L&!~gCW?G!H#X^_jMyPQfq*+$yAbG)Vzs$yE->bk}*_PUna rQ%g&D1*|le&()S+q>)PKlHl*jv`RynsFXIserlDO)`khvR~h>k;R44k From 610c1734a9e27c2b01d76a51278994193dd08fde Mon Sep 17 00:00:00 2001 From: Tom Deakin Date: Thu, 9 Dec 2021 11:53:21 +0000 Subject: [PATCH 26/26] Fix SYCL 2020 header file name and reduction identity typos --- src/sycl2020/SYCLStream2020.cpp | 2 +- src/sycl2020/SYCLStream2020.h | 3 +-- 2 files changed, 2 insertions(+), 3 deletions(-) diff --git a/src/sycl2020/SYCLStream2020.cpp b/src/sycl2020/SYCLStream2020.cpp index 6a0dd96..17a5ab5 100644 --- a/src/sycl2020/SYCLStream2020.cpp +++ b/src/sycl2020/SYCLStream2020.cpp @@ -165,7 +165,7 @@ T SYCLStream::dot() cgh.parallel_for(sycl::range<1>{array_size}, // Reduction object, to perform summation - initialises the result to zero - sycl::reduction(d_sum, cgh, std::plus(), sycl::property::reduction::initialize_to_identity), + sycl::reduction(d_sum, cgh, std::plus(), sycl::property::reduction::initialize_to_identity{}), [=](sycl::id<1> idx, auto& sum) { sum += ka[idx] * kb[idx]; diff --git a/src/sycl2020/SYCLStream2020.h b/src/sycl2020/SYCLStream2020.h index 74b4221..7481d16 100644 --- a/src/sycl2020/SYCLStream2020.h +++ b/src/sycl2020/SYCLStream2020.h @@ -12,8 +12,7 @@ #include "Stream.h" -//#include -#include +#include #define IMPLEMENTATION_STRING "SYCL 2020"