From 60817e25a1fbf71c88f688e4d5b2c45cdd73c432 Mon Sep 17 00:00:00 2001 From: Georgi Mirazchiyski Date: Tue, 18 Jun 2019 17:22:49 +0100 Subject: [PATCH 1/4] fix deprecated use of get_global() and get_local() --- SYCLStream.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/SYCLStream.cpp b/SYCLStream.cpp index 171c9f7..983e965 100644 --- a/SYCLStream.cpp +++ b/SYCLStream.cpp @@ -185,8 +185,8 @@ T SYCLStream::dot() cgh.parallel_for(p->get_kernel(), nd_range<1>(dot_num_groups*dot_wgsize, dot_wgsize), [=](nd_item<1> item) { - size_t i = item.get_global(0); - size_t li = item.get_local(0); + size_t i = item.get_global_id(0); + size_t li = item.get_local_id(0); size_t global_size = item.get_global_range()[0]; wg_sum[li] = 0.0; From a2e53d6728afe66d779f79aaa67448cc3511abe2 Mon Sep 17 00:00:00 2001 From: GeorgeWeb Date: Tue, 18 Jun 2019 17:31:40 +0100 Subject: [PATCH 2/4] remove use of pre-built kernel in parallel_for as is not conformant with the SYCL spec. (yet) --- SYCLStream.cpp | 30 +++++++----------------------- 1 file changed, 7 insertions(+), 23 deletions(-) diff --git a/SYCLStream.cpp b/SYCLStream.cpp index 983e965..45cf477 100644 --- a/SYCLStream.cpp +++ b/SYCLStream.cpp @@ -16,7 +16,6 @@ using namespace cl::sycl; bool cached = false; std::vector devices; void getDeviceList(void); -program * p; template SYCLStream::SYCLStream(const unsigned int ARRAY_SIZE, const int device_index) @@ -68,15 +67,6 @@ SYCLStream::SYCLStream(const unsigned int ARRAY_SIZE, const int device_index) } }); - /* Pre-build the kernels */ - p = new program(queue->get_context()); - p->build_with_kernel_type(); - p->build_with_kernel_type(); - p->build_with_kernel_type(); - p->build_with_kernel_type(); - p->build_with_kernel_type(); - p->build_with_kernel_type(); - // Create buffers d_a = new buffer(array_size); d_b = new buffer(array_size); @@ -92,7 +82,7 @@ SYCLStream::~SYCLStream() delete d_c; delete d_sum; - delete p; + delete queue; devices.clear(); } @@ -104,8 +94,7 @@ void SYCLStream::copy() { auto ka = d_a->template get_access(cgh); auto kc = d_c->template get_access(cgh); - cgh.parallel_for(p->get_kernel(), - range<1>{array_size}, [=](item<1> item) + cgh.parallel_for(range<1>{array_size}, [=](item<1> item) { auto id = item.get_id(0); kc[id] = ka[id]; @@ -122,8 +111,7 @@ void SYCLStream::mul() { auto kb = d_b->template get_access(cgh); auto kc = d_c->template get_access(cgh); - cgh.parallel_for(p->get_kernel(), - range<1>{array_size}, [=](item<1> item) + cgh.parallel_for(range<1>{array_size}, [=](item<1> item) { auto id = item.get_id(0); kb[id] = scalar * kc[id]; @@ -140,8 +128,7 @@ void SYCLStream::add() auto ka = d_a->template get_access(cgh); auto kb = d_b->template get_access(cgh); auto kc = d_c->template get_access(cgh); - cgh.parallel_for(p->get_kernel(), - range<1>{array_size}, [=](item<1> item) + cgh.parallel_for(range<1>{array_size}, [=](item<1> item) { auto id = item.get_id(0); kc[id] = ka[id] + kb[id]; @@ -159,8 +146,7 @@ void SYCLStream::triad() auto ka = d_a->template get_access(cgh); auto kb = d_b->template get_access(cgh); auto kc = d_c->template get_access(cgh); - cgh.parallel_for(p->get_kernel(), - range<1>{array_size}, [=](item<1> item) + cgh.parallel_for(range<1>{array_size}, [=](item<1> item) { auto id = item.get_id(0); ka[id] = kb[id] + scalar * kc[id]; @@ -182,8 +168,7 @@ T SYCLStream::dot() size_t N = array_size; - cgh.parallel_for(p->get_kernel(), - nd_range<1>(dot_num_groups*dot_wgsize, dot_wgsize), [=](nd_item<1> item) + cgh.parallel_for(nd_range<1>(dot_num_groups*dot_wgsize, dot_wgsize), [=](nd_item<1> item) { size_t i = item.get_global_id(0); size_t li = item.get_local_id(0); @@ -224,8 +209,7 @@ void SYCLStream::init_arrays(T initA, T initB, T initC) auto ka = d_a->template get_access(cgh); auto kb = d_b->template get_access(cgh); auto kc = d_c->template get_access(cgh); - cgh.parallel_for(p->get_kernel(), - range<1>{array_size}, [=](item<1> item) + cgh.parallel_for(range<1>{array_size}, [=](item<1> item) { auto id = item.get_id(0); ka[id] = initA; From 54737d87cb2977601a98919371cd09a6513d035f Mon Sep 17 00:00:00 2001 From: GeorgeWeb Date: Thu, 20 Jun 2019 10:13:39 +0100 Subject: [PATCH 3/4] enclosing computecpp specific code in macros, rather than removing it --- SYCLStream.cpp | 50 +++++++++++++++++++++++++++++++++++++++++++++++--- SYCLStream.h | 3 +++ 2 files changed, 50 insertions(+), 3 deletions(-) diff --git a/SYCLStream.cpp b/SYCLStream.cpp index 45cf477..995eb1b 100644 --- a/SYCLStream.cpp +++ b/SYCLStream.cpp @@ -16,6 +16,9 @@ using namespace cl::sycl; bool cached = false; std::vector devices; void getDeviceList(void); +#ifdef COMPUTECPP_CE +program * p; +#endif template SYCLStream::SYCLStream(const unsigned int ARRAY_SIZE, const int device_index) @@ -66,6 +69,17 @@ SYCLStream::SYCLStream(const unsigned int ARRAY_SIZE, const int device_index) throw std::runtime_error("SYCL errors detected"); } }); + + #ifdef COMPUTECPP_CE + /* Pre-build the kernels */ + p = new program(queue->get_context()); + p->build_with_kernel_type(); + p->build_with_kernel_type(); + p->build_with_kernel_type(); + p->build_with_kernel_type(); + p->build_with_kernel_type(); + p->build_with_kernel_type(); + #endif // Create buffers d_a = new buffer(array_size); @@ -81,8 +95,9 @@ SYCLStream::~SYCLStream() delete d_b; delete d_c; delete d_sum; - - + #ifdef COMPUTECPP_CE + delete p; + #endif delete queue; devices.clear(); } @@ -94,7 +109,12 @@ void SYCLStream::copy() { auto ka = d_a->template get_access(cgh); auto kc = d_c->template get_access(cgh); + #ifdef COMPUTECPP_CE + cgh.parallel_for(p->get_kernel(), + range<1>{array_size}, [=](item<1> item) + #else cgh.parallel_for(range<1>{array_size}, [=](item<1> item) + #endif { auto id = item.get_id(0); kc[id] = ka[id]; @@ -111,7 +131,12 @@ void SYCLStream::mul() { auto kb = d_b->template get_access(cgh); auto kc = d_c->template get_access(cgh); + #ifdef COMPUTECPP_CE + cgh.parallel_for(p->get_kernel(), + range<1>{array_size}, [=](item<1> item) + #else cgh.parallel_for(range<1>{array_size}, [=](item<1> item) + #endif { auto id = item.get_id(0); kb[id] = scalar * kc[id]; @@ -128,7 +153,12 @@ void SYCLStream::add() auto ka = d_a->template get_access(cgh); auto kb = d_b->template get_access(cgh); auto kc = d_c->template get_access(cgh); + #ifdef COMPUTECPP_CE + cgh.parallel_for(p->get_kernel(), + range<1>{array_size}, [=](item<1> item) + #else cgh.parallel_for(range<1>{array_size}, [=](item<1> item) + #endif { auto id = item.get_id(0); kc[id] = ka[id] + kb[id]; @@ -146,7 +176,12 @@ void SYCLStream::triad() auto ka = d_a->template get_access(cgh); auto kb = d_b->template get_access(cgh); auto kc = d_c->template get_access(cgh); + #ifdef COMPUTECPP_CE + cgh.parallel_for(p->get_kernel(), + range<1>{array_size}, [=](item<1> item) + #else cgh.parallel_for(range<1>{array_size}, [=](item<1> item) + #endif { auto id = item.get_id(0); ka[id] = kb[id] + scalar * kc[id]; @@ -167,8 +202,12 @@ T SYCLStream::dot() auto wg_sum = accessor(range<1>(dot_wgsize), cgh); size_t N = array_size; - + #ifdef COMPUTECPP_CE + cgh.parallel_for(p->get_kernel(), + nd_range<1>(dot_num_groups*dot_wgsize, dot_wgsize), [=](nd_item<1> item) + #else cgh.parallel_for(nd_range<1>(dot_num_groups*dot_wgsize, dot_wgsize), [=](nd_item<1> item) + #endif { size_t i = item.get_global_id(0); size_t li = item.get_local_id(0); @@ -209,7 +248,12 @@ void SYCLStream::init_arrays(T initA, T initB, T initC) auto ka = d_a->template get_access(cgh); auto kb = d_b->template get_access(cgh); auto kc = d_c->template get_access(cgh); + #ifdef COMPUTECPP_CE + cgh.parallel_for(p->get_kernel(), + range<1>{array_size}, [=](item<1> item) + #else cgh.parallel_for(range<1>{array_size}, [=](item<1> item) + #endif { auto id = item.get_id(0); ka[id] = initA; diff --git a/SYCLStream.h b/SYCLStream.h index ab62ecd..9c06f6f 100644 --- a/SYCLStream.h +++ b/SYCLStream.h @@ -15,6 +15,9 @@ #define IMPLEMENTATION_STRING "SYCL" +// allows a use of 'parallel_for' currently known to be supported by ComputeCpp +#define COMPUTECPP_CE + namespace sycl_kernels { template class init; From e657bfa8973bdcaf9a85c8be9c20f9631f9a5fd6 Mon Sep 17 00:00:00 2001 From: GeorgeWeb Date: Thu, 20 Jun 2019 14:24:46 +0100 Subject: [PATCH 4/4] based on perf comparison, and discussions, the use pre-built kernels is unnecessary in this case --- SYCLStream.cpp | 46 ---------------------------------------------- SYCLStream.h | 3 --- 2 files changed, 49 deletions(-) diff --git a/SYCLStream.cpp b/SYCLStream.cpp index 995eb1b..ebe5f63 100644 --- a/SYCLStream.cpp +++ b/SYCLStream.cpp @@ -11,14 +11,11 @@ using namespace cl::sycl; - // Cache list of devices bool cached = false; std::vector devices; void getDeviceList(void); -#ifdef COMPUTECPP_CE program * p; -#endif template SYCLStream::SYCLStream(const unsigned int ARRAY_SIZE, const int device_index) @@ -70,17 +67,6 @@ SYCLStream::SYCLStream(const unsigned int ARRAY_SIZE, const int device_index) } }); - #ifdef COMPUTECPP_CE - /* Pre-build the kernels */ - p = new program(queue->get_context()); - p->build_with_kernel_type(); - p->build_with_kernel_type(); - p->build_with_kernel_type(); - p->build_with_kernel_type(); - p->build_with_kernel_type(); - p->build_with_kernel_type(); - #endif - // Create buffers d_a = new buffer(array_size); d_b = new buffer(array_size); @@ -95,9 +81,7 @@ SYCLStream::~SYCLStream() delete d_b; delete d_c; delete d_sum; - #ifdef COMPUTECPP_CE delete p; - #endif delete queue; devices.clear(); } @@ -109,12 +93,7 @@ void SYCLStream::copy() { auto ka = d_a->template get_access(cgh); auto kc = d_c->template get_access(cgh); - #ifdef COMPUTECPP_CE - cgh.parallel_for(p->get_kernel(), - range<1>{array_size}, [=](item<1> item) - #else cgh.parallel_for(range<1>{array_size}, [=](item<1> item) - #endif { auto id = item.get_id(0); kc[id] = ka[id]; @@ -131,12 +110,7 @@ void SYCLStream::mul() { auto kb = d_b->template get_access(cgh); auto kc = d_c->template get_access(cgh); - #ifdef COMPUTECPP_CE - cgh.parallel_for(p->get_kernel(), - range<1>{array_size}, [=](item<1> item) - #else cgh.parallel_for(range<1>{array_size}, [=](item<1> item) - #endif { auto id = item.get_id(0); kb[id] = scalar * kc[id]; @@ -153,12 +127,7 @@ void SYCLStream::add() auto ka = d_a->template get_access(cgh); auto kb = d_b->template get_access(cgh); auto kc = d_c->template get_access(cgh); - #ifdef COMPUTECPP_CE - cgh.parallel_for(p->get_kernel(), - range<1>{array_size}, [=](item<1> item) - #else cgh.parallel_for(range<1>{array_size}, [=](item<1> item) - #endif { auto id = item.get_id(0); kc[id] = ka[id] + kb[id]; @@ -176,12 +145,7 @@ void SYCLStream::triad() auto ka = d_a->template get_access(cgh); auto kb = d_b->template get_access(cgh); auto kc = d_c->template get_access(cgh); - #ifdef COMPUTECPP_CE - cgh.parallel_for(p->get_kernel(), - range<1>{array_size}, [=](item<1> item) - #else cgh.parallel_for(range<1>{array_size}, [=](item<1> item) - #endif { auto id = item.get_id(0); ka[id] = kb[id] + scalar * kc[id]; @@ -202,12 +166,7 @@ T SYCLStream::dot() auto wg_sum = accessor(range<1>(dot_wgsize), cgh); size_t N = array_size; - #ifdef COMPUTECPP_CE - cgh.parallel_for(p->get_kernel(), - nd_range<1>(dot_num_groups*dot_wgsize, dot_wgsize), [=](nd_item<1> item) - #else cgh.parallel_for(nd_range<1>(dot_num_groups*dot_wgsize, dot_wgsize), [=](nd_item<1> item) - #endif { size_t i = item.get_global_id(0); size_t li = item.get_local_id(0); @@ -248,12 +207,7 @@ void SYCLStream::init_arrays(T initA, T initB, T initC) auto ka = d_a->template get_access(cgh); auto kb = d_b->template get_access(cgh); auto kc = d_c->template get_access(cgh); - #ifdef COMPUTECPP_CE - cgh.parallel_for(p->get_kernel(), - range<1>{array_size}, [=](item<1> item) - #else cgh.parallel_for(range<1>{array_size}, [=](item<1> item) - #endif { auto id = item.get_id(0); ka[id] = initA; diff --git a/SYCLStream.h b/SYCLStream.h index 9c06f6f..ab62ecd 100644 --- a/SYCLStream.h +++ b/SYCLStream.h @@ -15,9 +15,6 @@ #define IMPLEMENTATION_STRING "SYCL" -// allows a use of 'parallel_for' currently known to be supported by ComputeCpp -#define COMPUTECPP_CE - namespace sycl_kernels { template class init;