diff --git a/SYCLStream.cpp b/SYCLStream.cpp index d4a2fd0..04a225a 100644 --- a/SYCLStream.cpp +++ b/SYCLStream.cpp @@ -11,12 +11,20 @@ using namespace cl::sycl; -#define WGSIZE 256 // Cache list of devices bool cached = false; std::vector devices; void getDeviceList(void); +program * p; + +/* Forward declaration of SYCL kernels */ +namespace kernels { + class copy; + class mul; + class add; + class triad; +} template SYCLStream::SYCLStream(const unsigned int ARRAY_SIZE, const int device_index) @@ -24,14 +32,6 @@ SYCLStream::SYCLStream(const unsigned int ARRAY_SIZE, const int device_index) if (!cached) getDeviceList(); - // The array size must be divisible by WGSIZE - if (ARRAY_SIZE % WGSIZE != 0) - { - std::stringstream ss; - ss << "Array size must be a multiple of " << WGSIZE; - throw std::runtime_error(ss.str()); - } - array_size = ARRAY_SIZE; if (device_index >= devices.size()) @@ -44,6 +44,14 @@ SYCLStream::SYCLStream(const unsigned int ARRAY_SIZE, const int device_index) queue = new cl::sycl::queue(dev); + /* Pre-build the kernels */ + p = new program(queue->get_context()); + p->build_from_kernel_name(); + p->build_from_kernel_name(); + p->build_from_kernel_name(); + p->build_from_kernel_name(); + + // Create buffers d_a = new buffer(array_size); d_b = new buffer(array_size); @@ -57,6 +65,7 @@ SYCLStream::~SYCLStream() delete d_b; delete d_c; + delete p; delete queue; } @@ -67,9 +76,11 @@ void SYCLStream::copy() { auto ka = d_a->template get_access(cgh); auto kc = d_c->template get_access(cgh); - cgh.parallel_for(nd_range<1>{array_size, WGSIZE}, [=](nd_item<1> item) + cgh.parallel_for(p->get_kernel(), + range<1>{array_size}, [=](item<1> item) { - kc[item.get_global()] = ka[item.get_global()]; + auto id = item.get(); + kc[id[0]] = ka[id[0]]; }); }); queue->wait(); @@ -83,9 +94,11 @@ void SYCLStream::mul() { auto kb = d_b->template get_access(cgh); auto kc = d_c->template get_access(cgh); - cgh.parallel_for(nd_range<1>{array_size, WGSIZE}, [=](nd_item<1> item) + cgh.parallel_for(p->get_kernel(), + range<1>{array_size}, [=](item<1> item) { - kb[item.get_global()] = scalar * kc[item.get_global()]; + auto id = item.get(); + kb[id[0]] = scalar * kc[id[0]]; }); }); queue->wait(); @@ -99,9 +112,11 @@ void SYCLStream::add() auto ka = d_a->template get_access(cgh); auto kb = d_b->template get_access(cgh); auto kc = d_c->template get_access(cgh); - cgh.parallel_for(nd_range<1>{array_size, WGSIZE}, [=](nd_item<1> item) + cgh.parallel_for(p->get_kernel(), + range<1>{array_size}, [=](item<1> item) { - kc[item.get_global()] = ka[item.get_global()] + kb[item.get_global()]; + auto id = item.get(); + kc[id[0]] = ka[id[0]] + kb[id[0]]; }); }); queue->wait(); @@ -116,9 +131,11 @@ void SYCLStream::triad() auto ka = d_a->template get_access(cgh); auto kb = d_b->template get_access(cgh); auto kc = d_c->template get_access(cgh); - cgh.parallel_for(nd_range<1>{array_size, WGSIZE}, [=](nd_item<1> item) + cgh.parallel_for(p->get_kernel(), + range<1>{array_size}, [=](item<1> item) { - ka[item.get_global()] = kb[item.get_global()] + scalar * kc[item.get_global()]; + auto id = item.get(); + ka[id] = kb[id[0]] + scalar * kc[id[0]]; }); }); queue->wait();