diff --git a/src/sycl/SYCLStream.cpp b/src/sycl/SYCLStream.cpp index 76c0a8b..00c043f 100644 --- a/src/sycl/SYCLStream.cpp +++ b/src/sycl/SYCLStream.cpp @@ -9,41 +9,51 @@ #include +using namespace cl::sycl; + // Cache list of devices bool cached = false; -std::vector devices; +std::vector devices; void getDeviceList(void); template -SYCLStream::SYCLStream(const size_t ARRAY_SIZE, const int device_index) -: array_size {ARRAY_SIZE}, - d_a {ARRAY_SIZE}, - d_b {ARRAY_SIZE}, - d_c {ARRAY_SIZE}, - d_sum {1} +SYCLStream::SYCLStream(const int ARRAY_SIZE, const int device_index) { if (!cached) getDeviceList(); + array_size = ARRAY_SIZE; + if (device_index >= devices.size()) throw std::runtime_error("Invalid device index"); - - sycl::device dev = devices[device_index]; - - // Print out device information - std::cout << "Using SYCL device " << getDeviceName(device_index) << std::endl; - std::cout << "Driver: " << getDeviceDriver(device_index) << std::endl; + device dev = devices[device_index]; // Check device can support FP64 if needed if (sizeof(T) == sizeof(double)) { - if (!dev.has(sycl::aspect::fp64)) - { + if (dev.get_info().size() == 0) { throw std::runtime_error("Device does not support double precision, please use --float"); } } - queue = std::make_unique(dev, sycl::async_handler{[&](sycl::exception_list l) + // Determine sensible dot kernel NDRange configuration + if (dev.is_cpu()) + { + dot_num_groups = dev.get_info(); + dot_wgsize = dev.get_info() * 2; + } + else + { + dot_num_groups = dev.get_info() * 4; + dot_wgsize = dev.get_info(); + } + + // Print out device information + std::cout << "Using SYCL device " << getDeviceName(device_index) << std::endl; + std::cout << "Driver: " << getDeviceDriver(device_index) << std::endl; + std::cout << "Reduction kernel config: " << dot_num_groups << " groups of size " << dot_wgsize << std::endl; + + queue = new cl::sycl::queue(dev, cl::sycl::async_handler{[&](cl::sycl::exception_list l) { bool error = false; for(auto e: l) @@ -52,7 +62,7 @@ SYCLStream::SYCLStream(const size_t ARRAY_SIZE, const int device_index) { std::rethrow_exception(e); } - catch (sycl::exception e) + catch (cl::sycl::exception e) { std::cout << e.what(); error = true; @@ -63,23 +73,33 @@ SYCLStream::SYCLStream(const size_t ARRAY_SIZE, const int device_index) throw std::runtime_error("SYCL errors detected"); } }}); - - // No longer need list of devices - devices.clear(); - cached = true; - - + + // Create buffers + d_a = new buffer(array_size); + d_b = new buffer(array_size); + d_c = new buffer(array_size); + d_sum = new buffer(dot_num_groups); } +template +SYCLStream::~SYCLStream() +{ + delete d_a; + delete d_b; + delete d_c; + delete d_sum; + delete queue; + devices.clear(); +} template void SYCLStream::copy() { - queue->submit([&](sycl::handler &cgh) + queue->submit([&](handler &cgh) { - sycl::accessor ka {d_a, cgh, sycl::read_only}; - sycl::accessor kc {d_c, cgh, sycl::write_only}; - cgh.parallel_for(sycl::range<1>{array_size}, [=](sycl::id<1> idx) + auto ka = d_a->template get_access(cgh); + auto kc = d_c->template get_access(cgh); + cgh.parallel_for(range<1>{array_size}, [=](id<1> idx) { kc[idx] = ka[idx]; }); @@ -91,11 +111,11 @@ template void SYCLStream::mul() { const T scalar = startScalar; - queue->submit([&](sycl::handler &cgh) + queue->submit([&](handler &cgh) { - sycl::accessor kb {d_b, cgh, sycl::write_only}; - sycl::accessor kc {d_c, cgh, sycl::read_only}; - cgh.parallel_for(sycl::range<1>{array_size}, [=](sycl::id<1> idx) + auto kb = d_b->template get_access(cgh); + auto kc = d_c->template get_access(cgh); + cgh.parallel_for(range<1>{array_size}, [=](id<1> idx) { kb[idx] = scalar * kc[idx]; }); @@ -106,12 +126,12 @@ void SYCLStream::mul() template void SYCLStream::add() { - queue->submit([&](sycl::handler &cgh) + queue->submit([&](handler &cgh) { - sycl::accessor ka {d_a, cgh, sycl::read_only}; - sycl::accessor kb {d_b, cgh, sycl::read_only}; - sycl::accessor kc {d_c, cgh, sycl::write_only}; - cgh.parallel_for(sycl::range<1>{array_size}, [=](sycl::id<1> idx) + auto ka = d_a->template get_access(cgh); + auto kb = d_b->template get_access(cgh); + auto kc = d_c->template get_access(cgh); + cgh.parallel_for(range<1>{array_size}, [=](id<1> idx) { kc[idx] = ka[idx] + kb[idx]; }); @@ -123,12 +143,12 @@ template void SYCLStream::triad() { const T scalar = startScalar; - queue->submit([&](sycl::handler &cgh) + queue->submit([&](handler &cgh) { - sycl::accessor ka {d_a, cgh, sycl::write_only}; - sycl::accessor kb {d_b, cgh, sycl::read_only}; - sycl::accessor kc {d_c, cgh, sycl::read_only}; - cgh.parallel_for(sycl::range<1>{array_size}, [=](sycl::id<1> idx) + auto ka = d_a->template get_access(cgh); + auto kb = d_b->template get_access(cgh); + auto kc = d_c->template get_access(cgh); + cgh.parallel_for(range<1>{array_size}, [=](id<1> idx) { ka[idx] = kb[idx] + scalar * kc[idx]; }); @@ -140,13 +160,12 @@ template void SYCLStream::nstream() { const T scalar = startScalar; - - queue->submit([&](sycl::handler &cgh) + queue->submit([&](handler &cgh) { - sycl::accessor ka {d_a, cgh}; - sycl::accessor kb {d_b, cgh, sycl::read_only}; - sycl::accessor kc {d_c, cgh, sycl::read_only}; - cgh.parallel_for(sycl::range<1>{array_size}, [=](sycl::id<1> idx) + auto ka = d_a->template get_access(cgh); + auto kb = d_b->template get_access(cgh); + auto kc = d_c->template get_access(cgh); + cgh.parallel_for(range<1>{array_size}, [=](id<1> idx) { ka[idx] += kb[idx] + scalar * kc[idx]; }); @@ -157,55 +176,73 @@ void SYCLStream::nstream() template T SYCLStream::dot() { - - queue->submit([&](sycl::handler &cgh) + queue->submit([&](handler &cgh) { - sycl::accessor ka {d_a, cgh, sycl::read_only}; - sycl::accessor kb {d_b, cgh, sycl::read_only}; + auto ka = d_a->template get_access(cgh); + auto kb = d_b->template get_access(cgh); + auto ksum = d_sum->template get_access(cgh); - cgh.parallel_for(sycl::range<1>{array_size}, - // Reduction object, to perform summation - initialises the result to zero - sycl::reduction(d_sum, cgh, std::plus(), sycl::property::reduction::initialize_to_identity), - [=](sycl::id<1> idx, auto& sum) + auto wg_sum = accessor(range<1>(dot_wgsize), cgh); + + size_t N = array_size; + cgh.parallel_for(nd_range<1>(dot_num_groups*dot_wgsize, dot_wgsize), [=](nd_item<1> item) + { + size_t i = item.get_global_id(0); + size_t li = item.get_local_id(0); + size_t global_size = item.get_global_range()[0]; + + wg_sum[li] = 0.0; + for (; i < N; i += global_size) + wg_sum[li] += ka[i] * kb[i]; + + size_t local_size = item.get_local_range()[0]; + for (int offset = local_size / 2; offset > 0; offset /= 2) { - sum += ka[idx] * kb[idx]; - }); + item.barrier(cl::sycl::access::fence_space::local_space); + if (li < offset) + wg_sum[li] += wg_sum[li + offset]; + } + if (li == 0) + ksum[item.get_group(0)] = wg_sum[0]; + }); }); - // Get access on the host, and return a copy of the data (single number) - // This will block until the result is available, so no need to wait on the queue. - sycl::host_accessor result {d_sum, sycl::read_only}; - return result[0]; + T sum = 0.0; + auto h_sum = d_sum->template get_access(); + for (int i = 0; i < dot_num_groups; i++) + { + sum += h_sum[i]; + } + return sum; } template void SYCLStream::init_arrays(T initA, T initB, T initC) { - queue->submit([&](sycl::handler &cgh) + queue->submit([&](handler &cgh) { - sycl::accessor ka {d_a, cgh, sycl::write_only, sycl::no_init}; - sycl::accessor kb {d_b, cgh, sycl::write_only, sycl::no_init}; - sycl::accessor kc {d_c, cgh, sycl::write_only, sycl::no_init}; - - cgh.parallel_for(sycl::range<1>{array_size}, [=](sycl::id<1> idx) + auto ka = d_a->template get_access(cgh); + auto kb = d_b->template get_access(cgh); + auto kc = d_c->template get_access(cgh); + cgh.parallel_for(range<1>{array_size}, [=](item<1> item) { - ka[idx] = initA; - kb[idx] = initB; - kc[idx] = initC; + auto id = item.get_id(0); + ka[id] = initA; + kb[id] = initB; + kc[id] = initC; }); }); - queue->wait(); } template void SYCLStream::read_arrays(std::vector& a, std::vector& b, std::vector& c) { - sycl::host_accessor _a {d_a, sycl::read_only}; - sycl::host_accessor _b {d_b, sycl::read_only}; - sycl::host_accessor _c {d_c, sycl::read_only}; + auto _a = d_a->template get_access(); + auto _b = d_b->template get_access(); + auto _c = d_c->template get_access(); for (int i = 0; i < array_size; i++) { a[i] = _a[i]; @@ -217,7 +254,7 @@ void SYCLStream::read_arrays(std::vector& a, std::vector& b, std::vecto void getDeviceList(void) { // Ask SYCL runtime for all devices in system - devices = sycl::device::get_devices(); + devices = cl::sycl::device::get_devices(); cached = true; } @@ -251,7 +288,7 @@ std::string getDeviceName(const int device) if (device < devices.size()) { - name = devices[device].get_info(); + name = devices[device].get_info(); } else { @@ -270,7 +307,7 @@ std::string getDeviceDriver(const int device) if (device < devices.size()) { - driver = devices[device].get_info(); + driver = devices[device].get_info(); } else { @@ -280,5 +317,6 @@ std::string getDeviceDriver(const int device) return driver; } +// TODO: Fix kernel names to allow multiple template specializations template class SYCLStream; template class SYCLStream; diff --git a/src/sycl/SYCLStream.h b/src/sycl/SYCLStream.h index 7481d16..d3fa18d 100644 --- a/src/sycl/SYCLStream.h +++ b/src/sycl/SYCLStream.h @@ -8,13 +8,22 @@ #pragma once #include -#include #include "Stream.h" +#include "CL/sycl.hpp" -#include +#define IMPLEMENTATION_STRING "SYCL" -#define IMPLEMENTATION_STRING "SYCL 2020" +namespace sycl_kernels +{ + template class init; + template class copy; + template class mul; + template class add; + template class triad; + template class nstream; + template class dot; +} template class SYCLStream : public Stream @@ -24,19 +33,29 @@ class SYCLStream : public Stream size_t array_size; // SYCL objects - // Queue is a pointer because we allow device selection - std::unique_ptr queue; + cl::sycl::queue *queue; + cl::sycl::buffer *d_a; + cl::sycl::buffer *d_b; + cl::sycl::buffer *d_c; + cl::sycl::buffer *d_sum; - // Buffers - sycl::buffer d_a; - sycl::buffer d_b; - sycl::buffer d_c; - sycl::buffer d_sum; + // SYCL kernel names + typedef sycl_kernels::init init_kernel; + typedef sycl_kernels::copy copy_kernel; + typedef sycl_kernels::mul mul_kernel; + typedef sycl_kernels::add add_kernel; + typedef sycl_kernels::triad triad_kernel; + typedef sycl_kernels::nstream nstream_kernel; + typedef sycl_kernels::dot dot_kernel; + + // NDRange configuration for the dot kernel + size_t dot_num_groups; + size_t dot_wgsize; public: - SYCLStream(const size_t, const int); - ~SYCLStream() = default; + SYCLStream(const int, const int); + ~SYCLStream(); virtual void copy() override; virtual void add() override; diff --git a/src/sycl/SYCLStream2020.cpp b/src/sycl/SYCLStream2020.cpp new file mode 100644 index 0000000..76c0a8b --- /dev/null +++ b/src/sycl/SYCLStream2020.cpp @@ -0,0 +1,284 @@ + +// Copyright (c) 2015-16 Tom Deakin, Simon McIntosh-Smith, +// University of Bristol HPC +// +// For full license terms please see the LICENSE file distributed with this +// source code + +#include "SYCLStream.h" + +#include + +// Cache list of devices +bool cached = false; +std::vector devices; +void getDeviceList(void); + +template +SYCLStream::SYCLStream(const size_t ARRAY_SIZE, const int device_index) +: array_size {ARRAY_SIZE}, + d_a {ARRAY_SIZE}, + d_b {ARRAY_SIZE}, + d_c {ARRAY_SIZE}, + d_sum {1} +{ + if (!cached) + getDeviceList(); + + if (device_index >= devices.size()) + throw std::runtime_error("Invalid device index"); + + sycl::device dev = devices[device_index]; + + // Print out device information + std::cout << "Using SYCL device " << getDeviceName(device_index) << std::endl; + std::cout << "Driver: " << getDeviceDriver(device_index) << std::endl; + + // Check device can support FP64 if needed + if (sizeof(T) == sizeof(double)) + { + if (!dev.has(sycl::aspect::fp64)) + { + throw std::runtime_error("Device does not support double precision, please use --float"); + } + } + + queue = std::make_unique(dev, sycl::async_handler{[&](sycl::exception_list l) + { + bool error = false; + for(auto e: l) + { + try + { + std::rethrow_exception(e); + } + catch (sycl::exception e) + { + std::cout << e.what(); + error = true; + } + } + if(error) + { + throw std::runtime_error("SYCL errors detected"); + } + }}); + + // No longer need list of devices + devices.clear(); + cached = true; + + +} + + +template +void SYCLStream::copy() +{ + queue->submit([&](sycl::handler &cgh) + { + sycl::accessor ka {d_a, cgh, sycl::read_only}; + sycl::accessor kc {d_c, cgh, sycl::write_only}; + cgh.parallel_for(sycl::range<1>{array_size}, [=](sycl::id<1> idx) + { + kc[idx] = ka[idx]; + }); + }); + queue->wait(); +} + +template +void SYCLStream::mul() +{ + const T scalar = startScalar; + queue->submit([&](sycl::handler &cgh) + { + sycl::accessor kb {d_b, cgh, sycl::write_only}; + sycl::accessor kc {d_c, cgh, sycl::read_only}; + cgh.parallel_for(sycl::range<1>{array_size}, [=](sycl::id<1> idx) + { + kb[idx] = scalar * kc[idx]; + }); + }); + queue->wait(); +} + +template +void SYCLStream::add() +{ + queue->submit([&](sycl::handler &cgh) + { + sycl::accessor ka {d_a, cgh, sycl::read_only}; + sycl::accessor kb {d_b, cgh, sycl::read_only}; + sycl::accessor kc {d_c, cgh, sycl::write_only}; + cgh.parallel_for(sycl::range<1>{array_size}, [=](sycl::id<1> idx) + { + kc[idx] = ka[idx] + kb[idx]; + }); + }); + queue->wait(); +} + +template +void SYCLStream::triad() +{ + const T scalar = startScalar; + queue->submit([&](sycl::handler &cgh) + { + sycl::accessor ka {d_a, cgh, sycl::write_only}; + sycl::accessor kb {d_b, cgh, sycl::read_only}; + sycl::accessor kc {d_c, cgh, sycl::read_only}; + cgh.parallel_for(sycl::range<1>{array_size}, [=](sycl::id<1> idx) + { + ka[idx] = kb[idx] + scalar * kc[idx]; + }); + }); + queue->wait(); +} + +template +void SYCLStream::nstream() +{ + const T scalar = startScalar; + + queue->submit([&](sycl::handler &cgh) + { + sycl::accessor ka {d_a, cgh}; + sycl::accessor kb {d_b, cgh, sycl::read_only}; + sycl::accessor kc {d_c, cgh, sycl::read_only}; + cgh.parallel_for(sycl::range<1>{array_size}, [=](sycl::id<1> idx) + { + ka[idx] += kb[idx] + scalar * kc[idx]; + }); + }); + queue->wait(); +} + +template +T SYCLStream::dot() +{ + + queue->submit([&](sycl::handler &cgh) + { + sycl::accessor ka {d_a, cgh, sycl::read_only}; + sycl::accessor kb {d_b, cgh, sycl::read_only}; + + cgh.parallel_for(sycl::range<1>{array_size}, + // Reduction object, to perform summation - initialises the result to zero + sycl::reduction(d_sum, cgh, std::plus(), sycl::property::reduction::initialize_to_identity), + [=](sycl::id<1> idx, auto& sum) + { + sum += ka[idx] * kb[idx]; + }); + + }); + + // Get access on the host, and return a copy of the data (single number) + // This will block until the result is available, so no need to wait on the queue. + sycl::host_accessor result {d_sum, sycl::read_only}; + return result[0]; + +} + +template +void SYCLStream::init_arrays(T initA, T initB, T initC) +{ + queue->submit([&](sycl::handler &cgh) + { + sycl::accessor ka {d_a, cgh, sycl::write_only, sycl::no_init}; + sycl::accessor kb {d_b, cgh, sycl::write_only, sycl::no_init}; + sycl::accessor kc {d_c, cgh, sycl::write_only, sycl::no_init}; + + cgh.parallel_for(sycl::range<1>{array_size}, [=](sycl::id<1> idx) + { + ka[idx] = initA; + kb[idx] = initB; + kc[idx] = initC; + }); + }); + + queue->wait(); +} + +template +void SYCLStream::read_arrays(std::vector& a, std::vector& b, std::vector& c) +{ + sycl::host_accessor _a {d_a, sycl::read_only}; + sycl::host_accessor _b {d_b, sycl::read_only}; + sycl::host_accessor _c {d_c, sycl::read_only}; + for (int i = 0; i < array_size; i++) + { + a[i] = _a[i]; + b[i] = _b[i]; + c[i] = _c[i]; + } +} + +void getDeviceList(void) +{ + // Ask SYCL runtime for all devices in system + devices = sycl::device::get_devices(); + cached = true; +} + +void listDevices(void) +{ + getDeviceList(); + + // Print device names + if (devices.size() == 0) + { + std::cerr << "No devices found." << std::endl; + } + else + { + std::cout << std::endl; + std::cout << "Devices:" << std::endl; + for (int i = 0; i < devices.size(); i++) + { + std::cout << i << ": " << getDeviceName(i) << std::endl; + } + std::cout << std::endl; + } +} + +std::string getDeviceName(const int device) +{ + if (!cached) + getDeviceList(); + + std::string name; + + if (device < devices.size()) + { + name = devices[device].get_info(); + } + else + { + throw std::runtime_error("Error asking for name for non-existant device"); + } + + return name; +} + +std::string getDeviceDriver(const int device) +{ + if (!cached) + getDeviceList(); + + std::string driver; + + if (device < devices.size()) + { + driver = devices[device].get_info(); + } + else + { + throw std::runtime_error("Error asking for driver for non-existant device"); + } + + return driver; +} + +template class SYCLStream; +template class SYCLStream; diff --git a/src/sycl/SYCLStream2020.h b/src/sycl/SYCLStream2020.h new file mode 100644 index 0000000..7481d16 --- /dev/null +++ b/src/sycl/SYCLStream2020.h @@ -0,0 +1,54 @@ + +// Copyright (c) 2015-16 Tom Deakin, Simon McIntosh-Smith, +// University of Bristol HPC +// +// For full license terms please see the LICENSE file distributed with this +// source code + +#pragma once + +#include +#include + +#include "Stream.h" + +#include + +#define IMPLEMENTATION_STRING "SYCL 2020" + +template +class SYCLStream : public Stream +{ + protected: + // Size of arrays + size_t array_size; + + // SYCL objects + // Queue is a pointer because we allow device selection + std::unique_ptr queue; + + // Buffers + sycl::buffer d_a; + sycl::buffer d_b; + sycl::buffer d_c; + sycl::buffer d_sum; + + public: + + SYCLStream(const size_t, const int); + ~SYCLStream() = default; + + virtual void copy() override; + virtual void add() override; + virtual void mul() override; + virtual void triad() override; + virtual void nstream() override; + virtual T dot() override; + + virtual void init_arrays(T initA, T initB, T initC) override; + virtual void read_arrays(std::vector& a, std::vector& b, std::vector& c) override; + +}; + +// Populate the devices list +void getDeviceList(void);