From 1a38b189542a97aff9e0b062e803ec5838364255 Mon Sep 17 00:00:00 2001 From: Tom Deakin Date: Tue, 3 May 2016 14:36:08 +0100 Subject: [PATCH 1/8] Add OpenACC version --- ACCStream.cpp | 113 ++++++++++++++++++++++++++++++++++++++++++++++++++ ACCStream.h | 39 +++++++++++++++++ main.cpp | 7 ++++ 3 files changed, 159 insertions(+) create mode 100644 ACCStream.cpp create mode 100644 ACCStream.h diff --git a/ACCStream.cpp b/ACCStream.cpp new file mode 100644 index 0000000..ca5ef7b --- /dev/null +++ b/ACCStream.cpp @@ -0,0 +1,113 @@ + +#include "ACCStream.h" + +template +ACCStream::ACCStream(const unsigned int ARRAY_SIZE, T *a, T *b, T *c) +{ + array_size = ARRAY_SIZE; + + // Set up data region on device + this->a = a; + this->b = b; + this->c = c; + #pragma acc enter data create(a[0:array_size], b[0:array_size], c[0:array_size]) + {} +} + +template +ACCStream::~ACCStream() +{ + // End data region on device + unsigned int array_size = this->array_size; + T *a = this->a; + T *b = this->b; + T *c = this->c; + #pragma acc exit data delete(a[0:array_size], b[0:array_size], c[0:array_size]) + {} +} + +template +void ACCStream::write_arrays(const std::vector& h_a, const std::vector& h_b, const std::vector& h_c) +{ + T *a = this->a; + T *b = this->b; + T *c = this->c; + #pragma acc update device(a[0:array_size], b[0:array_size], c[0:array_size]) + {} +} + +template +void ACCStream::read_arrays(std::vector& h_a, std::vector& h_b, std::vector& h_c) +{ + T *a = this->a; + T *b = this->b; + T *c = this->c; + #pragma acc update host(a[0:array_size], b[0:array_size], c[0:array_size]) + {} +} + +template +void ACCStream::copy() +{ + unsigned int array_size = this->array_size; + T *a = this->a; + T *c = this->c; + #pragma acc kernels present(a[0:array_size], c[0:array_size]) wait + for (int i = 0; i < array_size; i++) + { + c[i] = a[i]; + } +} + +template +void ACCStream::mul() +{ + const T scalar = 3.0; + + unsigned int array_size = this->array_size; + T *b = this->b; + T *c = this->c; + #pragma acc kernels present(b[0:array_size], c[0:array_size]) wait + for (int i = 0; i < array_size; i++) + { + b[i] = scalar * c[i]; + } +} + +template +void ACCStream::add() +{ + unsigned int array_size = this->array_size; + T *a = this->a; + T *b = this->b; + T *c = this->c; + #pragma acc kernels present(a[0:array_size], b[0:array_size], c[0:array_size]) wait + for (int i = 0; i < array_size; i++) + { + c[i] = a[i] + b[i]; + } +} + +template +void ACCStream::triad() +{ + const T scalar = 3.0; + + unsigned int array_size = this->array_size; + T *a = this->a; + T *b = this->b; + T *c = this->c; + #pragma acc kernels present(a[0:array_size], b[0:array_size], c[0:array_size]) wait + for (int i = 0; i < array_size; i++) + { + a[i] = b[i] + scalar * c[i]; + } +} +void listDevices(void) +{ + // Get number of devices +} + +template class ACCStream; +template class ACCStream; + diff --git a/ACCStream.h b/ACCStream.h new file mode 100644 index 0000000..4c69986 --- /dev/null +++ b/ACCStream.h @@ -0,0 +1,39 @@ + +#pragma once + +#include +#include + +#include "Stream.h" + +#include + +#define IMPLEMENTATION_STRING "OpenACC" + +template +class ACCStream : public Stream +{ + protected: + // Size of arrays + unsigned int array_size; + // Device side pointers + T *a; + T *b; + T *c; + + public: + ACCStream(const unsigned int, T*, T*, T*); + ~ACCStream(); + + virtual void copy() override; + virtual void add() override; + virtual void mul() override; + virtual void triad() override; + + virtual void write_arrays(const std::vector& a, const std::vector& b, const std::vector& c) override; + virtual void read_arrays(std::vector& a, std::vector& b, std::vector& c) override; + + + +}; + diff --git a/main.cpp b/main.cpp index 96f4e5c..209fc98 100644 --- a/main.cpp +++ b/main.cpp @@ -22,6 +22,9 @@ #include "CUDAStream.h" #elif defined(OCL) #include "OCLStream.h" +#elif defined(ACC) +#include "ACCStream.h" + #endif @@ -80,6 +83,10 @@ void run() // Use the OpenCL implementation stream = new OCLStream(ARRAY_SIZE, deviceIndex); +#elif defined(ACC) + // Use the OpenACC implementation + stream = new ACCStream(ARRAY_SIZE, a.data(), b.data(), c.data()); + #endif stream->write_arrays(a, b, c); From da4f91878834f3e2e3b907003eefcb77538e0f9b Mon Sep 17 00:00:00 2001 From: James Price Date: Tue, 3 May 2016 14:45:13 +0100 Subject: [PATCH 2/8] Add initial SYCL implementation --- CMakeLists.txt | 8 ++++++++ main.cpp | 8 ++++++-- 2 files changed, 14 insertions(+), 2 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 9dda6cd..d43a22a 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -30,3 +30,11 @@ else (${OpenCL_FOUND}) message("Skipping OpenCL...") endif (${OpenCL_FOUND}) +# TODO: Find SYCL implementations somehow +if (true) + add_executable(gpu-stream-sycl main.cpp SYCLStream.cpp) + target_compile_definitions(gpu-stream-sycl PUBLIC SYCL) + set_property(TARGET gpu-stream-sycl PROPERTY CXX_STANDARD 14) +else () + message("Skipping SYCL...") +endif () diff --git a/main.cpp b/main.cpp index 209fc98..f12a7e1 100644 --- a/main.cpp +++ b/main.cpp @@ -24,7 +24,8 @@ #include "OCLStream.h" #elif defined(ACC) #include "ACCStream.h" - +#elif defined(SYCL) +#include "SYCLStream.h" #endif @@ -87,6 +88,10 @@ void run() // Use the OpenACC implementation stream = new ACCStream(ARRAY_SIZE, a.data(), b.data(), c.data()); +#elif defined(SYCL) + // Use the SYCL implementation + stream = new SYCLStream(ARRAY_SIZE, deviceIndex); + #endif stream->write_arrays(a, b, c); @@ -285,4 +290,3 @@ void parseArguments(int argc, char *argv[]) } } } - From 40a0a6551dabbe7a810639275aa60ca9e64ae343 Mon Sep 17 00:00:00 2001 From: James Price Date: Tue, 3 May 2016 14:46:08 +0100 Subject: [PATCH 3/8] Remove extra -std=c++11 from CMake build --- CMakeLists.txt | 2 -- 1 file changed, 2 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index d43a22a..fd9a10d 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -9,8 +9,6 @@ project(gpu-stream) set(gpu-stream_VERSION_MAJOR 2) set(gpu-stream_VERSION_MINOR 0) -list(APPEND CMAKE_CXX_FLAGS --std=c++11) - configure_file(common.h.in common.h) find_package(CUDA 7.0 QUIET) From b45f311e0d4e6f3303e78c5327d86b59f9827dea Mon Sep 17 00:00:00 2001 From: James Price Date: Tue, 3 May 2016 14:48:35 +0100 Subject: [PATCH 4/8] Add missing SYCL source files --- SYCLStream.cpp | 144 +++++++++++++++++++++++++++++++++++++++++++++++++ SYCLStream.h | 45 ++++++++++++++++ 2 files changed, 189 insertions(+) create mode 100644 SYCLStream.cpp create mode 100644 SYCLStream.h diff --git a/SYCLStream.cpp b/SYCLStream.cpp new file mode 100644 index 0000000..1314aa4 --- /dev/null +++ b/SYCLStream.cpp @@ -0,0 +1,144 @@ + +// Copyright (c) 2015-16 Tom Deakin, Simon McIntosh-Smith, +// University of Bristol HPC +// +// For full license terms please see the LICENSE file distributed with this +// source code + +#include "SYCLStream.h" + +#include + +using namespace cl::sycl; + +template +SYCLStream::SYCLStream(const unsigned int ARRAY_SIZE, const int device_index) +{ + array_size = ARRAY_SIZE; + + // Create buffers + d_a = buffer(array_size); + d_b = buffer(array_size); + d_c = buffer(array_size); +} + +template +SYCLStream::~SYCLStream() +{ +} + +template +void SYCLStream::copy() +{ + queue.submit([&](handler &cgh) + { + auto ka = d_a.template get_access(cgh); + auto kc = d_c.template get_access(cgh); + cgh.parallel_for(range<1>{array_size}, [=](id<1> index) + { + kc[index] = ka[index]; + }); + }); + queue.wait(); +} + +template +void SYCLStream::mul() +{ + const T scalar = 3.0; + queue.submit([&](handler &cgh) + { + auto kb = d_b.template get_access(cgh); + auto kc = d_c.template get_access(cgh); + cgh.parallel_for(range<1>{array_size}, [=](id<1> index) + { + kb[index] = scalar * kc[index]; + }); + }); + queue.wait(); +} + +template +void SYCLStream::add() +{ + queue.submit([&](handler &cgh) + { + auto ka = d_a.template get_access(cgh); + auto kb = d_b.template get_access(cgh); + auto kc = d_c.template get_access(cgh); + cgh.parallel_for(range<1>{array_size}, [=](id<1> index) + { + kc[index] = ka[index] + kb[index]; + }); + }); + queue.wait(); +} + +template +void SYCLStream::triad() +{ + const T scalar = 3.0; + queue.submit([&](handler &cgh) + { + auto ka = d_a.template get_access(cgh); + auto kb = d_b.template get_access(cgh); + auto kc = d_c.template get_access(cgh); + cgh.parallel_for(range<1>{array_size}, [=](id<1> index){ + ka[index] = kb[index] + scalar * kc[index]; + }); + }); + queue.wait(); +} + +template +void SYCLStream::write_arrays(const std::vector& a, const std::vector& b, const std::vector& c) +{ + auto _a = d_a.template get_access(); + auto _b = d_b.template get_access(); + auto _c = d_c.template get_access(); + for (int i = 0; i < array_size; i++) + { + _a[i] = a[i]; + _b[i] = b[i]; + _c[i] = c[i]; + } +} + +template +void SYCLStream::read_arrays(std::vector& a, std::vector& b, std::vector& c) +{ + auto _a = d_a.template get_access(); + auto _b = d_b.template get_access(); + auto _c = d_c.template get_access(); + for (int i = 0; i < array_size; i++) + { + a[i] = _a[i]; + b[i] = _b[i]; + c[i] = _c[i]; + } +} + +void listDevices(void) +{ + // TODO: Get actual list of devices + std::cout << std::endl; + std::cout << "Devices:" << std::endl; + std::cout << "0: " << "triSYCL" << std::endl; + std::cout << std::endl; +} + +std::string getDeviceName(const int device) +{ + // TODO: Implement properly + return "triSYCL"; +} + +std::string getDeviceDriver(const int device) +{ + // TODO: Implement properly + return "triSCYL"; +} + + +template class SYCLStream; +template class SYCLStream; diff --git a/SYCLStream.h b/SYCLStream.h new file mode 100644 index 0000000..e10535e --- /dev/null +++ b/SYCLStream.h @@ -0,0 +1,45 @@ + +// Copyright (c) 2015-16 Tom Deakin, Simon McIntosh-Smith, +// University of Bristol HPC +// +// For full license terms please see the LICENSE file distributed with this +// source code + +#pragma once + +#include "Stream.h" + +#include "CL/sycl.hpp" + +#define IMPLEMENTATION_STRING "SYCL" + +template +class SYCLStream : public Stream +{ + protected: + // Size of arrays + unsigned int array_size; + + // SYCL objects + cl::sycl::queue queue; + cl::sycl::buffer d_a; + cl::sycl::buffer d_b; + cl::sycl::buffer d_c; + + public: + + SYCLStream(const unsigned int, const int); + ~SYCLStream(); + + virtual void copy() override; + virtual void add() override; + virtual void mul() override; + virtual void triad() override; + + virtual void write_arrays(const std::vector& a, const std::vector& b, const std::vector& c) override; + virtual void read_arrays(std::vector& a, std::vector& b, std::vector& c) override; + +}; + +// Populate the devices list +void getDeviceList(void); From 0b0de4e0c3c17079d9b9cac3ab64ac0bb09326ff Mon Sep 17 00:00:00 2001 From: Tom Deakin Date: Tue, 3 May 2016 14:50:09 +0100 Subject: [PATCH 5/8] Implement the OpenACC device string functions, and device selector --- ACCStream.cpp | 25 ++++++++++++++++++++++++- ACCStream.h | 2 +- main.cpp | 2 +- 3 files changed, 26 insertions(+), 3 deletions(-) diff --git a/ACCStream.cpp b/ACCStream.cpp index ca5ef7b..85bf600 100644 --- a/ACCStream.cpp +++ b/ACCStream.cpp @@ -2,8 +2,11 @@ #include "ACCStream.h" template -ACCStream::ACCStream(const unsigned int ARRAY_SIZE, T *a, T *b, T *c) +ACCStream::ACCStream(const unsigned int ARRAY_SIZE, T *a, T *b, T *c, int device) { + + acc_set_device_num(device, acc_device_nvidia); + array_size = ARRAY_SIZE; // Set up data region on device @@ -106,8 +109,28 @@ void ACCStream::triad() void listDevices(void) { // Get number of devices + int count = acc_get_num_devices(acc_device_nvidia); + + // Print device list + if (count == 0) + { + std::cerr << "No devices found." << std::endl; + } + else + { + std::cout << "There are " << count << " devices." << std::endl; + } } +std::string getDeviceName(const int) +{ + return std::string("Device name unavailable"); +} + +std::string getDeviceDriver(const int) +{ + return std::string("Device driver unavailable"); +} template class ACCStream; template class ACCStream; diff --git a/ACCStream.h b/ACCStream.h index 4c69986..d6e5728 100644 --- a/ACCStream.h +++ b/ACCStream.h @@ -22,7 +22,7 @@ class ACCStream : public Stream T *c; public: - ACCStream(const unsigned int, T*, T*, T*); + ACCStream(const unsigned int, T*, T*, T*, int); ~ACCStream(); virtual void copy() override; diff --git a/main.cpp b/main.cpp index f12a7e1..59411b8 100644 --- a/main.cpp +++ b/main.cpp @@ -86,7 +86,7 @@ void run() #elif defined(ACC) // Use the OpenACC implementation - stream = new ACCStream(ARRAY_SIZE, a.data(), b.data(), c.data()); + stream = new ACCStream(ARRAY_SIZE, a.data(), b.data(), c.data(), deviceIndex); #elif defined(SYCL) // Use the SYCL implementation From 31819b7778fd4fd7f99c3450b73e6b5f13630629 Mon Sep 17 00:00:00 2001 From: Tom Deakin Date: Tue, 3 May 2016 15:07:51 +0100 Subject: [PATCH 6/8] Add bones of OpenACC in CMake config --- CMakeLists.txt | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/CMakeLists.txt b/CMakeLists.txt index fd9a10d..1a15bc0 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -28,6 +28,15 @@ else (${OpenCL_FOUND}) message("Skipping OpenCL...") endif (${OpenCL_FOUND}) +# TODO: Find OpenACC implementations somehow +if (true) + add_executable(gpu-stream-acc main.cpp ACCStream.cpp) + target_compile_definitions(gpu-stream-acc PUBLIC ACC) + target_compile_options(gpu-stream-acc PUBLIC "-hstd=c++11") +else () + message("Skipping OpenACC...") +endif () + # TODO: Find SYCL implementations somehow if (true) add_executable(gpu-stream-sycl main.cpp SYCLStream.cpp) From 48cae0cbc1bd0bc667f87f43f19978214b81efe1 Mon Sep 17 00:00:00 2001 From: Tom Deakin Date: Tue, 3 May 2016 15:20:26 +0100 Subject: [PATCH 7/8] Make sure CUDA nvcc builds with C++11 --- CMakeLists.txt | 1 + 1 file changed, 1 insertion(+) diff --git a/CMakeLists.txt b/CMakeLists.txt index 1a15bc0..71a95d6 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -13,6 +13,7 @@ configure_file(common.h.in common.h) find_package(CUDA 7.0 QUIET) if (${CUDA_FOUND}) + list(APPEND CUDA_NVCC_FLAGS --std=c++11) cuda_add_executable(gpu-stream-cuda main.cpp CUDAStream.cu) target_compile_definitions(gpu-stream-cuda PUBLIC CUDA) else (${CUDA_FOUND}) From f0afa0c1e43c43729dd26cca719d1db21844d279 Mon Sep 17 00:00:00 2001 From: Tom Deakin Date: Wed, 4 May 2016 10:41:41 +0100 Subject: [PATCH 8/8] Add reference OpenMP 3.0 version --- OMP3Stream.cpp | 106 +++++++++++++++++++++++++++++++++++++++++++++++++ OMP3Stream.h | 35 ++++++++++++++++ main.cpp | 7 ++++ 3 files changed, 148 insertions(+) create mode 100644 OMP3Stream.cpp create mode 100644 OMP3Stream.h diff --git a/OMP3Stream.cpp b/OMP3Stream.cpp new file mode 100644 index 0000000..8899cff --- /dev/null +++ b/OMP3Stream.cpp @@ -0,0 +1,106 @@ + +#include "OMP3Stream.h" + +template +OMP3Stream::OMP3Stream(const unsigned int ARRAY_SIZE, T *a, T *b, T *c) +{ + array_size = ARRAY_SIZE; + this->a = (T*)malloc(sizeof(T)*array_size); + this->b = (T*)malloc(sizeof(T)*array_size); + this->c = (T*)malloc(sizeof(T)*array_size); +} + +template +OMP3Stream::~OMP3Stream() +{ + free(a); + free(b); + free(c); +} + + +template +void OMP3Stream::write_arrays(const std::vector& h_a, const std::vector& h_b, const std::vector& h_c) +{ + #pragma omp parallel for + for (int i = 0; i < array_size; i++) + { + a[i] = h_a[i]; + b[i] = h_b[i]; + c[i] = h_c[i]; + } +} + +template +void OMP3Stream::read_arrays(std::vector& h_a, std::vector& h_b, std::vector& h_c) +{ + #pragma omp parallel for + for (int i = 0; i < array_size; i++) + { + h_a[i] = a[i]; + h_b[i] = b[i]; + h_c[i] = c[i]; + } +} + +template +void OMP3Stream::copy() +{ + #pragma omp parallel for + for (int i = 0; i < array_size; i++) + { + c[i] = a[i]; + } +} + +template +void OMP3Stream::mul() +{ + const T scalar = 3.0; + #pragma omp parallel for + for (int i = 0; i < array_size; i++) + { + b[i] = scalar * c[i]; + } +} + +template +void OMP3Stream::add() +{ + #pragma omp parallel for + for (int i = 0; i < array_size; i++) + { + c[i] = a[i] + b[i]; + } +} + +template +void OMP3Stream::triad() +{ + const T scalar = 3.0; + #pragma omp parallel for + for (int i = 0; i < array_size; i++) + { + a[i] = b[i] + scalar * c[i]; + } +} + +void listDevices(void) +{ + std::cout << "0: CPU" << std::endl; +} + +std::string getDeviceName(const int) +{ + return std::string("Device name unavailable"); +} + +std::string getDeviceDriver(const int) +{ + return std::string("Device driver unavailable"); +} + + +template class OMP3Stream; +template class OMP3Stream; + diff --git a/OMP3Stream.h b/OMP3Stream.h new file mode 100644 index 0000000..15172c3 --- /dev/null +++ b/OMP3Stream.h @@ -0,0 +1,35 @@ + +#pragma once + +#include +#include + +#include "Stream.h" + +#define IMPLEMENTATION_STRING "Reference OpenMP" + +template +class OMP3Stream : public Stream +{ + protected: + // Size of arrays + unsigned int array_size; + // Device side pointers + T *a; + T *b; + T *c; + + public: + OMP3Stream(const unsigned int, T*, T*, T*); + ~OMP3Stream(); + + virtual void copy() override; + virtual void add() override; + virtual void mul() override; + virtual void triad() override; + + virtual void write_arrays(const std::vector& a, const std::vector& b, const std::vector& c) override; + virtual void read_arrays(std::vector& a, std::vector& b, std::vector& c) override; + +}; + diff --git a/main.cpp b/main.cpp index 59411b8..18b8c72 100644 --- a/main.cpp +++ b/main.cpp @@ -26,6 +26,8 @@ #include "ACCStream.h" #elif defined(SYCL) #include "SYCLStream.h" +#elif defined(OMP3) +#include "OMP3Stream.h" #endif @@ -92,6 +94,11 @@ void run() // Use the SYCL implementation stream = new SYCLStream(ARRAY_SIZE, deviceIndex); +#elif defined(OMP3) + // Use the "reference" OpenMP 3 implementation + stream = new OMP3Stream(ARRAY_SIZE, a.data(), b.data(), c.data()); + + #endif stream->write_arrays(a, b, c);