diff --git a/ACCStream.cpp b/ACCStream.cpp new file mode 100644 index 0000000..85bf600 --- /dev/null +++ b/ACCStream.cpp @@ -0,0 +1,136 @@ + +#include "ACCStream.h" + +template +ACCStream::ACCStream(const unsigned int ARRAY_SIZE, T *a, T *b, T *c, int device) +{ + + acc_set_device_num(device, acc_device_nvidia); + + array_size = ARRAY_SIZE; + + // Set up data region on device + this->a = a; + this->b = b; + this->c = c; + #pragma acc enter data create(a[0:array_size], b[0:array_size], c[0:array_size]) + {} +} + +template +ACCStream::~ACCStream() +{ + // End data region on device + unsigned int array_size = this->array_size; + T *a = this->a; + T *b = this->b; + T *c = this->c; + #pragma acc exit data delete(a[0:array_size], b[0:array_size], c[0:array_size]) + {} +} + +template +void ACCStream::write_arrays(const std::vector& h_a, const std::vector& h_b, const std::vector& h_c) +{ + T *a = this->a; + T *b = this->b; + T *c = this->c; + #pragma acc update device(a[0:array_size], b[0:array_size], c[0:array_size]) + {} +} + +template +void ACCStream::read_arrays(std::vector& h_a, std::vector& h_b, std::vector& h_c) +{ + T *a = this->a; + T *b = this->b; + T *c = this->c; + #pragma acc update host(a[0:array_size], b[0:array_size], c[0:array_size]) + {} +} + +template +void ACCStream::copy() +{ + unsigned int array_size = this->array_size; + T *a = this->a; + T *c = this->c; + #pragma acc kernels present(a[0:array_size], c[0:array_size]) wait + for (int i = 0; i < array_size; i++) + { + c[i] = a[i]; + } +} + +template +void ACCStream::mul() +{ + const T scalar = 3.0; + + unsigned int array_size = this->array_size; + T *b = this->b; + T *c = this->c; + #pragma acc kernels present(b[0:array_size], c[0:array_size]) wait + for (int i = 0; i < array_size; i++) + { + b[i] = scalar * c[i]; + } +} + +template +void ACCStream::add() +{ + unsigned int array_size = this->array_size; + T *a = this->a; + T *b = this->b; + T *c = this->c; + #pragma acc kernels present(a[0:array_size], b[0:array_size], c[0:array_size]) wait + for (int i = 0; i < array_size; i++) + { + c[i] = a[i] + b[i]; + } +} + +template +void ACCStream::triad() +{ + const T scalar = 3.0; + + unsigned int array_size = this->array_size; + T *a = this->a; + T *b = this->b; + T *c = this->c; + #pragma acc kernels present(a[0:array_size], b[0:array_size], c[0:array_size]) wait + for (int i = 0; i < array_size; i++) + { + a[i] = b[i] + scalar * c[i]; + } +} +void listDevices(void) +{ + // Get number of devices + int count = acc_get_num_devices(acc_device_nvidia); + + // Print device list + if (count == 0) + { + std::cerr << "No devices found." << std::endl; + } + else + { + std::cout << "There are " << count << " devices." << std::endl; + } +} + +std::string getDeviceName(const int) +{ + return std::string("Device name unavailable"); +} + +std::string getDeviceDriver(const int) +{ + return std::string("Device driver unavailable"); +} +template class ACCStream; +template class ACCStream; + diff --git a/ACCStream.h b/ACCStream.h new file mode 100644 index 0000000..d6e5728 --- /dev/null +++ b/ACCStream.h @@ -0,0 +1,39 @@ + +#pragma once + +#include +#include + +#include "Stream.h" + +#include + +#define IMPLEMENTATION_STRING "OpenACC" + +template +class ACCStream : public Stream +{ + protected: + // Size of arrays + unsigned int array_size; + // Device side pointers + T *a; + T *b; + T *c; + + public: + ACCStream(const unsigned int, T*, T*, T*, int); + ~ACCStream(); + + virtual void copy() override; + virtual void add() override; + virtual void mul() override; + virtual void triad() override; + + virtual void write_arrays(const std::vector& a, const std::vector& b, const std::vector& c) override; + virtual void read_arrays(std::vector& a, std::vector& b, std::vector& c) override; + + + +}; + diff --git a/CMakeLists.txt b/CMakeLists.txt index 9dda6cd..71a95d6 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -9,12 +9,11 @@ project(gpu-stream) set(gpu-stream_VERSION_MAJOR 2) set(gpu-stream_VERSION_MINOR 0) -list(APPEND CMAKE_CXX_FLAGS --std=c++11) - configure_file(common.h.in common.h) find_package(CUDA 7.0 QUIET) if (${CUDA_FOUND}) + list(APPEND CUDA_NVCC_FLAGS --std=c++11) cuda_add_executable(gpu-stream-cuda main.cpp CUDAStream.cu) target_compile_definitions(gpu-stream-cuda PUBLIC CUDA) else (${CUDA_FOUND}) @@ -30,3 +29,20 @@ else (${OpenCL_FOUND}) message("Skipping OpenCL...") endif (${OpenCL_FOUND}) +# TODO: Find OpenACC implementations somehow +if (true) + add_executable(gpu-stream-acc main.cpp ACCStream.cpp) + target_compile_definitions(gpu-stream-acc PUBLIC ACC) + target_compile_options(gpu-stream-acc PUBLIC "-hstd=c++11") +else () + message("Skipping OpenACC...") +endif () + +# TODO: Find SYCL implementations somehow +if (true) + add_executable(gpu-stream-sycl main.cpp SYCLStream.cpp) + target_compile_definitions(gpu-stream-sycl PUBLIC SYCL) + set_property(TARGET gpu-stream-sycl PROPERTY CXX_STANDARD 14) +else () + message("Skipping SYCL...") +endif () diff --git a/OMP3Stream.cpp b/OMP3Stream.cpp new file mode 100644 index 0000000..8899cff --- /dev/null +++ b/OMP3Stream.cpp @@ -0,0 +1,106 @@ + +#include "OMP3Stream.h" + +template +OMP3Stream::OMP3Stream(const unsigned int ARRAY_SIZE, T *a, T *b, T *c) +{ + array_size = ARRAY_SIZE; + this->a = (T*)malloc(sizeof(T)*array_size); + this->b = (T*)malloc(sizeof(T)*array_size); + this->c = (T*)malloc(sizeof(T)*array_size); +} + +template +OMP3Stream::~OMP3Stream() +{ + free(a); + free(b); + free(c); +} + + +template +void OMP3Stream::write_arrays(const std::vector& h_a, const std::vector& h_b, const std::vector& h_c) +{ + #pragma omp parallel for + for (int i = 0; i < array_size; i++) + { + a[i] = h_a[i]; + b[i] = h_b[i]; + c[i] = h_c[i]; + } +} + +template +void OMP3Stream::read_arrays(std::vector& h_a, std::vector& h_b, std::vector& h_c) +{ + #pragma omp parallel for + for (int i = 0; i < array_size; i++) + { + h_a[i] = a[i]; + h_b[i] = b[i]; + h_c[i] = c[i]; + } +} + +template +void OMP3Stream::copy() +{ + #pragma omp parallel for + for (int i = 0; i < array_size; i++) + { + c[i] = a[i]; + } +} + +template +void OMP3Stream::mul() +{ + const T scalar = 3.0; + #pragma omp parallel for + for (int i = 0; i < array_size; i++) + { + b[i] = scalar * c[i]; + } +} + +template +void OMP3Stream::add() +{ + #pragma omp parallel for + for (int i = 0; i < array_size; i++) + { + c[i] = a[i] + b[i]; + } +} + +template +void OMP3Stream::triad() +{ + const T scalar = 3.0; + #pragma omp parallel for + for (int i = 0; i < array_size; i++) + { + a[i] = b[i] + scalar * c[i]; + } +} + +void listDevices(void) +{ + std::cout << "0: CPU" << std::endl; +} + +std::string getDeviceName(const int) +{ + return std::string("Device name unavailable"); +} + +std::string getDeviceDriver(const int) +{ + return std::string("Device driver unavailable"); +} + + +template class OMP3Stream; +template class OMP3Stream; + diff --git a/OMP3Stream.h b/OMP3Stream.h new file mode 100644 index 0000000..15172c3 --- /dev/null +++ b/OMP3Stream.h @@ -0,0 +1,35 @@ + +#pragma once + +#include +#include + +#include "Stream.h" + +#define IMPLEMENTATION_STRING "Reference OpenMP" + +template +class OMP3Stream : public Stream +{ + protected: + // Size of arrays + unsigned int array_size; + // Device side pointers + T *a; + T *b; + T *c; + + public: + OMP3Stream(const unsigned int, T*, T*, T*); + ~OMP3Stream(); + + virtual void copy() override; + virtual void add() override; + virtual void mul() override; + virtual void triad() override; + + virtual void write_arrays(const std::vector& a, const std::vector& b, const std::vector& c) override; + virtual void read_arrays(std::vector& a, std::vector& b, std::vector& c) override; + +}; + diff --git a/SYCLStream.cpp b/SYCLStream.cpp new file mode 100644 index 0000000..1314aa4 --- /dev/null +++ b/SYCLStream.cpp @@ -0,0 +1,144 @@ + +// Copyright (c) 2015-16 Tom Deakin, Simon McIntosh-Smith, +// University of Bristol HPC +// +// For full license terms please see the LICENSE file distributed with this +// source code + +#include "SYCLStream.h" + +#include + +using namespace cl::sycl; + +template +SYCLStream::SYCLStream(const unsigned int ARRAY_SIZE, const int device_index) +{ + array_size = ARRAY_SIZE; + + // Create buffers + d_a = buffer(array_size); + d_b = buffer(array_size); + d_c = buffer(array_size); +} + +template +SYCLStream::~SYCLStream() +{ +} + +template +void SYCLStream::copy() +{ + queue.submit([&](handler &cgh) + { + auto ka = d_a.template get_access(cgh); + auto kc = d_c.template get_access(cgh); + cgh.parallel_for(range<1>{array_size}, [=](id<1> index) + { + kc[index] = ka[index]; + }); + }); + queue.wait(); +} + +template +void SYCLStream::mul() +{ + const T scalar = 3.0; + queue.submit([&](handler &cgh) + { + auto kb = d_b.template get_access(cgh); + auto kc = d_c.template get_access(cgh); + cgh.parallel_for(range<1>{array_size}, [=](id<1> index) + { + kb[index] = scalar * kc[index]; + }); + }); + queue.wait(); +} + +template +void SYCLStream::add() +{ + queue.submit([&](handler &cgh) + { + auto ka = d_a.template get_access(cgh); + auto kb = d_b.template get_access(cgh); + auto kc = d_c.template get_access(cgh); + cgh.parallel_for(range<1>{array_size}, [=](id<1> index) + { + kc[index] = ka[index] + kb[index]; + }); + }); + queue.wait(); +} + +template +void SYCLStream::triad() +{ + const T scalar = 3.0; + queue.submit([&](handler &cgh) + { + auto ka = d_a.template get_access(cgh); + auto kb = d_b.template get_access(cgh); + auto kc = d_c.template get_access(cgh); + cgh.parallel_for(range<1>{array_size}, [=](id<1> index){ + ka[index] = kb[index] + scalar * kc[index]; + }); + }); + queue.wait(); +} + +template +void SYCLStream::write_arrays(const std::vector& a, const std::vector& b, const std::vector& c) +{ + auto _a = d_a.template get_access(); + auto _b = d_b.template get_access(); + auto _c = d_c.template get_access(); + for (int i = 0; i < array_size; i++) + { + _a[i] = a[i]; + _b[i] = b[i]; + _c[i] = c[i]; + } +} + +template +void SYCLStream::read_arrays(std::vector& a, std::vector& b, std::vector& c) +{ + auto _a = d_a.template get_access(); + auto _b = d_b.template get_access(); + auto _c = d_c.template get_access(); + for (int i = 0; i < array_size; i++) + { + a[i] = _a[i]; + b[i] = _b[i]; + c[i] = _c[i]; + } +} + +void listDevices(void) +{ + // TODO: Get actual list of devices + std::cout << std::endl; + std::cout << "Devices:" << std::endl; + std::cout << "0: " << "triSYCL" << std::endl; + std::cout << std::endl; +} + +std::string getDeviceName(const int device) +{ + // TODO: Implement properly + return "triSYCL"; +} + +std::string getDeviceDriver(const int device) +{ + // TODO: Implement properly + return "triSCYL"; +} + + +template class SYCLStream; +template class SYCLStream; diff --git a/SYCLStream.h b/SYCLStream.h new file mode 100644 index 0000000..e10535e --- /dev/null +++ b/SYCLStream.h @@ -0,0 +1,45 @@ + +// Copyright (c) 2015-16 Tom Deakin, Simon McIntosh-Smith, +// University of Bristol HPC +// +// For full license terms please see the LICENSE file distributed with this +// source code + +#pragma once + +#include "Stream.h" + +#include "CL/sycl.hpp" + +#define IMPLEMENTATION_STRING "SYCL" + +template +class SYCLStream : public Stream +{ + protected: + // Size of arrays + unsigned int array_size; + + // SYCL objects + cl::sycl::queue queue; + cl::sycl::buffer d_a; + cl::sycl::buffer d_b; + cl::sycl::buffer d_c; + + public: + + SYCLStream(const unsigned int, const int); + ~SYCLStream(); + + virtual void copy() override; + virtual void add() override; + virtual void mul() override; + virtual void triad() override; + + virtual void write_arrays(const std::vector& a, const std::vector& b, const std::vector& c) override; + virtual void read_arrays(std::vector& a, std::vector& b, std::vector& c) override; + +}; + +// Populate the devices list +void getDeviceList(void); diff --git a/main.cpp b/main.cpp index 1a92ebc..5d3a559 100644 --- a/main.cpp +++ b/main.cpp @@ -26,6 +26,12 @@ #include "RAJAStream.hpp" #elif defined(KOKKOS) #include "KOKKOSStream.hpp" +#elif defined(ACC) +#include "ACCStream.h" +#elif defined(SYCL) +#include "SYCLStream.h" +#elif defined(OMP3) +#include "OMP3Stream.h" #endif unsigned int ARRAY_SIZE = 52428800; @@ -91,6 +97,18 @@ void run() // Use the Kokkos implementation stream = new KOKKOSStream(ARRAY_SIZE, deviceIndex); +#elif defined(ACC) + // Use the OpenACC implementation + stream = new ACCStream(ARRAY_SIZE, a.data(), b.data(), c.data(), deviceIndex); + +#elif defined(SYCL) + // Use the SYCL implementation + stream = new SYCLStream(ARRAY_SIZE, deviceIndex); + +#elif defined(OMP3) + // Use the "reference" OpenMP 3 implementation + stream = new OMP3Stream(ARRAY_SIZE, a.data(), b.data(), c.data()); + #endif stream->write_arrays(a, b, c); @@ -289,4 +307,3 @@ void parseArguments(int argc, char *argv[]) } } } -