Merging in changes from trunk

2016-05-05 17:23:47 +01:00 · 2016-05-05 17:23:47 +01:00 · 0a738efa54
commit 0a738efa54
parent 7c28a6386b f0afa0c1e4
8 changed files with 541 additions and 3 deletions
--- a/ACCStream.cpp
+++ b/ACCStream.cpp
@ -0,0 +1,136 @@
+
+#include "ACCStream.h"
+
+template <class T>
+ACCStream<T>::ACCStream(const unsigned int ARRAY_SIZE, T *a, T *b, T *c, int device)
+{
+
+  acc_set_device_num(device, acc_device_nvidia);
+
+  array_size = ARRAY_SIZE;
+
+  // Set up data region on device
+  this->a = a;
+  this->b = b;
+  this->c = c;
+  #pragma acc enter data create(a[0:array_size], b[0:array_size], c[0:array_size])
+  {}
+}
+
+template <class T>
+ACCStream<T>::~ACCStream()
+{
+  // End data region on device
+  unsigned int array_size = this->array_size;
+  T *a = this->a;
+  T *b = this->b;
+  T *c = this->c;
+  #pragma acc exit data delete(a[0:array_size], b[0:array_size], c[0:array_size])
+  {}
+}
+
+template <class T>
+void ACCStream<T>::write_arrays(const std::vector<T>& h_a, const std::vector<T>& h_b, const std::vector<T>& h_c)
+{
+  T *a = this->a;
+  T *b = this->b;
+  T *c = this->c;
+  #pragma acc update device(a[0:array_size], b[0:array_size], c[0:array_size])
+  {}
+}
+
+template <class T>
+void ACCStream<T>::read_arrays(std::vector<T>& h_a, std::vector<T>& h_b, std::vector<T>& h_c)
+{
+  T *a = this->a;
+  T *b = this->b;
+  T *c = this->c;
+  #pragma acc update host(a[0:array_size], b[0:array_size], c[0:array_size])
+  {}
+}
+
+template <class T>
+void ACCStream<T>::copy()
+{
+  unsigned int array_size = this->array_size;
+  T *a = this->a;
+  T *c = this->c;
+  #pragma acc kernels present(a[0:array_size], c[0:array_size]) wait
+  for (int i = 0; i < array_size; i++)
+  {
+    c[i] = a[i];
+  }
+}
+
+template <class T>
+void ACCStream<T>::mul()
+{
+  const T scalar = 3.0;
+
+  unsigned int array_size = this->array_size;
+  T *b = this->b;
+  T *c = this->c;
+  #pragma acc kernels present(b[0:array_size], c[0:array_size]) wait
+  for (int i = 0; i < array_size; i++)
+  {
+    b[i] = scalar * c[i];
+  }
+}
+
+template <class T>
+void ACCStream<T>::add()
+{
+  unsigned int array_size = this->array_size;
+  T *a = this->a;
+  T *b = this->b;
+  T *c = this->c;
+  #pragma acc kernels present(a[0:array_size], b[0:array_size], c[0:array_size]) wait
+  for (int i = 0; i < array_size; i++)
+  {
+    c[i] = a[i] + b[i];
+  }
+}
+
+template <class T>
+void ACCStream<T>::triad()
+{
+  const T scalar = 3.0;
+
+  unsigned int array_size = this->array_size;
+  T *a = this->a;
+  T *b = this->b;
+  T *c = this->c;
+  #pragma acc kernels present(a[0:array_size], b[0:array_size], c[0:array_size]) wait
+  for (int i = 0; i < array_size; i++)
+  {
+    a[i] = b[i] + scalar * c[i];
+  }
+}
+void listDevices(void)
+{
+  // Get number of devices
+  int count = acc_get_num_devices(acc_device_nvidia);
+
+  // Print device list
+  if (count == 0)
+  {
+    std::cerr << "No devices found." << std::endl;
+  }
+  else
+  {
+    std::cout << "There are " << count << " devices." << std::endl;
+  }
+}
+
+std::string getDeviceName(const int)
+{
+  return std::string("Device name unavailable");
+}
+
+std::string getDeviceDriver(const int)
+{
+  return std::string("Device driver unavailable");
+}
+template class ACCStream<float>;
+template class ACCStream<double>;
+
--- a/ACCStream.h
+++ b/ACCStream.h
@ -0,0 +1,39 @@
+
+#pragma once
+
+#include <iostream>
+#include <stdexcept>
+
+#include "Stream.h"
+
+#include <openacc.h>
+
+#define IMPLEMENTATION_STRING "OpenACC"
+
+template <class T>
+class ACCStream : public Stream<T>
+{
+  protected:
+    // Size of arrays
+    unsigned int array_size;
+    // Device side pointers
+    T *a;
+    T *b;
+    T *c;
+
+  public:
+    ACCStream(const unsigned int, T*, T*, T*, int);
+    ~ACCStream();
+
+    virtual void copy() override;
+    virtual void add() override;
+    virtual void mul() override;
+    virtual void triad() override;
+
+    virtual void write_arrays(const std::vector<T>& a, const std::vector<T>& b, const std::vector<T>& c) override;
+    virtual void read_arrays(std::vector<T>& a, std::vector<T>& b, std::vector<T>& c) override;
+
+
+  
+};
+
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@ -9,12 +9,11 @@ project(gpu-stream)
 set(gpu-stream_VERSION_MAJOR 2)
 set(gpu-stream_VERSION_MINOR 0)

-list(APPEND CMAKE_CXX_FLAGS --std=c++11)
-
 configure_file(common.h.in common.h)

 find_package(CUDA 7.0 QUIET)
 if (${CUDA_FOUND})
+  list(APPEND CUDA_NVCC_FLAGS --std=c++11)
  cuda_add_executable(gpu-stream-cuda main.cpp CUDAStream.cu)
  target_compile_definitions(gpu-stream-cuda PUBLIC CUDA)
 else (${CUDA_FOUND})
@ -30,3 +29,20 @@ else (${OpenCL_FOUND})
  message("Skipping OpenCL...")
 endif (${OpenCL_FOUND})

+# TODO: Find OpenACC implementations somehow
+if (true)
+  add_executable(gpu-stream-acc main.cpp ACCStream.cpp)
+  target_compile_definitions(gpu-stream-acc PUBLIC ACC)
+  target_compile_options(gpu-stream-acc PUBLIC "-hstd=c++11")
+else ()
+  message("Skipping OpenACC...")
+endif ()
+
+# TODO: Find SYCL implementations somehow
+if (true)
+  add_executable(gpu-stream-sycl main.cpp SYCLStream.cpp)
+  target_compile_definitions(gpu-stream-sycl PUBLIC SYCL)
+  set_property(TARGET gpu-stream-sycl PROPERTY CXX_STANDARD 14)
+else ()
+  message("Skipping SYCL...")
+endif ()
--- a/OMP3Stream.cpp
+++ b/OMP3Stream.cpp
@ -0,0 +1,106 @@
+
+#include "OMP3Stream.h"
+
+template <class T>
+OMP3Stream<T>::OMP3Stream(const unsigned int ARRAY_SIZE, T *a, T *b, T *c)
+{
+  array_size = ARRAY_SIZE;
+  this->a = (T*)malloc(sizeof(T)*array_size);
+  this->b = (T*)malloc(sizeof(T)*array_size);
+  this->c = (T*)malloc(sizeof(T)*array_size);
+}
+
+template <class T>
+OMP3Stream<T>::~OMP3Stream()
+{
+  free(a);
+  free(b);
+  free(c);
+}
+
+
+template <class T>
+void OMP3Stream<T>::write_arrays(const std::vector<T>& h_a, const std::vector<T>& h_b, const std::vector<T>& h_c)
+{
+  #pragma omp parallel for
+  for (int i = 0; i < array_size; i++)
+  {
+    a[i] = h_a[i];
+    b[i] = h_b[i];
+    c[i] = h_c[i];
+  }
+}
+
+template <class T>
+void OMP3Stream<T>::read_arrays(std::vector<T>& h_a, std::vector<T>& h_b, std::vector<T>& h_c)
+{
+  #pragma omp parallel for
+  for (int i = 0; i < array_size; i++)
+  {
+    h_a[i] = a[i];
+    h_b[i] = b[i];
+    h_c[i] = c[i];
+  }
+}
+
+template <class T>
+void OMP3Stream<T>::copy()
+{
+  #pragma omp parallel for
+  for (int i = 0; i < array_size; i++)
+  {
+    c[i] = a[i];
+  }
+}
+
+template <class T>
+void OMP3Stream<T>::mul()
+{
+  const T scalar = 3.0;
+  #pragma omp parallel for
+  for (int i = 0; i < array_size; i++)
+  {
+    b[i] = scalar * c[i];
+  }
+}
+
+template <class T>
+void OMP3Stream<T>::add()
+{
+  #pragma omp parallel for
+  for (int i = 0; i < array_size; i++)
+  {
+    c[i] = a[i] + b[i];
+  }
+}
+
+template <class T>
+void OMP3Stream<T>::triad()
+{
+  const T scalar = 3.0;
+  #pragma omp parallel for
+  for (int i = 0; i < array_size; i++)
+  {
+    a[i] = b[i] + scalar * c[i];
+  }
+}
+
+void listDevices(void)
+{
+  std::cout << "0: CPU" << std::endl;
+}
+
+std::string getDeviceName(const int)
+{
+  return std::string("Device name unavailable");
+}
+
+std::string getDeviceDriver(const int)
+{
+  return std::string("Device driver unavailable");
+}
+
+
+template class OMP3Stream<float>;
+template class OMP3Stream<double>;
+
--- a/OMP3Stream.h
+++ b/OMP3Stream.h
@ -0,0 +1,35 @@
+
+#pragma once
+
+#include <iostream>
+#include <stdexcept>
+
+#include "Stream.h"
+
+#define IMPLEMENTATION_STRING "Reference OpenMP"
+
+template <class T>
+class OMP3Stream : public Stream<T>
+{
+  protected:
+    // Size of arrays
+    unsigned int array_size;
+    // Device side pointers
+    T *a;
+    T *b;
+    T *c;
+
+  public:
+    OMP3Stream(const unsigned int, T*, T*, T*);
+    ~OMP3Stream();
+
+    virtual void copy() override;
+    virtual void add() override;
+    virtual void mul() override;
+    virtual void triad() override;
+
+    virtual void write_arrays(const std::vector<T>& a, const std::vector<T>& b, const std::vector<T>& c) override;
+    virtual void read_arrays(std::vector<T>& a, std::vector<T>& b, std::vector<T>& c) override;
+
+};
+
--- a/SYCLStream.cpp
+++ b/SYCLStream.cpp
@ -0,0 +1,144 @@
+
+// Copyright (c) 2015-16 Tom Deakin, Simon McIntosh-Smith,
+// University of Bristol HPC
+//
+// For full license terms please see the LICENSE file distributed with this
+// source code
+
+#include "SYCLStream.h"
+
+#include <iostream>
+
+using namespace cl::sycl;
+
+template <class T>
+SYCLStream<T>::SYCLStream(const unsigned int ARRAY_SIZE, const int device_index)
+{
+  array_size = ARRAY_SIZE;
+
+  // Create buffers
+  d_a = buffer<T>(array_size);
+  d_b = buffer<T>(array_size);
+  d_c = buffer<T>(array_size);
+}
+
+template <class T>
+SYCLStream<T>::~SYCLStream()
+{
+}
+
+template <class T>
+void SYCLStream<T>::copy()
+{
+  queue.submit([&](handler &cgh)
+  {
+    auto ka = d_a.template get_access<access::read>(cgh);
+    auto kc = d_c.template get_access<access::write>(cgh);
+    cgh.parallel_for(range<1>{array_size}, [=](id<1> index)
+    {
+      kc[index] = ka[index];
+    });
+  });
+  queue.wait();
+}
+
+template <class T>
+void SYCLStream<T>::mul()
+{
+  const T scalar = 3.0;
+  queue.submit([&](handler &cgh)
+  {
+    auto kb = d_b.template get_access<access::write>(cgh);
+    auto kc = d_c.template get_access<access::read>(cgh);
+    cgh.parallel_for(range<1>{array_size}, [=](id<1> index)
+    {
+      kb[index] = scalar * kc[index];
+    });
+  });
+  queue.wait();
+}
+
+template <class T>
+void SYCLStream<T>::add()
+{
+  queue.submit([&](handler &cgh)
+  {
+    auto ka = d_a.template get_access<access::read>(cgh);
+    auto kb = d_b.template get_access<access::read>(cgh);
+    auto kc = d_c.template get_access<access::write>(cgh);
+    cgh.parallel_for(range<1>{array_size}, [=](id<1> index)
+    {
+      kc[index] = ka[index] + kb[index];
+    });
+  });
+  queue.wait();
+}
+
+template <class T>
+void SYCLStream<T>::triad()
+{
+  const T scalar = 3.0;
+  queue.submit([&](handler &cgh)
+  {
+    auto ka = d_a.template get_access<access::write>(cgh);
+    auto kb = d_b.template get_access<access::read>(cgh);
+    auto kc = d_c.template get_access<access::read>(cgh);
+    cgh.parallel_for(range<1>{array_size}, [=](id<1> index){
+      ka[index] = kb[index] + scalar * kc[index];
+    });
+  });
+  queue.wait();
+}
+
+template <class T>
+void SYCLStream<T>::write_arrays(const std::vector<T>& a, const std::vector<T>& b, const std::vector<T>& c)
+{
+  auto _a = d_a.template get_access<access::write>();
+  auto _b = d_b.template get_access<access::write>();
+  auto _c = d_c.template get_access<access::write>();
+  for (int i = 0; i < array_size; i++)
+  {
+    _a[i] = a[i];
+    _b[i] = b[i];
+    _c[i] = c[i];
+  }
+}
+
+template <class T>
+void SYCLStream<T>::read_arrays(std::vector<T>& a, std::vector<T>& b, std::vector<T>& c)
+{
+  auto _a = d_a.template get_access<access::read>();
+  auto _b = d_b.template get_access<access::read>();
+  auto _c = d_c.template get_access<access::read>();
+  for (int i = 0; i < array_size; i++)
+  {
+    a[i] = _a[i];
+    b[i] = _b[i];
+    c[i] = _c[i];
+  }
+}
+
+void listDevices(void)
+{
+  // TODO: Get actual list of devices
+  std::cout << std::endl;
+  std::cout << "Devices:" << std::endl;
+  std::cout << "0: " << "triSYCL" << std::endl;
+  std::cout << std::endl;
+}
+
+std::string getDeviceName(const int device)
+{
+  // TODO: Implement properly
+  return "triSYCL";
+}
+
+std::string getDeviceDriver(const int device)
+{
+  // TODO: Implement properly
+  return "triSCYL";
+}
+
+
+template class SYCLStream<float>;
+template class SYCLStream<double>;
--- a/SYCLStream.h
+++ b/SYCLStream.h
@ -0,0 +1,45 @@
+
+// Copyright (c) 2015-16 Tom Deakin, Simon McIntosh-Smith,
+// University of Bristol HPC
+//
+// For full license terms please see the LICENSE file distributed with this
+// source code
+
+#pragma once
+
+#include "Stream.h"
+
+#include "CL/sycl.hpp"
+
+#define IMPLEMENTATION_STRING "SYCL"
+
+template <class T>
+class SYCLStream : public Stream<T>
+{
+  protected:
+    // Size of arrays
+    unsigned int array_size;
+
+    // SYCL objects
+    cl::sycl::queue queue;
+    cl::sycl::buffer<T> d_a;
+    cl::sycl::buffer<T> d_b;
+    cl::sycl::buffer<T> d_c;
+
+  public:
+
+    SYCLStream(const unsigned int, const int);
+    ~SYCLStream();
+
+    virtual void copy() override;
+    virtual void add() override;
+    virtual void mul() override;
+    virtual void triad() override;
+
+    virtual void write_arrays(const std::vector<T>& a, const std::vector<T>& b, const std::vector<T>& c) override;
+    virtual void read_arrays(std::vector<T>& a, std::vector<T>& b, std::vector<T>& c) override;
+
+};
+
+// Populate the devices list
+void getDeviceList(void);
--- a/main.cpp
+++ b/main.cpp
@ -26,6 +26,12 @@
 #include "RAJAStream.hpp"
 #elif defined(KOKKOS)
 #include "KOKKOSStream.hpp"
+#elif defined(ACC)
+#include "ACCStream.h"
+#elif defined(SYCL)
+#include "SYCLStream.h"
+#elif defined(OMP3)
+#include "OMP3Stream.h"
 #endif

 unsigned int ARRAY_SIZE = 52428800;
@ -91,6 +97,18 @@ void run()
  // Use the Kokkos implementation
  stream = new KOKKOSStream<T>(ARRAY_SIZE, deviceIndex);

+#elif defined(ACC)
+  // Use the OpenACC implementation
+  stream = new ACCStream<T>(ARRAY_SIZE, a.data(), b.data(), c.data(), deviceIndex);
+
+#elif defined(SYCL)
+  // Use the SYCL implementation
+  stream = new SYCLStream<T>(ARRAY_SIZE, deviceIndex);
+
+#elif defined(OMP3)
+  // Use the "reference" OpenMP 3 implementation
+  stream = new OMP3Stream<T>(ARRAY_SIZE, a.data(), b.data(), c.data());
+
 #endif

  stream->write_arrays(a, b, c);
@ -289,4 +307,3 @@ void parseArguments(int argc, char *argv[])
    }
  }
 }
-