From 2234841b16d897eb5ef0eb03e2746ebe9b62be48 Mon Sep 17 00:00:00 2001 From: Tom Deakin Date: Tue, 26 Apr 2016 14:08:59 +0100 Subject: [PATCH 001/117] Initial commit of new design with classes --- src/CUDAStream.cu | 29 +++++++++++++++++++++++++++++ src/CUDAStream.h | 13 +++++++++++++ src/Stream.h | 23 +++++++++++++++++++++++ src/common.h | 2 ++ src/main.cpp | 22 ++++++++++++++++++++++ 5 files changed, 89 insertions(+) create mode 100644 src/CUDAStream.cu create mode 100644 src/CUDAStream.h create mode 100644 src/Stream.h create mode 100644 src/common.h create mode 100644 src/main.cpp diff --git a/src/CUDAStream.cu b/src/CUDAStream.cu new file mode 100644 index 0000000..d95f9a1 --- /dev/null +++ b/src/CUDAStream.cu @@ -0,0 +1,29 @@ + +#include "CUDAStream.h" + +template +void CUDAStream::copy() +{ + return; +} + +template +void CUDAStream::mul() +{ + return; +} + +template +void CUDAStream::add() +{ + return; +} + +template +void CUDAStream::triad() +{ + return; +} + +template class CUDAStream; +template class CUDAStream; diff --git a/src/CUDAStream.h b/src/CUDAStream.h new file mode 100644 index 0000000..bfea7ed --- /dev/null +++ b/src/CUDAStream.h @@ -0,0 +1,13 @@ + + +#include "Stream.h" + +template +class CUDAStream : public Stream +{ + public: + void copy(); + void add(); + void mul(); + void triad(); +}; diff --git a/src/Stream.h b/src/Stream.h new file mode 100644 index 0000000..e283437 --- /dev/null +++ b/src/Stream.h @@ -0,0 +1,23 @@ + +#pragma once + +#include + +template +class Stream +{ + public: + // Kernels + // These must be blocking calls + virtual void copy() = 0; + virtual void mul() = 0; + virtual void add() = 0; + virtual void triad() = 0; + + + // Implementation specific device functions + static std::vector getDeviceList(); + static std::vector getDeviceName(); + static std::vector getDeviceDriver(); + +}; diff --git a/src/common.h b/src/common.h new file mode 100644 index 0000000..02d43a2 --- /dev/null +++ b/src/common.h @@ -0,0 +1,2 @@ + 
+#define VERSION_STRING "2.0" diff --git a/src/main.cpp b/src/main.cpp new file mode 100644 index 0000000..9e22b89 --- /dev/null +++ b/src/main.cpp @@ -0,0 +1,22 @@ + +#include + +#include "common.h" +#include "Stream.h" +#include "CUDAStream.h" + + + +int main(int argc, char *argv[]) +{ + std::cout + << "GPU-STREAM" << std::endl + << "Version:" << VERSION_STRING << std::endl + << "Implementation: " << std::endl; + + Stream *stream; + stream = new CUDAStream(); + + delete[] stream; + +} From 1a259d4fc8e43ba67767951fa5438f1c173d6f36 Mon Sep 17 00:00:00 2001 From: Tom Deakin Date: Tue, 26 Apr 2016 14:24:04 +0100 Subject: [PATCH 002/117] Add a copy kernel --- src/CUDAStream.cu | 9 ++++++++- src/CUDAStream.h | 7 +++++++ 2 files changed, 15 insertions(+), 1 deletion(-) diff --git a/src/CUDAStream.cu b/src/CUDAStream.cu index d95f9a1..163a3f1 100644 --- a/src/CUDAStream.cu +++ b/src/CUDAStream.cu @@ -1,10 +1,17 @@ #include "CUDAStream.h" +template +__global__ void copy_kernel(const T * a, T * c) +{ + const int i = blockDim.x * blockIdx.x + threadIdx.x; + c[i] = a[i]; +} + template void CUDAStream::copy() { - return; + copy<<<1024, 1024>>>(a, c); } template diff --git a/src/CUDAStream.h b/src/CUDAStream.h index bfea7ed..ab9ac38 100644 --- a/src/CUDAStream.h +++ b/src/CUDAStream.h @@ -10,4 +10,11 @@ class CUDAStream : public Stream void add(); void mul(); void triad(); + + private: + // Device side pointers to arrays + T *d_a; + T *d_b; + T *d_c; + }; From 0bf68f99093d206689d00f667bc96b7010e4bc09 Mon Sep 17 00:00:00 2001 From: Tom Deakin Date: Tue, 26 Apr 2016 14:34:25 +0100 Subject: [PATCH 003/117] Make a copy kernel using the private variables --- src/CUDAStream.cu | 2 +- src/CUDAStream.h | 13 +++++++------ src/main.cpp | 1 + 3 files changed, 9 insertions(+), 7 deletions(-) diff --git a/src/CUDAStream.cu b/src/CUDAStream.cu index 163a3f1..9b1b36d 100644 --- a/src/CUDAStream.cu +++ b/src/CUDAStream.cu @@ -11,7 +11,7 @@ __global__ void copy_kernel(const T * a, T * c) 
template void CUDAStream::copy() { - copy<<<1024, 1024>>>(a, c); + copy_kernel<<<1024, 1024>>>(d_a, d_c); } template diff --git a/src/CUDAStream.h b/src/CUDAStream.h index ab9ac38..9348b29 100644 --- a/src/CUDAStream.h +++ b/src/CUDAStream.h @@ -5,16 +5,17 @@ template class CUDAStream : public Stream { - public: - void copy(); - void add(); - void mul(); - void triad(); - private: // Device side pointers to arrays T *d_a; T *d_b; T *d_c; + + public: + void copy(); + void add(); + void mul(); + void triad(); + }; diff --git a/src/main.cpp b/src/main.cpp index 9e22b89..76c3da4 100644 --- a/src/main.cpp +++ b/src/main.cpp @@ -16,6 +16,7 @@ int main(int argc, char *argv[]) Stream *stream; stream = new CUDAStream(); + stream->copy(); delete[] stream; From 6169bdb7b5f887ba3ab8837f153bc00c272991b6 Mon Sep 17 00:00:00 2001 From: Tom Deakin Date: Tue, 26 Apr 2016 14:40:49 +0100 Subject: [PATCH 004/117] Add some global variables --- src/main.cpp | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/src/main.cpp b/src/main.cpp index 76c3da4..6702e11 100644 --- a/src/main.cpp +++ b/src/main.cpp @@ -1,18 +1,28 @@ #include +#include #include "common.h" #include "Stream.h" #include "CUDAStream.h" +const unsigned int ARRAY_SIZE = 52428800; + +#define IMPLEMENTATION_STRING "CUDA" int main(int argc, char *argv[]) { std::cout << "GPU-STREAM" << std::endl << "Version:" << VERSION_STRING << std::endl - << "Implementation: " << std::endl; + << "Implementation: " << IMPLEMENTATION_STRING << std::endl; + + + // Create host vectors + std::vector a(ARRAY_SIZE, 1.0); + std::vector b(ARRAY_SIZE, 2.0); + std::vector c(ARRAY_SIZE, 0.0); Stream *stream; stream = new CUDAStream(); From 03b01e190f2d3d174870ef5704b18e2f8e81bf49 Mon Sep 17 00:00:00 2001 From: Tom Deakin Date: Tue, 26 Apr 2016 14:49:04 +0100 Subject: [PATCH 005/117] Add cuda constructor declaration and error checking function --- src/CUDAStream.cu | 18 ++++++++++++++++++ src/CUDAStream.h | 4 ++++ src/main.cpp 
| 5 ++++- 3 files changed, 26 insertions(+), 1 deletion(-) diff --git a/src/CUDAStream.cu b/src/CUDAStream.cu index 9b1b36d..4305ceb 100644 --- a/src/CUDAStream.cu +++ b/src/CUDAStream.cu @@ -1,6 +1,24 @@ #include "CUDAStream.h" +void check_error(void) +{ + cudaError_t err = cudaGetLastError(); + if (err != cudaSuccess) + { + std::cerr << "Error: " << cudaGetErrorString(err) << std::endl; + exit(err); + } +} + +template +CUDAStream::CUDAStream(const unsigned int ARRAY_SIZE) +{ + // Create device buffers + cudaMalloc(&d_a, ARRAY_SIZE*sizeof(T)); + +} + template __global__ void copy_kernel(const T * a, T * c) { diff --git a/src/CUDAStream.h b/src/CUDAStream.h index 9348b29..49e76df 100644 --- a/src/CUDAStream.h +++ b/src/CUDAStream.h @@ -1,4 +1,5 @@ +#include #include "Stream.h" @@ -13,6 +14,9 @@ class CUDAStream : public Stream public: + + CUDAStream(const unsigned int); + void copy(); void add(); void mul(); diff --git a/src/main.cpp b/src/main.cpp index 6702e11..a293eb7 100644 --- a/src/main.cpp +++ b/src/main.cpp @@ -25,7 +25,10 @@ int main(int argc, char *argv[]) std::vector c(ARRAY_SIZE, 0.0); Stream *stream; - stream = new CUDAStream(); + + // Use the CUDA implementation + stream = new CUDAStream(ARRAY_SIZE); + stream->copy(); delete[] stream; From ee4820b5e4763573211146a049ed392fb5390380 Mon Sep 17 00:00:00 2001 From: Tom Deakin Date: Tue, 26 Apr 2016 14:50:22 +0100 Subject: [PATCH 006/117] Create CUDA device buffers --- src/CUDAStream.cu | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/src/CUDAStream.cu b/src/CUDAStream.cu index 4305ceb..66304d1 100644 --- a/src/CUDAStream.cu +++ b/src/CUDAStream.cu @@ -16,7 +16,11 @@ CUDAStream::CUDAStream(const unsigned int ARRAY_SIZE) { // Create device buffers cudaMalloc(&d_a, ARRAY_SIZE*sizeof(T)); - + check_error(); + cudaMalloc(&d_b, ARRAY_SIZE*sizeof(T)); + check_error(); + cudaMalloc(&d_c, ARRAY_SIZE*sizeof(T)); + check_error(); } template From ae679a57752ee1155a7093a7be4740758a9dc6d6 Mon Sep 
17 00:00:00 2001 From: Tom Deakin Date: Tue, 26 Apr 2016 14:50:58 +0100 Subject: [PATCH 007/117] Fix indentation in Stream.h --- src/Stream.h | 21 +++++++++++---------- 1 file changed, 11 insertions(+), 10 deletions(-) diff --git a/src/Stream.h b/src/Stream.h index e283437..3a3e826 100644 --- a/src/Stream.h +++ b/src/Stream.h @@ -7,17 +7,18 @@ template class Stream { public: - // Kernels - // These must be blocking calls - virtual void copy() = 0; - virtual void mul() = 0; - virtual void add() = 0; - virtual void triad() = 0; + // Kernels + // These must be blocking calls + virtual void copy() = 0; + virtual void mul() = 0; + virtual void add() = 0; + virtual void triad() = 0; - // Implementation specific device functions - static std::vector getDeviceList(); - static std::vector getDeviceName(); - static std::vector getDeviceDriver(); + + // Implementation specific device functions + static std::vector getDeviceList(); + static std::vector getDeviceName(); + static std::vector getDeviceDriver(); }; From 8e534daf8bf72c3a1f9c775b8464f1bdf8a0adf7 Mon Sep 17 00:00:00 2001 From: Tom Deakin Date: Tue, 26 Apr 2016 15:02:41 +0100 Subject: [PATCH 008/117] Add methods to copy data between host and device --- src/CUDAStream.cu | 20 ++++++++++++++++++++ src/CUDAStream.h | 4 ++++ src/Stream.h | 4 +++- src/main.cpp | 2 ++ 4 files changed, 29 insertions(+), 1 deletion(-) diff --git a/src/CUDAStream.cu b/src/CUDAStream.cu index 66304d1..f325dfc 100644 --- a/src/CUDAStream.cu +++ b/src/CUDAStream.cu @@ -23,6 +23,23 @@ CUDAStream::CUDAStream(const unsigned int ARRAY_SIZE) check_error(); } + +template +void CUDAStream::write_arrays(const std::vector& a, const std::vector& b, const std::vector& c) +{ + // Copy host memory to device + cudaMemcpy(d_a, a.data(), a.size()*sizeof(T), cudaMemcpyHostToDevice); + check_error(); + cudaMemcpy(d_b, b.data(), b.size()*sizeof(T), cudaMemcpyHostToDevice); + check_error(); + cudaMemcpy(d_c, c.data(), c.size()*sizeof(T), cudaMemcpyHostToDevice); + 
check_error(); +} + +template +void CUDAStream::read_arrays(std::vector& a, std::vector& b, std::vector& c) +{ +} template __global__ void copy_kernel(const T * a, T * c) { @@ -34,6 +51,9 @@ template void CUDAStream::copy() { copy_kernel<<<1024, 1024>>>(d_a, d_c); + check_error(); + cudaDeviceSynchronize(); + check_error(); } template diff --git a/src/CUDAStream.h b/src/CUDAStream.h index 49e76df..881811e 100644 --- a/src/CUDAStream.h +++ b/src/CUDAStream.h @@ -22,4 +22,8 @@ class CUDAStream : public Stream void mul(); void triad(); + void write_arrays(const std::vector& a, const std::vector& b, const std::vector& c); + void read_arrays(std::vector& a, std::vector& b, std::vector& c); + }; + diff --git a/src/Stream.h b/src/Stream.h index 3a3e826..bb83d69 100644 --- a/src/Stream.h +++ b/src/Stream.h @@ -14,7 +14,9 @@ class Stream virtual void add() = 0; virtual void triad() = 0; - + // Copy memory between host and device + virtual void write_arrays(const std::vector& a, const std::vector& b, const std::vector& c) = 0; + virtual void read_arrays(std::vector& a, std::vector& b, std::vector& c) = 0; // Implementation specific device functions static std::vector getDeviceList(); diff --git a/src/main.cpp b/src/main.cpp index a293eb7..cfb8884 100644 --- a/src/main.cpp +++ b/src/main.cpp @@ -29,6 +29,8 @@ int main(int argc, char *argv[]) // Use the CUDA implementation stream = new CUDAStream(ARRAY_SIZE); + stream->write_arrays(a, b, c); + stream->copy(); delete[] stream; From c22b74ba47835656bc3fab309938ad02f46984eb Mon Sep 17 00:00:00 2001 From: Tom Deakin Date: Tue, 26 Apr 2016 15:30:37 +0100 Subject: [PATCH 009/117] Add read_arrays definition for CUDA --- src/CUDAStream.cu | 9 +++++++++ src/main.cpp | 3 +++ 2 files changed, 12 insertions(+) diff --git a/src/CUDAStream.cu b/src/CUDAStream.cu index f325dfc..5629b40 100644 --- a/src/CUDAStream.cu +++ b/src/CUDAStream.cu @@ -39,7 +39,16 @@ void CUDAStream::write_arrays(const std::vector& a, const std::vector& template void 
CUDAStream::read_arrays(std::vector& a, std::vector& b, std::vector& c) { + // Copy device memory to host + cudaMemcpy(a.data(), d_a, a.size()*sizeof(T), cudaMemcpyDeviceToHost); + check_error(); + cudaMemcpy(b.data(), d_b, b.size()*sizeof(T), cudaMemcpyDeviceToHost); + check_error(); + cudaMemcpy(c.data(), d_c, c.size()*sizeof(T), cudaMemcpyDeviceToHost); + check_error(); } + + template __global__ void copy_kernel(const T * a, T * c) { diff --git a/src/main.cpp b/src/main.cpp index cfb8884..6c2c0d8 100644 --- a/src/main.cpp +++ b/src/main.cpp @@ -33,6 +33,9 @@ int main(int argc, char *argv[]) stream->copy(); + stream->read_arrays(a, b, c); + std::cout << c[105] << std::endl; + delete[] stream; } From dec0237353617bb6957bcb8b4a25c6587935354c Mon Sep 17 00:00:00 2001 From: Tom Deakin Date: Tue, 26 Apr 2016 16:03:28 +0100 Subject: [PATCH 010/117] Add mul kernel --- src/CUDAStream.cu | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) diff --git a/src/CUDAStream.cu b/src/CUDAStream.cu index 5629b40..15b1b41 100644 --- a/src/CUDAStream.cu +++ b/src/CUDAStream.cu @@ -65,10 +65,21 @@ void CUDAStream::copy() check_error(); } +template +__global__ void mul_kernel(T * b, const T * c) +{ + const T scalar = 3.0; + const int i = blockDim.x * blockIdx.x + threadIdx.x; + b[i] = scalar * c[i]; +} + template void CUDAStream::mul() { - return; + mul_kernel<<<1024, 1024>>>(d_b, d_c); + check_error(); + cudaDeviceSynchronize(); + check_error(); } template @@ -85,3 +96,4 @@ void CUDAStream::triad() template class CUDAStream; template class CUDAStream; + From 7a3a546a6e208743cedd148429430e2e9337fd32 Mon Sep 17 00:00:00 2001 From: Tom Deakin Date: Tue, 26 Apr 2016 16:06:17 +0100 Subject: [PATCH 011/117] Add mul CUDA kernel --- src/CUDAStream.cu | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/src/CUDAStream.cu b/src/CUDAStream.cu index 15b1b41..f08c40b 100644 --- a/src/CUDAStream.cu +++ b/src/CUDAStream.cu @@ -82,10 +82,20 @@ void 
CUDAStream::mul() check_error(); } +template +__global__ void add_kernel(const T * a, const T * b, T * c) +{ + const int i = blockDim.x * blockIdx.x + threadIdx.x; + c[i] = a[i] + b[i]; +} + template void CUDAStream::add() { - return; + add_kernel<<<1024, 1024>>>(d_a, d_b, d_c); + check_error(); + cudaDeviceSynchronize(); + check_error(); } template From 319e11011c896a953301b273bfdbb3bb37920d1e Mon Sep 17 00:00:00 2001 From: Tom Deakin Date: Tue, 26 Apr 2016 16:07:32 +0100 Subject: [PATCH 012/117] Add triad kernel --- src/CUDAStream.cu | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) diff --git a/src/CUDAStream.cu b/src/CUDAStream.cu index f08c40b..18e1a70 100644 --- a/src/CUDAStream.cu +++ b/src/CUDAStream.cu @@ -98,10 +98,21 @@ void CUDAStream::add() check_error(); } +template +__global__ void triad_kernel(T * a, const T * b, const T * c) +{ + const T scalar = 3.0; + const int i = blockDim.x * blockIdx.x + threadIdx.x; + a[i] = b[i] + scalar * c[i]; +} + template void CUDAStream::triad() { - return; + triad_kernel<<<1024, 1024>>>(d_a, d_b, d_c); + check_error(); + cudaDeviceSynchronize(); + check_error(); } template class CUDAStream; From 9c673317a7de31473ac4e4e4b720a2610ccfd4fb Mon Sep 17 00:00:00 2001 From: Tom Deakin Date: Tue, 26 Apr 2016 16:09:51 +0100 Subject: [PATCH 013/117] Store array size in class so can use it for kernel launches --- src/CUDAStream.cu | 10 ++++++---- src/CUDAStream.h | 2 ++ 2 files changed, 8 insertions(+), 4 deletions(-) diff --git a/src/CUDAStream.cu b/src/CUDAStream.cu index 18e1a70..caf5e1a 100644 --- a/src/CUDAStream.cu +++ b/src/CUDAStream.cu @@ -14,6 +14,8 @@ void check_error(void) template CUDAStream::CUDAStream(const unsigned int ARRAY_SIZE) { + array_size = ARRAY_SIZE; + // Create device buffers cudaMalloc(&d_a, ARRAY_SIZE*sizeof(T)); check_error(); @@ -59,7 +61,7 @@ __global__ void copy_kernel(const T * a, T * c) template void CUDAStream::copy() { - copy_kernel<<<1024, 1024>>>(d_a, d_c); + 
copy_kernel<<>>(d_a, d_c); check_error(); cudaDeviceSynchronize(); check_error(); @@ -76,7 +78,7 @@ __global__ void mul_kernel(T * b, const T * c) template void CUDAStream::mul() { - mul_kernel<<<1024, 1024>>>(d_b, d_c); + mul_kernel<<>>(d_b, d_c); check_error(); cudaDeviceSynchronize(); check_error(); @@ -92,7 +94,7 @@ __global__ void add_kernel(const T * a, const T * b, T * c) template void CUDAStream::add() { - add_kernel<<<1024, 1024>>>(d_a, d_b, d_c); + add_kernel<<>>(d_a, d_b, d_c); check_error(); cudaDeviceSynchronize(); check_error(); @@ -109,7 +111,7 @@ __global__ void triad_kernel(T * a, const T * b, const T * c) template void CUDAStream::triad() { - triad_kernel<<<1024, 1024>>>(d_a, d_b, d_c); + triad_kernel<<>>(d_a, d_b, d_c); check_error(); cudaDeviceSynchronize(); check_error(); diff --git a/src/CUDAStream.h b/src/CUDAStream.h index 881811e..14f2cc2 100644 --- a/src/CUDAStream.h +++ b/src/CUDAStream.h @@ -7,6 +7,8 @@ template class CUDAStream : public Stream { private: + // Size of arrays + unsigned int array_size; // Device side pointers to arrays T *d_a; T *d_b; From 9730cd071e11f1149f81f30444f7600d8991fdbe Mon Sep 17 00:00:00 2001 From: Tom Deakin Date: Wed, 27 Apr 2016 11:34:42 +0100 Subject: [PATCH 014/117] Overridden functions should have more keywords --- src/CUDAStream.h | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/src/CUDAStream.h b/src/CUDAStream.h index 14f2cc2..451cfc1 100644 --- a/src/CUDAStream.h +++ b/src/CUDAStream.h @@ -6,7 +6,7 @@ template class CUDAStream : public Stream { - private: + protected: // Size of arrays unsigned int array_size; // Device side pointers to arrays @@ -19,10 +19,10 @@ class CUDAStream : public Stream CUDAStream(const unsigned int); - void copy(); - void add(); - void mul(); - void triad(); + virtual void copy() override; + virtual void add() override; + virtual void mul() override; + virtual void triad() override; void write_arrays(const std::vector& a, const std::vector& b, 
const std::vector& c); void read_arrays(std::vector& a, std::vector& b, std::vector& c); From 6522d9114a14edb9c26c0d571a430c4fb2d63db8 Mon Sep 17 00:00:00 2001 From: Tom Deakin Date: Wed, 27 Apr 2016 11:35:04 +0100 Subject: [PATCH 015/117] Add new line at end of file --- src/Stream.h | 1 + 1 file changed, 1 insertion(+) diff --git a/src/Stream.h b/src/Stream.h index bb83d69..ecf043b 100644 --- a/src/Stream.h +++ b/src/Stream.h @@ -24,3 +24,4 @@ class Stream static std::vector getDeviceDriver(); }; + From 6225ae90a7007914066087bb170cc9fc5f12614d Mon Sep 17 00:00:00 2001 From: Tom Deakin Date: Wed, 27 Apr 2016 11:35:12 +0100 Subject: [PATCH 016/117] Add start of check results function --- src/main.cpp | 38 +++++++++++++++++++++++++++++++++++++- 1 file changed, 37 insertions(+), 1 deletion(-) diff --git a/src/main.cpp b/src/main.cpp index 6c2c0d8..355fac7 100644 --- a/src/main.cpp +++ b/src/main.cpp @@ -1,6 +1,8 @@ #include #include +#include +#include #include "common.h" #include "Stream.h" @@ -11,6 +13,9 @@ const unsigned int ARRAY_SIZE = 52428800; #define IMPLEMENTATION_STRING "CUDA" +template +void check_solution(const unsigned int ntimes, std::vector& a, std::vector& b, std::vector& c); + int main(int argc, char *argv[]) { std::cout @@ -32,10 +37,41 @@ int main(int argc, char *argv[]) stream->write_arrays(a, b, c); stream->copy(); + stream->mul(); + stream->add(); + stream->triad(); stream->read_arrays(a, b, c); - std::cout << c[105] << std::endl; + std::cout << a[105] << std::endl; + + check_solution(1, a, b, c); delete[] stream; } + +template +void check_solution(const unsigned int ntimes, std::vector& a, std::vector& b, std::vector& c) +{ + // Generate correct solution + T goldA = 1.0; + T goldB = 2.0; + T goldC = 0.0; + + const T scalar = 3.0; + + for (unsigned int i = 0; i < ntimes; i++) + { + // Do STREAM! 
+ goldC = goldA; + goldB = scalar * goldC; + goldC = goldA + goldB; + goldA = goldB + scalar * goldC; + } + + // Calculate the average error + double errA = std::accumulate(a.begin(), a.end(), 0.0, [&](double sum, const T val){ return sum + fabs(val - goldA); }); + + +} + From 9aa27cd91dae5e93d34b89e8e3ca8e4cd8a8a6ce Mon Sep 17 00:00:00 2001 From: Tom Deakin Date: Wed, 27 Apr 2016 11:42:23 +0100 Subject: [PATCH 017/117] Print out average error on check if there is an error --- src/main.cpp | 22 +++++++++++++++++++++- 1 file changed, 21 insertions(+), 1 deletion(-) diff --git a/src/main.cpp b/src/main.cpp index 355fac7..d45a4a5 100644 --- a/src/main.cpp +++ b/src/main.cpp @@ -3,6 +3,7 @@ #include #include #include +#include #include "common.h" #include "Stream.h" @@ -71,7 +72,26 @@ void check_solution(const unsigned int ntimes, std::vector& a, std::vector // Calculate the average error double errA = std::accumulate(a.begin(), a.end(), 0.0, [&](double sum, const T val){ return sum + fabs(val - goldA); }); - + errA /= a.size(); + double errB = std::accumulate(b.begin(), b.end(), 0.0, [&](double sum, const T val){ return sum + fabs(val - goldB); }); + errB /= b.size(); + double errC = std::accumulate(c.begin(), c.end(), 0.0, [&](double sum, const T val){ return sum + fabs(val - goldC); }); + errC /= c.size(); + + double epsi = std::numeric_limits::epsilon() * 100.0; + + if (errA > epsi) + std::cerr + << "Validation failed on a[]. Average error " << errA + << std::endl; + if (errB > epsi) + std::cerr + << "Validation failed on b[]. Average error " << errB + << std::endl; + if (errC > epsi) + std::cerr + << "Validation failed on c[]. 
Average error " << errC + << std::endl; } From 40c787d0409d5f771cabb29105738752e8f49e2d Mon Sep 17 00:00:00 2001 From: Tom Deakin Date: Wed, 27 Apr 2016 11:52:15 +0100 Subject: [PATCH 018/117] Check bufers fit on CUDA device --- src/CUDAStream.cu | 6 ++++++ src/CUDAStream.h | 1 + 2 files changed, 7 insertions(+) diff --git a/src/CUDAStream.cu b/src/CUDAStream.cu index caf5e1a..5ec1e67 100644 --- a/src/CUDAStream.cu +++ b/src/CUDAStream.cu @@ -16,6 +16,12 @@ CUDAStream::CUDAStream(const unsigned int ARRAY_SIZE) { array_size = ARRAY_SIZE; + // Check buffers fit on the device + cudaDeviceProp props; + cudaGetDeviceProperties(&props, 0); + if (props.totalGlobalMem < 3*ARRAY_SIZE*sizeof(T)) + throw std::runtime_error("Device does not have enough memory for all 3 buffers"); + // Create device buffers cudaMalloc(&d_a, ARRAY_SIZE*sizeof(T)); check_error(); diff --git a/src/CUDAStream.h b/src/CUDAStream.h index 451cfc1..bde574e 100644 --- a/src/CUDAStream.h +++ b/src/CUDAStream.h @@ -1,5 +1,6 @@ #include +#include #include "Stream.h" From c28e70ae707abb5a3f22a8d1011c8e39caa28913 Mon Sep 17 00:00:00 2001 From: Tom Deakin Date: Wed, 27 Apr 2016 12:08:49 +0100 Subject: [PATCH 019/117] Add timers and run multiple times --- src/main.cpp | 45 ++++++++++++++++++++++++++++++++++++++------- 1 file changed, 38 insertions(+), 7 deletions(-) diff --git a/src/main.cpp b/src/main.cpp index d45a4a5..7dab5c7 100644 --- a/src/main.cpp +++ b/src/main.cpp @@ -4,6 +4,7 @@ #include #include #include +#include #include "common.h" #include "Stream.h" @@ -11,6 +12,7 @@ const unsigned int ARRAY_SIZE = 52428800; +const unsigned int ntimes = 10; #define IMPLEMENTATION_STRING "CUDA" @@ -37,15 +39,44 @@ int main(int argc, char *argv[]) stream->write_arrays(a, b, c); - stream->copy(); - stream->mul(); - stream->add(); - stream->triad(); + // List of times + std::vector< std::vector > timings; + // Declare timers + std::chrono::high_resolution_clock::time_point t1, t2; + + // Main loop + for (unsigned 
int k = 0; k < ntimes; k++) + { + std::vector times; + + t1 = std::chrono::high_resolution_clock::now(); + stream->copy(); + t2 = std::chrono::high_resolution_clock::now(); + times.push_back(std::chrono::duration_cast >(t2 - t1).count()); + + t1 = std::chrono::high_resolution_clock::now(); + stream->mul(); + t2 = std::chrono::high_resolution_clock::now(); + times.push_back(std::chrono::duration_cast >(t2 - t1).count()); + + t1 = std::chrono::high_resolution_clock::now(); + stream->add(); + t2 = std::chrono::high_resolution_clock::now(); + times.push_back(std::chrono::duration_cast >(t2 - t1).count()); + + t1 = std::chrono::high_resolution_clock::now(); + stream->triad(); + t2 = std::chrono::high_resolution_clock::now(); + times.push_back(std::chrono::duration_cast >(t2 - t1).count()); + + timings.push_back(times); + + } + + // Check solutions stream->read_arrays(a, b, c); - std::cout << a[105] << std::endl; - - check_solution(1, a, b, c); + check_solution(ntimes, a, b, c); delete[] stream; From 3d5a49317e2ca4107bb91369d9abf5d773701134 Mon Sep 17 00:00:00 2001 From: Tom Deakin Date: Wed, 27 Apr 2016 12:11:19 +0100 Subject: [PATCH 020/117] Free CUDA buffers in destructor --- src/CUDAStream.cu | 11 +++++++++++ src/CUDAStream.h | 1 + 2 files changed, 12 insertions(+) diff --git a/src/CUDAStream.cu b/src/CUDAStream.cu index 5ec1e67..7c9afb3 100644 --- a/src/CUDAStream.cu +++ b/src/CUDAStream.cu @@ -32,6 +32,17 @@ CUDAStream::CUDAStream(const unsigned int ARRAY_SIZE) } +template +CUDAStream::~CUDAStream() +{ + cudaFree(d_a); + check_error(); + cudaFree(d_b); + check_error(); + cudaFree(d_c); + check_error(); +} + template void CUDAStream::write_arrays(const std::vector& a, const std::vector& b, const std::vector& c) { diff --git a/src/CUDAStream.h b/src/CUDAStream.h index bde574e..89ad536 100644 --- a/src/CUDAStream.h +++ b/src/CUDAStream.h @@ -19,6 +19,7 @@ class CUDAStream : public Stream public: CUDAStream(const unsigned int); + ~CUDAStream(); virtual void copy() 
override; virtual void add() override; From daa7f643b96a010cf8bc4dc63e2176205090ae78 Mon Sep 17 00:00:00 2001 From: Tom Deakin Date: Wed, 27 Apr 2016 13:18:06 +0100 Subject: [PATCH 021/117] Print out timing results --- src/main.cpp | 80 ++++++++++++++++++++++++++++++++++++++++++++++------ 1 file changed, 72 insertions(+), 8 deletions(-) diff --git a/src/main.cpp b/src/main.cpp index 7dab5c7..50bc51f 100644 --- a/src/main.cpp +++ b/src/main.cpp @@ -5,6 +5,8 @@ #include #include #include +#include +#include #include "common.h" #include "Stream.h" @@ -40,7 +42,10 @@ int main(int argc, char *argv[]) stream->write_arrays(a, b, c); // List of times - std::vector< std::vector > timings; + std::vector copy_timings; + std::vector mul_timings; + std::vector add_timings; + std::vector triad_timings; // Declare timers std::chrono::high_resolution_clock::time_point t1, t2; @@ -48,29 +53,30 @@ int main(int argc, char *argv[]) // Main loop for (unsigned int k = 0; k < ntimes; k++) { - std::vector times; + // Execute Copy t1 = std::chrono::high_resolution_clock::now(); stream->copy(); t2 = std::chrono::high_resolution_clock::now(); - times.push_back(std::chrono::duration_cast >(t2 - t1).count()); + copy_timings.push_back(std::chrono::duration_cast >(t2 - t1).count()); + // Execute Mul t1 = std::chrono::high_resolution_clock::now(); stream->mul(); t2 = std::chrono::high_resolution_clock::now(); - times.push_back(std::chrono::duration_cast >(t2 - t1).count()); + mul_timings.push_back(std::chrono::duration_cast >(t2 - t1).count()); + // Execute Add t1 = std::chrono::high_resolution_clock::now(); stream->add(); t2 = std::chrono::high_resolution_clock::now(); - times.push_back(std::chrono::duration_cast >(t2 - t1).count()); + add_timings.push_back(std::chrono::duration_cast >(t2 - t1).count()); + // Execute Triad t1 = std::chrono::high_resolution_clock::now(); stream->triad(); t2 = std::chrono::high_resolution_clock::now(); - times.push_back(std::chrono::duration_cast >(t2 - 
t1).count()); - - timings.push_back(times); + triad_timings.push_back(std::chrono::duration_cast >(t2 - t1).count()); } @@ -78,6 +84,64 @@ int main(int argc, char *argv[]) stream->read_arrays(a, b, c); check_solution(ntimes, a, b, c); + // Crunch timing results + + // Get min/max; ignore first result + auto copy_minmax = std::minmax_element(copy_timings.begin()+1, copy_timings.end()); + auto mul_minmax = std::minmax_element(mul_timings.begin()+1, mul_timings.end()); + auto add_minmax = std::minmax_element(add_timings.begin()+1, add_timings.end()); + auto triad_minmax = std::minmax_element(triad_timings.begin()+1, triad_timings.end()); + + double copy_average = std::accumulate(copy_timings.begin()+1, copy_timings.end(), 0.0) / (double)(ntimes - 1); + double mul_average = std::accumulate(mul_timings.begin()+1, mul_timings.end(), 0.0) / (double)(ntimes - 1); + double add_average = std::accumulate(add_timings.begin()+1, add_timings.end(), 0.0) / (double)(ntimes - 1); + double triad_average = std::accumulate(triad_timings.begin()+1, triad_timings.end(), 0.0) / (double)(ntimes - 1); + + + // Display results + std::cout + << std::left << std::setw(12) << "Function" + << std::left << std::setw(12) << "MBytes/sec" + << std::left << std::setw(12) << "Min (sec)" + << std::left << std::setw(12) << "Max" + << std::left << std::setw(12) << "Average" << std::endl; + + std::cout << std::fixed; + + std::cout + << std::left << std::setw(12) << "Copy" + << std::left << std::setw(12) << std::setprecision(3) << 1.0E-06 * (2 * sizeof(double) * ARRAY_SIZE)/(*copy_minmax.first) + << std::left << std::setw(12) << std::setprecision(5) << *copy_minmax.first + << std::left << std::setw(12) << std::setprecision(5) << *copy_minmax.second + << std::left << std::setw(12) << std::setprecision(5) << copy_average + << std::endl; + + + std::cout + << std::left << std::setw(12) << "Mul" + << std::left << std::setw(12) << std::setprecision(3) << 1.0E-06 * (2 * sizeof(double) * 
ARRAY_SIZE)/(*mul_minmax.first) + << std::left << std::setw(12) << std::setprecision(5) << *mul_minmax.first + << std::left << std::setw(12) << std::setprecision(5) << *mul_minmax.second + << std::left << std::setw(12) << std::setprecision(5) << mul_average + << std::endl; + + + std::cout + << std::left << std::setw(12) << "Add" + << std::left << std::setw(12) << std::setprecision(3) << 1.0E-06 * (3 * sizeof(double) * ARRAY_SIZE)/(*add_minmax.first) + << std::left << std::setw(12) << std::setprecision(5) << *add_minmax.first + << std::left << std::setw(12) << std::setprecision(5) << *add_minmax.second + << std::left << std::setw(12) << std::setprecision(5) << add_average + << std::endl; + + std::cout + << std::left << std::setw(12) << "Triad" + << std::left << std::setw(12) << std::setprecision(3) << 1.0E-06 * (3 * sizeof(double) * ARRAY_SIZE)/(*triad_minmax.first) + << std::left << std::setw(12) << std::setprecision(5) << *triad_minmax.first + << std::left << std::setw(12) << std::setprecision(5) << *triad_minmax.second + << std::left << std::setw(12) << std::setprecision(5) << triad_average + << std::endl; + delete[] stream; } From 377b3487482f368f0d38199e2a3ca071cf60b690 Mon Sep 17 00:00:00 2001 From: Tom Deakin Date: Thu, 28 Apr 2016 11:15:25 +0100 Subject: [PATCH 022/117] Move implementation string to the common header file --- src/common.h | 2 ++ src/main.cpp | 3 +-- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/src/common.h b/src/common.h index 02d43a2..c4bdadb 100644 --- a/src/common.h +++ b/src/common.h @@ -1,2 +1,4 @@ #define VERSION_STRING "2.0" +#define IMPLEMENTATION_STRING "CUDA" + diff --git a/src/main.cpp b/src/main.cpp index 50bc51f..71cb478 100644 --- a/src/main.cpp +++ b/src/main.cpp @@ -16,7 +16,6 @@ const unsigned int ARRAY_SIZE = 52428800; const unsigned int ntimes = 10; -#define IMPLEMENTATION_STRING "CUDA" template void check_solution(const unsigned int ntimes, std::vector& a, std::vector& b, std::vector& c); @@ -25,7 +24,7 
@@ int main(int argc, char *argv[]) { std::cout << "GPU-STREAM" << std::endl - << "Version:" << VERSION_STRING << std::endl + << "Version: " << VERSION_STRING << std::endl << "Implementation: " << IMPLEMENTATION_STRING << std::endl; From 8d88afdedb52fd9a19c4a76ff6eae6fead5f0e24 Mon Sep 17 00:00:00 2001 From: Tom Deakin Date: Thu, 28 Apr 2016 11:57:09 +0100 Subject: [PATCH 023/117] Tidy up timing printing to reduce code duplication --- src/main.cpp | 82 +++++++++++++++++++--------------------------------- 1 file changed, 29 insertions(+), 53 deletions(-) diff --git a/src/main.cpp b/src/main.cpp index 71cb478..42dbe28 100644 --- a/src/main.cpp +++ b/src/main.cpp @@ -41,10 +41,7 @@ int main(int argc, char *argv[]) stream->write_arrays(a, b, c); // List of times - std::vector copy_timings; - std::vector mul_timings; - std::vector add_timings; - std::vector triad_timings; + std::vector> timings(4); // Declare timers std::chrono::high_resolution_clock::time_point t1, t2; @@ -52,30 +49,29 @@ int main(int argc, char *argv[]) // Main loop for (unsigned int k = 0; k < ntimes; k++) { - // Execute Copy t1 = std::chrono::high_resolution_clock::now(); stream->copy(); t2 = std::chrono::high_resolution_clock::now(); - copy_timings.push_back(std::chrono::duration_cast >(t2 - t1).count()); + timings[0].push_back(std::chrono::duration_cast >(t2 - t1).count()); // Execute Mul t1 = std::chrono::high_resolution_clock::now(); stream->mul(); t2 = std::chrono::high_resolution_clock::now(); - mul_timings.push_back(std::chrono::duration_cast >(t2 - t1).count()); + timings[1].push_back(std::chrono::duration_cast >(t2 - t1).count()); // Execute Add t1 = std::chrono::high_resolution_clock::now(); stream->add(); t2 = std::chrono::high_resolution_clock::now(); - add_timings.push_back(std::chrono::duration_cast >(t2 - t1).count()); + timings[2].push_back(std::chrono::duration_cast >(t2 - t1).count()); // Execute Triad t1 = std::chrono::high_resolution_clock::now(); stream->triad(); t2 = 
std::chrono::high_resolution_clock::now(); - triad_timings.push_back(std::chrono::duration_cast >(t2 - t1).count()); + timings[3].push_back(std::chrono::duration_cast >(t2 - t1).count()); } @@ -83,21 +79,7 @@ int main(int argc, char *argv[]) stream->read_arrays(a, b, c); check_solution(ntimes, a, b, c); - // Crunch timing results - - // Get min/max; ignore first result - auto copy_minmax = std::minmax_element(copy_timings.begin()+1, copy_timings.end()); - auto mul_minmax = std::minmax_element(mul_timings.begin()+1, mul_timings.end()); - auto add_minmax = std::minmax_element(add_timings.begin()+1, add_timings.end()); - auto triad_minmax = std::minmax_element(triad_timings.begin()+1, triad_timings.end()); - - double copy_average = std::accumulate(copy_timings.begin()+1, copy_timings.end(), 0.0) / (double)(ntimes - 1); - double mul_average = std::accumulate(mul_timings.begin()+1, mul_timings.end(), 0.0) / (double)(ntimes - 1); - double add_average = std::accumulate(add_timings.begin()+1, add_timings.end(), 0.0) / (double)(ntimes - 1); - double triad_average = std::accumulate(triad_timings.begin()+1, triad_timings.end(), 0.0) / (double)(ntimes - 1); - - - // Display results + // Display timing results std::cout << std::left << std::setw(12) << "Function" << std::left << std::setw(12) << "MBytes/sec" @@ -107,39 +89,33 @@ int main(int argc, char *argv[]) std::cout << std::fixed; - std::cout - << std::left << std::setw(12) << "Copy" - << std::left << std::setw(12) << std::setprecision(3) << 1.0E-06 * (2 * sizeof(double) * ARRAY_SIZE)/(*copy_minmax.first) - << std::left << std::setw(12) << std::setprecision(5) << *copy_minmax.first - << std::left << std::setw(12) << std::setprecision(5) << *copy_minmax.second - << std::left << std::setw(12) << std::setprecision(5) << copy_average - << std::endl; + std::string labels[4] = {"Copy", "Mul", "Add", "Triad"}; + size_t sizes[4] = { + 2 * sizeof(double) * ARRAY_SIZE, + 2 * sizeof(double) * ARRAY_SIZE, + 3 * sizeof(double) * 
ARRAY_SIZE, + 3 * sizeof(double) * ARRAY_SIZE + }; + for (int i = 0; i < 4; i++) + { + // Get min/max; ignore the first result + auto minmax = std::minmax_element(timings[i].begin()+1, timings[i].end()); - std::cout - << std::left << std::setw(12) << "Mul" - << std::left << std::setw(12) << std::setprecision(3) << 1.0E-06 * (2 * sizeof(double) * ARRAY_SIZE)/(*mul_minmax.first) - << std::left << std::setw(12) << std::setprecision(5) << *mul_minmax.first - << std::left << std::setw(12) << std::setprecision(5) << *mul_minmax.second - << std::left << std::setw(12) << std::setprecision(5) << mul_average - << std::endl; + // Calculate average; ignore the first result + double average = std::accumulate(timings[i].begin()+1, timings[i].end(), 0.0) / (double)(ntimes - 1); + // Display results + std::cout + << std::left << std::setw(12) << labels[i] + << std::left << std::setw(12) << std::setprecision(3) << 1.0E-6 * sizes[i] / (*minmax.first) + << std::left << std::setw(12) << std::setprecision(5) << *minmax.first + << std::left << std::setw(12) << std::setprecision(5) << *minmax.second + << std::left << std::setw(12) << std::setprecision(5) << average + << std::endl; + - std::cout - << std::left << std::setw(12) << "Add" - << std::left << std::setw(12) << std::setprecision(3) << 1.0E-06 * (3 * sizeof(double) * ARRAY_SIZE)/(*add_minmax.first) - << std::left << std::setw(12) << std::setprecision(5) << *add_minmax.first - << std::left << std::setw(12) << std::setprecision(5) << *add_minmax.second - << std::left << std::setw(12) << std::setprecision(5) << add_average - << std::endl; - - std::cout - << std::left << std::setw(12) << "Triad" - << std::left << std::setw(12) << std::setprecision(3) << 1.0E-06 * (3 * sizeof(double) * ARRAY_SIZE)/(*triad_minmax.first) - << std::left << std::setw(12) << std::setprecision(5) << *triad_minmax.first - << std::left << std::setw(12) << std::setprecision(5) << *triad_minmax.second - << std::left << std::setw(12) << std::setprecision(5) << 
triad_average - << std::endl; + } delete[] stream; From 59fe9738b654b7cd2ad96e856ee7a0232a73cb76 Mon Sep 17 00:00:00 2001 From: Tom Deakin Date: Thu, 28 Apr 2016 12:03:50 +0100 Subject: [PATCH 024/117] Add a templated run function to make double/float switch easy --- src/main.cpp | 29 +++++++++++++++++++---------- 1 file changed, 19 insertions(+), 10 deletions(-) diff --git a/src/main.cpp b/src/main.cpp index 42dbe28..d09c3c9 100644 --- a/src/main.cpp +++ b/src/main.cpp @@ -20,6 +20,9 @@ const unsigned int ntimes = 10; template void check_solution(const unsigned int ntimes, std::vector& a, std::vector& b, std::vector& c); +template +void run(); + int main(int argc, char *argv[]) { std::cout @@ -27,16 +30,22 @@ int main(int argc, char *argv[]) << "Version: " << VERSION_STRING << std::endl << "Implementation: " << IMPLEMENTATION_STRING << std::endl; + run(); +} + +template +void run() +{ // Create host vectors - std::vector a(ARRAY_SIZE, 1.0); - std::vector b(ARRAY_SIZE, 2.0); - std::vector c(ARRAY_SIZE, 0.0); + std::vector a(ARRAY_SIZE, 1.0); + std::vector b(ARRAY_SIZE, 2.0); + std::vector c(ARRAY_SIZE, 0.0); - Stream *stream; + Stream *stream; // Use the CUDA implementation - stream = new CUDAStream(ARRAY_SIZE); + stream = new CUDAStream(ARRAY_SIZE); stream->write_arrays(a, b, c); @@ -77,7 +86,7 @@ int main(int argc, char *argv[]) // Check solutions stream->read_arrays(a, b, c); - check_solution(ntimes, a, b, c); + check_solution(ntimes, a, b, c); // Display timing results std::cout @@ -91,10 +100,10 @@ int main(int argc, char *argv[]) std::string labels[4] = {"Copy", "Mul", "Add", "Triad"}; size_t sizes[4] = { - 2 * sizeof(double) * ARRAY_SIZE, - 2 * sizeof(double) * ARRAY_SIZE, - 3 * sizeof(double) * ARRAY_SIZE, - 3 * sizeof(double) * ARRAY_SIZE + 2 * sizeof(T) * ARRAY_SIZE, + 2 * sizeof(T) * ARRAY_SIZE, + 3 * sizeof(T) * ARRAY_SIZE, + 3 * sizeof(T) * ARRAY_SIZE }; for (int i = 0; i < 4; i++) From a745ffc72413ae419afa997d780dda4b47266e3a Mon Sep 17 00:00:00 2001 
From: Tom Deakin Date: Thu, 28 Apr 2016 12:07:09 +0100 Subject: [PATCH 025/117] Add more keywords to CUDA header --- src/CUDAStream.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/CUDAStream.h b/src/CUDAStream.h index 89ad536..e6505c6 100644 --- a/src/CUDAStream.h +++ b/src/CUDAStream.h @@ -26,8 +26,8 @@ class CUDAStream : public Stream virtual void mul() override; virtual void triad() override; - void write_arrays(const std::vector& a, const std::vector& b, const std::vector& c); - void read_arrays(std::vector& a, std::vector& b, std::vector& c); + virtual void write_arrays(const std::vector& a, const std::vector& b, const std::vector& c) override; + virtual void read_arrays(std::vector& a, std::vector& b, std::vector& c) override; }; From 38e1e3b704a68e01cfcdb552d05f2acd44133578 Mon Sep 17 00:00:00 2001 From: Tom Deakin Date: Thu, 28 Apr 2016 12:59:14 +0100 Subject: [PATCH 026/117] Add starts of OpenCL implementation --- src/OCLStream.cpp | 77 +++++++++++++++++++++++++++++++++++++++++++++++ src/OCLStream.h | 43 ++++++++++++++++++++++++++ src/main.cpp | 8 +++-- 3 files changed, 126 insertions(+), 2 deletions(-) create mode 100644 src/OCLStream.cpp create mode 100644 src/OCLStream.h diff --git a/src/OCLStream.cpp b/src/OCLStream.cpp new file mode 100644 index 0000000..eac19c0 --- /dev/null +++ b/src/OCLStream.cpp @@ -0,0 +1,77 @@ + +#include "OCLStream.h" + +template +OCLStream::OCLStream(const unsigned int ARRAY_SIZE) +{ + array_size = ARRAY_SIZE; + + // Setup default OpenCL GPU + context = cl::Context::getDefault(); + //queue = cl::CommandQueue::getDefault(); + + // Create program + + std::string kernels{R"CLC( + + const double scalar = 3.0; + + kernel void copy( + global const double * restrict a, + global double * restrict c) + { + const size_t i = get_global_id(0); + c[i] = a[i]; + } + )CLC"}; + +std::cout << kernels << std::endl; + + //cl::Program program(kernels); + //program.build(); + +exit(-1); + + +} + +template +void 
OCLStream::copy() +{ + return; +} + +template +void OCLStream::mul() +{ + return; +} + +template +void OCLStream::add() +{ + return; +} + +template +void OCLStream::triad() +{ + return; +} + +template +void OCLStream::write_arrays(const std::vector& a, const std::vector& b, const std::vector& c) +{ + return; +} + +template +void OCLStream::read_arrays(std::vector& a, std::vector& b, std::vector& c) +{ + return; +} + + +template class OCLStream; +template class OCLStream; + diff --git a/src/OCLStream.h b/src/OCLStream.h new file mode 100644 index 0000000..a5cac61 --- /dev/null +++ b/src/OCLStream.h @@ -0,0 +1,43 @@ + +#include +#include + +#define CL_HPP_ENABLE_EXCEPTIONS +#define CL_HPP_TARGET_OPENCL_VERSION 120 +#define CL_HPP_MINIMUM_OPENCL_VERSION 120 + +#include "cl2.hpp" + +#include "Stream.h" + +template +class OCLStream : public Stream +{ + protected: + // Size of arrays + unsigned int array_size; + + // Device side pointers to arrays + cl::Buffer d_a; + cl::Buffer d_b; + cl::Buffer d_c; + + // OpenCL objects + cl::Context context; + cl::CommandQueue queue; + + public: + + OCLStream(const unsigned int); + ~OCLStream(); + + virtual void copy() override; + virtual void add() override; + virtual void mul() override; + virtual void triad() override; + + virtual void write_arrays(const std::vector& a, const std::vector& b, const std::vector& c) override; + virtual void read_arrays(std::vector& a, std::vector& b, std::vector& c) override; + +}; + diff --git a/src/main.cpp b/src/main.cpp index d09c3c9..39cfe86 100644 --- a/src/main.cpp +++ b/src/main.cpp @@ -10,7 +10,8 @@ #include "common.h" #include "Stream.h" -#include "CUDAStream.h" + +#include "OCLStream.h" const unsigned int ARRAY_SIZE = 52428800; @@ -45,7 +46,10 @@ void run() Stream *stream; // Use the CUDA implementation - stream = new CUDAStream(ARRAY_SIZE); + //stream = new CUDAStream(ARRAY_SIZE); + + // Use the OpenCL implementation + stream = new OCLStream(ARRAY_SIZE); stream->write_arrays(a, b, c); From 
eeaf9358ab5cb39e8437fce34c1ff4b3a2275061 Mon Sep 17 00:00:00 2001 From: Tom Deakin Date: Thu, 28 Apr 2016 15:01:43 +0100 Subject: [PATCH 027/117] Create OCL kernel functors --- src/OCLStream.cpp | 80 +++++++++++++++++++++++++++++++++++------------ src/OCLStream.h | 5 +++ 2 files changed, 65 insertions(+), 20 deletions(-) diff --git a/src/OCLStream.cpp b/src/OCLStream.cpp index eac19c0..f60c161 100644 --- a/src/OCLStream.cpp +++ b/src/OCLStream.cpp @@ -1,6 +1,47 @@ #include "OCLStream.h" +std::string kernels{R"CLC( + + constant TYPE scalar = 3.0; + + kernel void copy( + global const TYPE * restrict a, + global TYPE * restrict c) + { + const size_t i = get_global_id(0); + c[i] = a[i]; + } + + kernel void mul( + global TYPE * restrict b, + global const TYPE * restrict c) + { + const size_t i = get_global_id(0); + b[i] = scalar * c[i]; + } + + kernel void add( + global const TYPE * restrict a, + global const TYPE * restrict b, + global TYPE * restrict c) + { + const size_t i = get_global_id(0); + c[i] = a[i] + b[i]; + } + + kernel void triad( + global TYPE * restrict a, + global const TYPE * restrict b, + global const TYPE * restrict c) + { + const size_t i = get_global_id(0); + a[i] = b[i] + scalar * c[i]; + } + +)CLC"}; + + template OCLStream::OCLStream(const unsigned int ARRAY_SIZE) { @@ -8,31 +49,30 @@ OCLStream::OCLStream(const unsigned int ARRAY_SIZE) // Setup default OpenCL GPU context = cl::Context::getDefault(); - //queue = cl::CommandQueue::getDefault(); + queue = cl::CommandQueue::getDefault(); // Create program + cl::Program program(kernels); + if (sizeof(T) == sizeof(double)) + program.build("-DTYPE=double"); + else if (sizeof(T) == sizeof(float)) + program.build("-DTYPE=float"); - std::string kernels{R"CLC( - - const double scalar = 3.0; - - kernel void copy( - global const double * restrict a, - global double * restrict c) - { - const size_t i = get_global_id(0); - c[i] = a[i]; - } - )CLC"}; - -std::cout << kernels << std::endl; - - //cl::Program 
program(kernels); - //program.build(); - -exit(-1); + // Create kernels + copy_kernel = new cl::KernelFunctor(program, "copy"); + mul_kernel = new cl::KernelFunctor(program, "mul"); + add_kernel = new cl::KernelFunctor(program, "add"); + triad_kernel = new cl::KernelFunctor(program, "triad"); +} +template +OCLStream::~OCLStream() +{ + delete[] copy_kernel; + delete[] mul_kernel; + delete[] add_kernel; + delete[] triad_kernel; } template diff --git a/src/OCLStream.h b/src/OCLStream.h index a5cac61..28a9be1 100644 --- a/src/OCLStream.h +++ b/src/OCLStream.h @@ -26,6 +26,11 @@ class OCLStream : public Stream cl::Context context; cl::CommandQueue queue; + cl::KernelFunctor *copy_kernel; + cl::KernelFunctor * mul_kernel; + cl::KernelFunctor *add_kernel; + cl::KernelFunctor *triad_kernel; + public: OCLStream(const unsigned int); From 77f6df856cceab0c2ac99bad1fd24669d54b84c7 Mon Sep 17 00:00:00 2001 From: Tom Deakin Date: Thu, 28 Apr 2016 15:05:01 +0100 Subject: [PATCH 028/117] Call kernels in OCL --- src/OCLStream.cpp | 24 ++++++++++++++++++++---- 1 file changed, 20 insertions(+), 4 deletions(-) diff --git a/src/OCLStream.cpp b/src/OCLStream.cpp index f60c161..4aa2da6 100644 --- a/src/OCLStream.cpp +++ b/src/OCLStream.cpp @@ -78,25 +78,41 @@ OCLStream::~OCLStream() template void OCLStream::copy() { - return; + (*copy_kernel)( + cl::EnqueueArgs(queue, cl::NDRange(array_size)), + d_a, d_c + ); + queue.finish(); } template void OCLStream::mul() { - return; + (*mul_kernel)( + cl::EnqueueArgs(queue, cl::NDRange(array_size)), + d_b, d_c + ); + queue.finish(); } template void OCLStream::add() { - return; + (*add_kernel)( + cl::EnqueueArgs(queue, cl::NDRange(array_size)), + d_a, d_b, d_c + ); + queue.finish(); } template void OCLStream::triad() { - return; + (*triad_kernel)( + cl::EnqueueArgs(queue, cl::NDRange(array_size)), + d_a, d_b, d_c + ); + queue.finish(); } template From b5149691935822904b38a7749e39777feddf1cd6 Mon Sep 17 00:00:00 2001 From: Tom Deakin Date: Thu, 28 Apr 
2016 15:08:12 +0100 Subject: [PATCH 029/117] Create OCL device buffers --- src/OCLStream.cpp | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/src/OCLStream.cpp b/src/OCLStream.cpp index 4aa2da6..9e27aa9 100644 --- a/src/OCLStream.cpp +++ b/src/OCLStream.cpp @@ -64,6 +64,11 @@ OCLStream::OCLStream(const unsigned int ARRAY_SIZE) add_kernel = new cl::KernelFunctor(program, "add"); triad_kernel = new cl::KernelFunctor(program, "triad"); + // Create buffers + d_a = cl::Buffer(context, CL_MEM_READ_WRITE, sizeof(T) * ARRAY_SIZE); + d_b = cl::Buffer(context, CL_MEM_READ_WRITE, sizeof(T) * ARRAY_SIZE); + d_c = cl::Buffer(context, CL_MEM_READ_WRITE, sizeof(T) * ARRAY_SIZE); + } template @@ -118,6 +123,7 @@ void OCLStream::triad() template void OCLStream::write_arrays(const std::vector& a, const std::vector& b, const std::vector& c) { + return; } From 088778977b3719e3658de91d1d65fdddb2a5844a Mon Sep 17 00:00:00 2001 From: Tom Deakin Date: Thu, 28 Apr 2016 15:11:02 +0100 Subject: [PATCH 030/117] Add OCL copy functions --- src/OCLStream.cpp | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/src/OCLStream.cpp b/src/OCLStream.cpp index 9e27aa9..e5145b4 100644 --- a/src/OCLStream.cpp +++ b/src/OCLStream.cpp @@ -123,14 +123,17 @@ void OCLStream::triad() template void OCLStream::write_arrays(const std::vector& a, const std::vector& b, const std::vector& c) { - - return; + cl::copy(a.begin(), a.end(), d_a); + cl::copy(b.begin(), b.end(), d_b); + cl::copy(c.begin(), c.end(), d_c); } template void OCLStream::read_arrays(std::vector& a, std::vector& b, std::vector& c) { - return; + cl::copy(d_a, a.begin(), a.end()); + cl::copy(d_b, b.begin(), b.end()); + cl::copy(d_c, c.begin(), c.end()); } From b9e70e11ab34dbeed383ca145f1a79cfc0bfcc43 Mon Sep 17 00:00:00 2001 From: Tom Deakin Date: Thu, 28 Apr 2016 16:58:32 +0100 Subject: [PATCH 031/117] Add CMakeLists.txt file with CUDA and OCL builds --- CMakeLists.txt | 19 +++++++++++++++++++ src/main.cpp | 10 
+++++++++- 2 files changed, 28 insertions(+), 1 deletion(-) create mode 100644 CMakeLists.txt diff --git a/CMakeLists.txt b/CMakeLists.txt new file mode 100644 index 0000000..5e1abd6 --- /dev/null +++ b/CMakeLists.txt @@ -0,0 +1,19 @@ + +cmake_minimum_required(VERSION 2.8) + +project(gpu-stream) + +set(CMAKE_SOURCE_DIR src/) + +list(APPEND CMAKE_CXX_FLAGS --std=c++11) + +find_package(CUDA REQUIRED) + +cuda_add_executable(cuda.exe src/main.cpp src/CUDAStream.cu) +target_compile_definitions(cuda.exe PUBLIC CUDA) + +find_package(OpenCL) +add_executable(ocl.exe src/main.cpp src/OCLStream.cpp) +target_compile_definitions(ocl.exe PUBLIC OCL) +target_link_libraries(ocl.exe ${OpenCL_LIBRARY}) + diff --git a/src/main.cpp b/src/main.cpp index 39cfe86..008a6a2 100644 --- a/src/main.cpp +++ b/src/main.cpp @@ -11,7 +11,11 @@ #include "common.h" #include "Stream.h" +#if defined(CUDA) +#include "CUDAStream.h" +#elif defined(OCL) #include "OCLStream.h" +#endif const unsigned int ARRAY_SIZE = 52428800; @@ -45,12 +49,16 @@ void run() Stream *stream; +#if defined(CUDA) // Use the CUDA implementation - //stream = new CUDAStream(ARRAY_SIZE); + stream = new CUDAStream(ARRAY_SIZE); +#elif defined(OCL) // Use the OpenCL implementation stream = new OCLStream(ARRAY_SIZE); +#endif + stream->write_arrays(a, b, c); // List of times From 7006871cbe0beda6c9c444c495f34dff23af696e Mon Sep 17 00:00:00 2001 From: Tom Deakin Date: Thu, 28 Apr 2016 17:10:14 +0100 Subject: [PATCH 032/117] Get version from CMake configued header and only build implementations which have the runtime around --- CMakeLists.txt | 20 ++++++++++++++++---- src/common.h | 4 ---- src/common.h.in | 4 ++++ 3 files changed, 20 insertions(+), 8 deletions(-) delete mode 100644 src/common.h create mode 100644 src/common.h.in diff --git a/CMakeLists.txt b/CMakeLists.txt index 5e1abd6..5bb8a4b 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -3,17 +3,29 @@ cmake_minimum_required(VERSION 2.8) project(gpu-stream) -set(CMAKE_SOURCE_DIR 
src/) +set(gpu-stream_VERSION_MAJOR 2) +set(gpu-stream_VERSION_MINOR 0) list(APPEND CMAKE_CXX_FLAGS --std=c++11) -find_package(CUDA REQUIRED) - +find_package(CUDA QUIET) +if (${CUDA_FOUND}) +set(IMPLEMENTATION CUDA) +configure_file(src/common.h.in src/common_cuda.h) cuda_add_executable(cuda.exe src/main.cpp src/CUDAStream.cu) target_compile_definitions(cuda.exe PUBLIC CUDA) +else (${CUDA_FOUND}) +message("Skipping CUDA...") +endif (${CUDA_FOUND}) -find_package(OpenCL) +find_package(OpenCL QUIET) +if (${OpenCL_FOUND}) +set(gpu-stream_IMPLEMENTATION OpenCL) +configure_file(src/common.h.in src/common_ocl.h) add_executable(ocl.exe src/main.cpp src/OCLStream.cpp) target_compile_definitions(ocl.exe PUBLIC OCL) target_link_libraries(ocl.exe ${OpenCL_LIBRARY}) +else (${OpenCL_FOUND}) +message("Skipping OpenCL...") +endif (${OpenCL_FOUND}) diff --git a/src/common.h b/src/common.h deleted file mode 100644 index c4bdadb..0000000 --- a/src/common.h +++ /dev/null @@ -1,4 +0,0 @@ - -#define VERSION_STRING "2.0" -#define IMPLEMENTATION_STRING "CUDA" - diff --git a/src/common.h.in b/src/common.h.in new file mode 100644 index 0000000..44dc040 --- /dev/null +++ b/src/common.h.in @@ -0,0 +1,4 @@ + +#define VERSION_STRING "@gpu-stream_VERSION_MAJOR@.@gpu-stream_VERSION_MINOR@" +#define IMPLEMENTATION_STRING "@gpu-stream_IMPLEMENTATION@" + From a1cab96c57ecbf9c68dd7623251b8cbd7e80417d Mon Sep 17 00:00:00 2001 From: Tom Deakin Date: Thu, 28 Apr 2016 17:20:40 +0100 Subject: [PATCH 033/117] Define the implementation strings in each implementation header --- CMakeLists.txt | 20 +++++++++----------- src/CUDAStream.h | 2 ++ src/OCLStream.h | 2 ++ src/common.h.in | 1 - 4 files changed, 13 insertions(+), 12 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 5bb8a4b..8822353 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -8,24 +8,22 @@ set(gpu-stream_VERSION_MINOR 0) list(APPEND CMAKE_CXX_FLAGS --std=c++11) +configure_file(src/common.h.in src/common.h) + find_package(CUDA 
QUIET) if (${CUDA_FOUND}) -set(IMPLEMENTATION CUDA) -configure_file(src/common.h.in src/common_cuda.h) -cuda_add_executable(cuda.exe src/main.cpp src/CUDAStream.cu) -target_compile_definitions(cuda.exe PUBLIC CUDA) + cuda_add_executable(cuda.exe src/main.cpp src/CUDAStream.cu) + target_compile_definitions(cuda.exe PUBLIC CUDA) else (${CUDA_FOUND}) -message("Skipping CUDA...") + message("Skipping CUDA...") endif (${CUDA_FOUND}) find_package(OpenCL QUIET) if (${OpenCL_FOUND}) -set(gpu-stream_IMPLEMENTATION OpenCL) -configure_file(src/common.h.in src/common_ocl.h) -add_executable(ocl.exe src/main.cpp src/OCLStream.cpp) -target_compile_definitions(ocl.exe PUBLIC OCL) -target_link_libraries(ocl.exe ${OpenCL_LIBRARY}) + add_executable(ocl.exe src/main.cpp src/OCLStream.cpp) + target_compile_definitions(ocl.exe PUBLIC OCL) + target_link_libraries(ocl.exe ${OpenCL_LIBRARY}) else (${OpenCL_FOUND}) -message("Skipping OpenCL...") + message("Skipping OpenCL...") endif (${OpenCL_FOUND}) diff --git a/src/CUDAStream.h b/src/CUDAStream.h index e6505c6..34e0303 100644 --- a/src/CUDAStream.h +++ b/src/CUDAStream.h @@ -4,6 +4,8 @@ #include "Stream.h" +#define IMPLEMENTATION_STRING "CUDA" + template class CUDAStream : public Stream { diff --git a/src/OCLStream.h b/src/OCLStream.h index 28a9be1..f9c133e 100644 --- a/src/OCLStream.h +++ b/src/OCLStream.h @@ -10,6 +10,8 @@ #include "Stream.h" +#define IMPLEMENTATION_STRING "OpenCL" + template class OCLStream : public Stream { diff --git a/src/common.h.in b/src/common.h.in index 44dc040..fbf953c 100644 --- a/src/common.h.in +++ b/src/common.h.in @@ -1,4 +1,3 @@ #define VERSION_STRING "@gpu-stream_VERSION_MAJOR@.@gpu-stream_VERSION_MINOR@" -#define IMPLEMENTATION_STRING "@gpu-stream_IMPLEMENTATION@" From d1f8cd1b481d773790508d38545fc0f6856744a5 Mon Sep 17 00:00:00 2001 From: Tom Deakin Date: Thu, 28 Apr 2016 23:06:06 +0100 Subject: [PATCH 034/117] Implement some CUDA routines for device info --- src/CUDAStream.cu | 22 
++++++++++++++++++++++ src/CUDAStream.h | 4 ++++ src/Stream.h | 7 ++++--- 3 files changed, 30 insertions(+), 3 deletions(-) diff --git a/src/CUDAStream.cu b/src/CUDAStream.cu index 7c9afb3..3a65f15 100644 --- a/src/CUDAStream.cu +++ b/src/CUDAStream.cu @@ -134,6 +134,28 @@ void CUDAStream::triad() check_error(); } +template +std::string CUDAStream::getDeviceName(const int device) +{ + cudaSetDevice(device); + check_error(); + cudaDeviceProp props; + cudaGetDeviceProperties(&props, device); + check_error(); + return std::string(props.name); +} + +template +std::string CUDAStream::getDeviceDriver(const int device) +{ + cudaSetDevice(device); + check_error(); + int driver; + cudaDriverGetVersion(&driver); + check_error(); + return std::to_string(driver); +} + template class CUDAStream; template class CUDAStream; diff --git a/src/CUDAStream.h b/src/CUDAStream.h index 34e0303..c741f5c 100644 --- a/src/CUDAStream.h +++ b/src/CUDAStream.h @@ -31,5 +31,9 @@ class CUDAStream : public Stream virtual void write_arrays(const std::vector& a, const std::vector& b, const std::vector& c) override; virtual void read_arrays(std::vector& a, std::vector& b, std::vector& c) override; + static void listDevices(void); + static std::string getDeviceName(const int device); + static std::string getDeviceDriver(const int device); + }; diff --git a/src/Stream.h b/src/Stream.h index ecf043b..573af83 100644 --- a/src/Stream.h +++ b/src/Stream.h @@ -2,6 +2,7 @@ #pragma once #include +#include template class Stream @@ -19,9 +20,9 @@ class Stream virtual void read_arrays(std::vector& a, std::vector& b, std::vector& c) = 0; // Implementation specific device functions - static std::vector getDeviceList(); - static std::vector getDeviceName(); - static std::vector getDeviceDriver(); + static void listDevices(void); + static std::string getDeviceName(const int); + static std::string getDeviceDriver(const int); }; From f5ba77f4bd7fd99043eb9cf816172ac144a19aa5 Mon Sep 17 00:00:00 2001 From: Tom Deakin 
Date: Thu, 28 Apr 2016 23:20:10 +0100 Subject: [PATCH 035/117] List CUDA devices function --- src/CUDAStream.cu | 28 ++++++++++++++++++++++++++-- 1 file changed, 26 insertions(+), 2 deletions(-) diff --git a/src/CUDAStream.cu b/src/CUDAStream.cu index 3a65f15..2805c1e 100644 --- a/src/CUDAStream.cu +++ b/src/CUDAStream.cu @@ -134,11 +134,35 @@ void CUDAStream::triad() check_error(); } + +template +void CUDAStream::listDevices(void) +{ + // Get number of devices + int count; + cudaGetDeviceCount(&count); + check_error(); + + // Print device names + if (count == 0) + { + std::cout << "No devices found." << std::endl; + } + else + { + std::cout << std::endl; + std::cout << "Devices:" << std::endl; + for (int i = 0; i < count; i++) + { + std::cout << i << ": " << getDeviceName(i) << std::endl; + } + std::cout << std::endl; + } +} + template std::string CUDAStream::getDeviceName(const int device) { - cudaSetDevice(device); - check_error(); cudaDeviceProp props; cudaGetDeviceProperties(&props, device); check_error(); From 00305ba120580daec77fb8923f4cd834d5005da2 Mon Sep 17 00:00:00 2001 From: Tom Deakin Date: Thu, 28 Apr 2016 23:37:53 +0100 Subject: [PATCH 036/117] Write to std err --- src/CUDAStream.cu | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/CUDAStream.cu b/src/CUDAStream.cu index 2805c1e..7f5f25d 100644 --- a/src/CUDAStream.cu +++ b/src/CUDAStream.cu @@ -146,7 +146,7 @@ void CUDAStream::listDevices(void) // Print device names if (count == 0) { - std::cout << "No devices found." << std::endl; + std::cerr << "No devices found." 
<< std::endl; } else { From 1a96b71935b2ee38d4f84be352d3d6df8e89d551 Mon Sep 17 00:00:00 2001 From: Tom Deakin Date: Fri, 29 Apr 2016 13:59:31 +0100 Subject: [PATCH 037/117] First attempt at parse args --- src/main.cpp | 39 ++++++++++++++++++++++++++++++++++++++- 1 file changed, 38 insertions(+), 1 deletion(-) diff --git a/src/main.cpp b/src/main.cpp index 008a6a2..cabac5f 100644 --- a/src/main.cpp +++ b/src/main.cpp @@ -7,6 +7,7 @@ #include #include #include +#include #include "common.h" #include "Stream.h" @@ -20,6 +21,7 @@ const unsigned int ARRAY_SIZE = 52428800; const unsigned int ntimes = 10; +unsigned int deviceIndex = 0; template @@ -28,6 +30,8 @@ void check_solution(const unsigned int ntimes, std::vector& a, std::vector template void run(); +void parseArguments(int argc, char *argv[]); + int main(int argc, char *argv[]) { std::cout @@ -35,7 +39,9 @@ int main(int argc, char *argv[]) << "Version: " << VERSION_STRING << std::endl << "Implementation: " << IMPLEMENTATION_STRING << std::endl; - run(); + parseArguments(argc, argv); + + run(); } @@ -186,3 +192,34 @@ void check_solution(const unsigned int ntimes, std::vector& a, std::vector } +int parseUInt(const char *str, unsigned int *output) +{ + std::size_t next; + *output = std::stoul(str, &next); + return !next; +} + +void parseArguments(int argc, char *argv[]) +{ + for (int i = 1; i < argc; i++) + { + if (!std::string("--list").compare(argv[i])) + { + #if defined(CUDA) + CUDAStream::listDevices(); + #elif defined(OCL) + OCLStream::listDevices(); + #endif + exit(EXIT_SUCCESS); + } + else if (!std::string("--device").compare(argv[i])) + { + if (++i >= argc || !parseUInt(argv[i], &deviceIndex)) + { + std::cerr << "Invalid device index." 
<< std::endl; + exit(EXIT_FAILURE); + } + } + } +} + From 3c394b9db08f58855a6e9efea9e8f9e48ebaab08 Mon Sep 17 00:00:00 2001 From: Tom Deakin Date: Fri, 29 Apr 2016 18:28:21 +0100 Subject: [PATCH 038/117] Move device functions outside class --- src/CUDAStream.cu | 11 +++++------ src/CUDAStream.h | 4 ---- src/Stream.h | 10 +++++----- src/main.cpp | 6 +----- 4 files changed, 11 insertions(+), 20 deletions(-) diff --git a/src/CUDAStream.cu b/src/CUDAStream.cu index 7f5f25d..e7ce539 100644 --- a/src/CUDAStream.cu +++ b/src/CUDAStream.cu @@ -135,8 +135,7 @@ void CUDAStream::triad() } -template -void CUDAStream::listDevices(void) +void listDevices(void) { // Get number of devices int count; @@ -160,8 +159,8 @@ void CUDAStream::listDevices(void) } } -template -std::string CUDAStream::getDeviceName(const int device) + +std::string getDeviceName(const int device) { cudaDeviceProp props; cudaGetDeviceProperties(&props, device); @@ -169,8 +168,8 @@ std::string CUDAStream::getDeviceName(const int device) return std::string(props.name); } -template -std::string CUDAStream::getDeviceDriver(const int device) + +std::string getDeviceDriver(const int device) { cudaSetDevice(device); check_error(); diff --git a/src/CUDAStream.h b/src/CUDAStream.h index c741f5c..34e0303 100644 --- a/src/CUDAStream.h +++ b/src/CUDAStream.h @@ -31,9 +31,5 @@ class CUDAStream : public Stream virtual void write_arrays(const std::vector& a, const std::vector& b, const std::vector& c) override; virtual void read_arrays(std::vector& a, std::vector& b, std::vector& c) override; - static void listDevices(void); - static std::string getDeviceName(const int device); - static std::string getDeviceDriver(const int device); - }; diff --git a/src/Stream.h b/src/Stream.h index 573af83..c31d62d 100644 --- a/src/Stream.h +++ b/src/Stream.h @@ -19,10 +19,10 @@ class Stream virtual void write_arrays(const std::vector& a, const std::vector& b, const std::vector& c) = 0; virtual void read_arrays(std::vector& a, 
std::vector& b, std::vector& c) = 0; - // Implementation specific device functions - static void listDevices(void); - static std::string getDeviceName(const int); - static std::string getDeviceDriver(const int); - }; + +// Implementation specific device functions +static void listDevices(void); +static std::string getDeviceName(const int); +static std::string getDeviceDriver(const int); diff --git a/src/main.cpp b/src/main.cpp index cabac5f..0b9bc90 100644 --- a/src/main.cpp +++ b/src/main.cpp @@ -205,11 +205,7 @@ void parseArguments(int argc, char *argv[]) { if (!std::string("--list").compare(argv[i])) { - #if defined(CUDA) - CUDAStream::listDevices(); - #elif defined(OCL) - OCLStream::listDevices(); - #endif + listDevices(); exit(EXIT_SUCCESS); } else if (!std::string("--device").compare(argv[i])) From d557915007e358b1df55e21f6ee71bfd26b87fe4 Mon Sep 17 00:00:00 2001 From: Tom Deakin Date: Fri, 29 Apr 2016 18:36:47 +0100 Subject: [PATCH 039/117] Remove static keyword --- src/Stream.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/Stream.h b/src/Stream.h index c31d62d..527fe6a 100644 --- a/src/Stream.h +++ b/src/Stream.h @@ -23,6 +23,6 @@ class Stream // Implementation specific device functions -static void listDevices(void); -static std::string getDeviceName(const int); -static std::string getDeviceDriver(const int); +void listDevices(void); +std::string getDeviceName(const int); +std::string getDeviceDriver(const int); From 2cb4fe74b15017acdf67c65e0c6b574f7f738eee Mon Sep 17 00:00:00 2001 From: Tom Deakin Date: Fri, 29 Apr 2016 18:38:49 +0100 Subject: [PATCH 040/117] Use original parseUInt function --- src/main.cpp | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/main.cpp b/src/main.cpp index 0b9bc90..28b5ab5 100644 --- a/src/main.cpp +++ b/src/main.cpp @@ -7,7 +7,7 @@ #include #include #include -#include +#include #include "common.h" #include "Stream.h" @@ -194,9 +194,9 @@ void check_solution(const 
unsigned int ntimes, std::vector& a, std::vector int parseUInt(const char *str, unsigned int *output) { - std::size_t next; - *output = std::stoul(str, &next); - return !next; + char *next; + *output = strtoul(str, &next, 10); + return !strlen(next); } void parseArguments(int argc, char *argv[]) From 72ddd05f61507f303143826d3e7db2ca395694d8 Mon Sep 17 00:00:00 2001 From: Tom Deakin Date: Fri, 29 Apr 2016 18:45:57 +0100 Subject: [PATCH 041/117] Add parse arguments code --- src/main.cpp | 55 +++++++++++++++++++++++++++++++++++++++++++++++----- 1 file changed, 50 insertions(+), 5 deletions(-) diff --git a/src/main.cpp b/src/main.cpp index 28b5ab5..11d11bb 100644 --- a/src/main.cpp +++ b/src/main.cpp @@ -19,9 +19,10 @@ #endif -const unsigned int ARRAY_SIZE = 52428800; -const unsigned int ntimes = 10; +unsigned int ARRAY_SIZE = 52428800; +unsigned int num_times = 10; unsigned int deviceIndex = 0; +bool use_float = false; template @@ -74,7 +75,7 @@ void run() std::chrono::high_resolution_clock::time_point t1, t2; // Main loop - for (unsigned int k = 0; k < ntimes; k++) + for (unsigned int k = 0; k < num_times; k++) { // Execute Copy t1 = std::chrono::high_resolution_clock::now(); @@ -104,7 +105,7 @@ void run() // Check solutions stream->read_arrays(a, b, c); - check_solution(ntimes, a, b, c); + check_solution(num_times, a, b, c); // Display timing results std::cout @@ -130,7 +131,7 @@ void run() auto minmax = std::minmax_element(timings[i].begin()+1, timings[i].end()); // Calculate average; ignore the first result - double average = std::accumulate(timings[i].begin()+1, timings[i].end(), 0.0) / (double)(ntimes - 1); + double average = std::accumulate(timings[i].begin()+1, timings[i].end(), 0.0) / (double)(num_times - 1); // Display results std::cout @@ -216,6 +217,50 @@ void parseArguments(int argc, char *argv[]) exit(EXIT_FAILURE); } } + else if (!std::string("--arraysize").compare(argv[i]) || + !std::string("-s").compare(argv[i])) + { + if (++i >= argc || 
!parseUInt(argv[i], &ARRAY_SIZE)) + { + std::cerr << "Invalid array size." << std::endl; + exit(EXIT_FAILURE); + } + } + else if (!std::string("--numtimes").compare(argv[i]) || + !std::string("-n").compare(argv[i])) + { + if (++i >= argc || !parseUInt(argv[i], &num_times)) + { + std::cerr << "Invalid number of times." << std::endl; + exit(EXIT_FAILURE); + } + } + else if (!std::string("--float").compare(argv[i])) + { + use_float = true; + std::cout << "Warning: If number of iterations set >= 8, expect rounding errors with single precision" << std::endl; + } + else if (!std::string("--help").compare(argv[i]) || + !std::string("-h").compare(argv[i])) + { + std::cout << std::endl; + std::cout << "Usage: " << argv[0] << " [OPTIONS]" << std::endl << std::endl; + std::cout << "Options:" << std::endl; + std::cout << " -h --help Print the message" << std::endl; + std::cout << " --list List available devices" << std::endl; + std::cout << " --device INDEX Select device at INDEX" << std::endl; + std::cout << " -s --arraysize SIZE Use SIZE elements in the array" << std::endl; + std::cout << " -n --numtimes NUM Run the test NUM times (NUM >= 2)" << std::endl; + std::cout << " --float Use floats (rather than doubles)" << std::endl; + std::cout << std::endl; + exit(EXIT_SUCCESS); + } + else + { + std::cerr << "Unrecognized argument '" << argv[i] << "' (try '--help')" + << std::endl; + exit(EXIT_FAILURE); + } } } From ac553589646602b0c70f55abe3423734bf2c6b3e Mon Sep 17 00:00:00 2001 From: Tom Deakin Date: Tue, 3 May 2016 10:51:16 +0100 Subject: [PATCH 042/117] Implement device info functions --- src/OCLStream.cpp | 83 +++++++++++++++++++++++++++++++++++++++++++++++ src/OCLStream.h | 4 +++ 2 files changed, 87 insertions(+) diff --git a/src/OCLStream.cpp b/src/OCLStream.cpp index e5145b4..34b7933 100644 --- a/src/OCLStream.cpp +++ b/src/OCLStream.cpp @@ -136,6 +136,89 @@ void OCLStream::read_arrays(std::vector& a, std::vector& b, std::vector cl::copy(d_c, c.begin(), c.end()); } +// 
Cache list of devices +bool cached = false; +std::vector devices; + +void getDeviceList(void) +{ + // Get list of platforms + std::vector platforms; + cl::Platform::get(&platforms); + + // Enumerate devices + for (unsigned i = 0; i < platforms.size(); i++) + { + std::vector plat_devices; + platforms[i].getDevices(CL_DEVICE_TYPE_ALL, &plat_devices); + devices.insert(devices.end(), plat_devices.begin(), plat_devices.end()); + } + cached = true; +} + +void listDevices(void) +{ + getDeviceList(); + + // Print device names + if (devices.size() == 0) + { + std::cerr << "No devices found." << std::endl; + } + else + { + std::cout << std::endl; + std::cout << "Devices:" << std::endl; + for (int i = 0; i < devices.size(); i++) + { + std::cout << i << ": " << getDeviceName(i) << std::endl; + } + std::cout << std::endl; + } + + +} + +std::string getDeviceName(const int device) +{ + if (!cached) + getDeviceList(); + + std::string name; + cl_device_info info = CL_DEVICE_NAME; + + if (device < devices.size()) + { + devices[device].getInfo(info, &name); + } + else + { + throw std::runtime_error("Error asking for name for non-existant device"); + } + + return name; + +} + +std::string getDeviceDriver(const int device) +{ + if (!cached) + getDeviceList(); + + std::string driver; + + if (device < devices.size()) + { + devices[device].getInfo(CL_DRIVER_VERSION, &driver); + } + else + { + throw std::runtime_error("Error asking for driver for non-existant device"); + } + + return driver; +} + template class OCLStream; template class OCLStream; diff --git a/src/OCLStream.h b/src/OCLStream.h index f9c133e..551cf28 100644 --- a/src/OCLStream.h +++ b/src/OCLStream.h @@ -1,4 +1,6 @@ +#pragma once + #include #include @@ -48,3 +50,5 @@ class OCLStream : public Stream }; +// Populate the devices list +void getDeviceList(void); From 77b521f5f0441f4d0bdd9f7bb3f73a03e210a83a Mon Sep 17 00:00:00 2001 From: Tom Deakin Date: Tue, 3 May 2016 10:52:27 +0100 Subject: [PATCH 043/117] Use float or double 
from CLI --- src/main.cpp | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/src/main.cpp b/src/main.cpp index 11d11bb..7ec7873 100644 --- a/src/main.cpp +++ b/src/main.cpp @@ -42,7 +42,10 @@ int main(int argc, char *argv[]) parseArguments(argc, argv); - run(); + if (use_float) + run(); + else + run(); } From d7c17d72d56247a04bcf8c58ca4586282d1e234b Mon Sep 17 00:00:00 2001 From: Tom Deakin Date: Tue, 3 May 2016 11:02:33 +0100 Subject: [PATCH 044/117] Use device index from CLI in OpenCL --- src/OCLStream.cpp | 40 ++++++++++++++++++++++++++-------------- src/OCLStream.h | 3 ++- src/main.cpp | 5 ++--- 3 files changed, 30 insertions(+), 18 deletions(-) diff --git a/src/OCLStream.cpp b/src/OCLStream.cpp index 34b7933..422b409 100644 --- a/src/OCLStream.cpp +++ b/src/OCLStream.cpp @@ -1,6 +1,11 @@ #include "OCLStream.h" +// Cache list of devices +bool cached = false; +std::vector devices; +void getDeviceList(void); + std::string kernels{R"CLC( constant TYPE scalar = 3.0; @@ -43,16 +48,27 @@ std::string kernels{R"CLC( template -OCLStream::OCLStream(const unsigned int ARRAY_SIZE) +OCLStream::OCLStream(const unsigned int ARRAY_SIZE, const int device_index) { + if (!cached) + getDeviceList(); + array_size = ARRAY_SIZE; // Setup default OpenCL GPU - context = cl::Context::getDefault(); - queue = cl::CommandQueue::getDefault(); + if (device_index >= devices.size()) + throw std::runtime_error("Invalid device index"); + device = devices[device_index]; + + // Print out device information + std::cout << "Using OpenCL device " << getDeviceName(device_index) << std::endl; + std::cout << "Driver: " << getDeviceDriver(device_index) << std::endl; + + context = cl::Context(device); + queue = cl::CommandQueue(context); // Create program - cl::Program program(kernels); + cl::Program program(context, kernels); if (sizeof(T) == sizeof(double)) program.build("-DTYPE=double"); else if (sizeof(T) == sizeof(float)) @@ -123,23 +139,19 @@ void OCLStream::triad() template void 
OCLStream::write_arrays(const std::vector& a, const std::vector& b, const std::vector& c) { - cl::copy(a.begin(), a.end(), d_a); - cl::copy(b.begin(), b.end(), d_b); - cl::copy(c.begin(), c.end(), d_c); + cl::copy(queue, a.begin(), a.end(), d_a); + cl::copy(queue, b.begin(), b.end(), d_b); + cl::copy(queue, c.begin(), c.end(), d_c); } template void OCLStream::read_arrays(std::vector& a, std::vector& b, std::vector& c) { - cl::copy(d_a, a.begin(), a.end()); - cl::copy(d_b, b.begin(), b.end()); - cl::copy(d_c, c.begin(), c.end()); + cl::copy(queue, d_a, a.begin(), a.end()); + cl::copy(queue, d_b, b.begin(), b.end()); + cl::copy(queue, d_c, c.begin(), c.end()); } -// Cache list of devices -bool cached = false; -std::vector devices; - void getDeviceList(void) { // Get list of platforms diff --git a/src/OCLStream.h b/src/OCLStream.h index 551cf28..3fd144a 100644 --- a/src/OCLStream.h +++ b/src/OCLStream.h @@ -27,6 +27,7 @@ class OCLStream : public Stream cl::Buffer d_c; // OpenCL objects + cl::Device device; cl::Context context; cl::CommandQueue queue; @@ -37,7 +38,7 @@ class OCLStream : public Stream public: - OCLStream(const unsigned int); + OCLStream(const unsigned int, const int); ~OCLStream(); virtual void copy() override; diff --git a/src/main.cpp b/src/main.cpp index 7ec7873..1171180 100644 --- a/src/main.cpp +++ b/src/main.cpp @@ -61,11 +61,11 @@ void run() #if defined(CUDA) // Use the CUDA implementation - stream = new CUDAStream(ARRAY_SIZE); + stream = new CUDAStream(ARRAY_SIZE, deviceIndex); #elif defined(OCL) // Use the OpenCL implementation - stream = new OCLStream(ARRAY_SIZE); + stream = new OCLStream(ARRAY_SIZE, deviceIndex); #endif @@ -144,7 +144,6 @@ void run() << std::left << std::setw(12) << std::setprecision(5) << *minmax.second << std::left << std::setw(12) << std::setprecision(5) << average << std::endl; - } From 3462e61c162becfb38f46e684a70badc21eae604 Mon Sep 17 00:00:00 2001 From: Tom Deakin Date: Tue, 3 May 2016 11:05:21 +0100 Subject: [PATCH 
045/117] Check device support float --- src/OCLStream.cpp | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/src/OCLStream.cpp b/src/OCLStream.cpp index 422b409..ed419b5 100644 --- a/src/OCLStream.cpp +++ b/src/OCLStream.cpp @@ -70,7 +70,12 @@ OCLStream::OCLStream(const unsigned int ARRAY_SIZE, const int device_index) // Create program cl::Program program(context, kernels); if (sizeof(T) == sizeof(double)) + { + // Check device can do double + if (!device.getInfo()) + throw std::runtime_error("Device does not support double precision, please use --float"); program.build("-DTYPE=double"); + } else if (sizeof(T) == sizeof(float)) program.build("-DTYPE=float"); From fd121c2467ef0c3554f3b8e292d2b06559905b00 Mon Sep 17 00:00:00 2001 From: Tom Deakin Date: Tue, 3 May 2016 11:15:38 +0100 Subject: [PATCH 046/117] Use device info to select CUDA device --- src/CUDAStream.cu | 16 +++++++++++++++- src/CUDAStream.h | 2 +- 2 files changed, 16 insertions(+), 2 deletions(-) diff --git a/src/CUDAStream.cu b/src/CUDAStream.cu index e7ce539..956be7d 100644 --- a/src/CUDAStream.cu +++ b/src/CUDAStream.cu @@ -12,8 +12,22 @@ void check_error(void) } template -CUDAStream::CUDAStream(const unsigned int ARRAY_SIZE) +CUDAStream::CUDAStream(const unsigned int ARRAY_SIZE, const int device_index) { + + // Set device + int count; + cudaGetDeviceCount(&count); + check_error(); + if (device_index >= count) + throw std::runtime_error("Invalid device index"); + cudaSetDevice(device_index); + check_error(); + + // Print out device information + std::cout << "Using OpenCL device " << getDeviceName(device_index) << std::endl; + std::cout << "Driver: " << getDeviceDriver(device_index) << std::endl; + array_size = ARRAY_SIZE; // Check buffers fit on the device diff --git a/src/CUDAStream.h b/src/CUDAStream.h index 34e0303..61e4882 100644 --- a/src/CUDAStream.h +++ b/src/CUDAStream.h @@ -20,7 +20,7 @@ class CUDAStream : public Stream public: - CUDAStream(const unsigned int); + CUDAStream(const 
unsigned int, const int); ~CUDAStream(); virtual void copy() override; From 2738e75b0400da523b094edd7491701c6a38545e Mon Sep 17 00:00:00 2001 From: Tom Deakin Date: Tue, 3 May 2016 11:20:39 +0100 Subject: [PATCH 047/117] Print out array sizes --- src/main.cpp | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/src/main.cpp b/src/main.cpp index 1171180..a0fbaf9 100644 --- a/src/main.cpp +++ b/src/main.cpp @@ -56,6 +56,13 @@ void run() std::vector a(ARRAY_SIZE, 1.0); std::vector b(ARRAY_SIZE, 2.0); std::vector c(ARRAY_SIZE, 0.0); + std::streamsize ss = std::cout.precision(); + std::cout << std::setprecision(1) << std::fixed + << "Array size: " << ARRAY_SIZE*sizeof(T)*1.0E-6 << " MB" + << " (=" << ARRAY_SIZE*sizeof(T)*1.0E-9 << " GB)" << std::endl; + std::cout << "Total size: " << 3.0*ARRAY_SIZE*sizeof(T)*1.0E-6 << " MB" + << " (=" << 3.0*ARRAY_SIZE*sizeof(T)*1.0E-9 << " GB)" << std::endl; + std::cout.precision(ss); Stream *stream; From 26bb9126466be72504e4710f6530e2d43112d5fb Mon Sep 17 00:00:00 2001 From: Tom Deakin Date: Tue, 3 May 2016 11:23:36 +0100 Subject: [PATCH 048/117] Check OCL device has enough memory for buffers --- src/OCLStream.cpp | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/src/OCLStream.cpp b/src/OCLStream.cpp index ed419b5..e962ea5 100644 --- a/src/OCLStream.cpp +++ b/src/OCLStream.cpp @@ -53,8 +53,6 @@ OCLStream::OCLStream(const unsigned int ARRAY_SIZE, const int device_index) if (!cached) getDeviceList(); - array_size = ARRAY_SIZE; - // Setup default OpenCL GPU if (device_index >= devices.size()) throw std::runtime_error("Invalid device index"); @@ -85,6 +83,16 @@ OCLStream::OCLStream(const unsigned int ARRAY_SIZE, const int device_index) add_kernel = new cl::KernelFunctor(program, "add"); triad_kernel = new cl::KernelFunctor(program, "triad"); + array_size = ARRAY_SIZE; + + // Check buffers fit on the device + cl_ulong totalmem = device.getInfo(); + cl_ulong maxbuffer = device.getInfo(); + if (maxbuffer < 
sizeof(T)*ARRAY_SIZE) + throw std::runtime_error("Device cannot allocate a buffer big enough"); + if (totalmem < 3*sizeof(T)*ARRAY_SIZE) + throw std::runtime_error("Device does not have enough memory for all 3 buffers"); + // Create buffers d_a = cl::Buffer(context, CL_MEM_READ_WRITE, sizeof(T) * ARRAY_SIZE); d_b = cl::Buffer(context, CL_MEM_READ_WRITE, sizeof(T) * ARRAY_SIZE); From e91c31b44af503f45ac554c01ec25ac4348163d2 Mon Sep 17 00:00:00 2001 From: Tom Deakin Date: Tue, 3 May 2016 11:37:35 +0100 Subject: [PATCH 049/117] Tidy up delete of object with correct deconstructors and delete --- src/OCLStream.cpp | 8 ++++---- src/Stream.h | 3 +++ src/main.cpp | 2 +- 3 files changed, 8 insertions(+), 5 deletions(-) diff --git a/src/OCLStream.cpp b/src/OCLStream.cpp index e962ea5..5a72b3c 100644 --- a/src/OCLStream.cpp +++ b/src/OCLStream.cpp @@ -103,10 +103,10 @@ OCLStream::OCLStream(const unsigned int ARRAY_SIZE, const int device_index) template OCLStream::~OCLStream() { - delete[] copy_kernel; - delete[] mul_kernel; - delete[] add_kernel; - delete[] triad_kernel; + delete copy_kernel; + delete mul_kernel; + delete add_kernel; + delete triad_kernel; } template diff --git a/src/Stream.h b/src/Stream.h index 527fe6a..e02c953 100644 --- a/src/Stream.h +++ b/src/Stream.h @@ -8,6 +8,9 @@ template class Stream { public: + + virtual ~Stream(){} + // Kernels // These must be blocking calls virtual void copy() = 0; diff --git a/src/main.cpp b/src/main.cpp index a0fbaf9..457ca9a 100644 --- a/src/main.cpp +++ b/src/main.cpp @@ -154,7 +154,7 @@ void run() } - delete[] stream; + delete stream; } From 95f9efb7d92a7ed741f2ce20d4c949fe89363a8b Mon Sep 17 00:00:00 2001 From: Tom Deakin Date: Tue, 3 May 2016 11:40:46 +0100 Subject: [PATCH 050/117] Remove old version --- Makefile | 27 --- common.cpp | 115 ---------- common.h | 112 ---------- cuda-stream.cu | 397 ---------------------------------- ocl-stream-kernels.cl | 70 ------ ocl-stream.cpp | 488 
------------------------------------------ 6 files changed, 1209 deletions(-) delete mode 100644 Makefile delete mode 100644 common.cpp delete mode 100644 common.h delete mode 100644 cuda-stream.cu delete mode 100644 ocl-stream-kernels.cl delete mode 100644 ocl-stream.cpp diff --git a/Makefile b/Makefile deleted file mode 100644 index 4fb5f7a..0000000 --- a/Makefile +++ /dev/null @@ -1,27 +0,0 @@ -LDLIBS = -l OpenCL -CXXFLAGS = -std=c++11 -O3 - -PLATFORM = $(shell uname -s) -ifeq ($(PLATFORM), Darwin) - LDLIBS = -framework OpenCL -endif - -all: gpu-stream-ocl gpu-stream-cuda - -gpu-stream-ocl: ocl-stream.cpp common.o Makefile - $(CXX) $(CXXFLAGS) -Wno-deprecated-declarations common.o $< -o $@ $(LDLIBS) - -common.o: common.cpp common.h Makefile - -gpu-stream-cuda: cuda-stream.cu common.o Makefile -ifeq ($(shell which nvcc > /dev/null; echo $$?), 0) - nvcc $(CXXFLAGS) common.o $< -o $@ -else - $(error "Cannot find nvcc, please install CUDA toolkit") -endif - -.PHONY: clean - -clean: - rm -f gpu-stream-ocl gpu-stream-cuda *.o - diff --git a/common.cpp b/common.cpp deleted file mode 100644 index 781d70e..0000000 --- a/common.cpp +++ /dev/null @@ -1,115 +0,0 @@ -/*============================================================================= -*------------------------------------------------------------------------------ -* Copyright 2015: Tom Deakin, Simon McIntosh-Smith, University of Bristol HPC -* Based on John D. McCalpin’s original STREAM benchmark for CPUs -*------------------------------------------------------------------------------ -* License: -* 1. You are free to use this program and/or to redistribute -* this program. -* 2. You are free to modify this program for your own use, -* including commercial use, subject to the publication -* restrictions in item 3. -* 3. You are free to publish results obtained from running this -* program, or from works that you derive from this program, -* with the following limitations: -* 3a. 
In order to be referred to as "GPU-STREAM benchmark results", -* published results must be in conformance to the GPU-STREAM -* Run Rules published at -* http://github.com/UoB-HPC/GPU-STREAM/wiki/Run-Rules -* and incorporated herein by reference. -* The copyright holders retain the -* right to determine conformity with the Run Rules. -* 3b. Results based on modified source code or on runs not in -* accordance with the GPU-STREAM Run Rules must be clearly -* labelled whenever they are published. Examples of -* proper labelling include: -* "tuned GPU-STREAM benchmark results" -* "based on a variant of the GPU-STREAM benchmark code" -* Other comparable, clear and reasonable labelling is -* acceptable. -* 3c. Submission of results to the GPU-STREAM benchmark web site -* is encouraged, but not required. -* 4. Use of this program or creation of derived works based on this -* program constitutes acceptance of these licensing restrictions. -* 5. Absolutely no warranty is expressed or implied. -*———————————————————————————————————-----------------------------------------*/ - -#include "common.h" - -// Default array size 50 * 2^20 (50*8 Mebibytes double precision) -// Use binary powers of two so divides 1024 -unsigned int ARRAY_SIZE = 52428800; - -unsigned int NTIMES = 10; - -bool useFloat = false; - -unsigned int deviceIndex = 0; - -int parseUInt(const char *str, unsigned int *output) -{ - char *next; - *output = strtoul(str, &next, 10); - return !strlen(next); -} - -void parseArguments(int argc, char *argv[]) -{ - for (int i = 1; i < argc; i++) - { - if (!strcmp(argv[i], "--list")) - { - listDevices(); - exit(0); - } - else if (!strcmp(argv[i], "--device")) - { - if (++i >= argc || !parseUInt(argv[i], &deviceIndex)) - { - std::cout << "Invalid device index" << std::endl; - exit(1); - } - } - else if (!strcmp(argv[i], "--arraysize") || !strcmp(argv[i], "-s")) - { - if (++i >= argc || !parseUInt(argv[i], &ARRAY_SIZE)) - { - std::cout << "Invalid array size" << std::endl; - 
exit(1); - } - } - else if (!strcmp(argv[i], "--numtimes") || !strcmp(argv[i], "-n")) - { - if (++i >= argc || !parseUInt(argv[i], &NTIMES)) - { - std::cout << "Invalid number of times" << std::endl; - exit(1); - } - } - else if (!strcmp(argv[i], "--float")) - { - useFloat = true; - std::cout << "Warning: If number of iterations set >= 8, expect rounding errors with single precision" << std::endl; - } - else if (!strcmp(argv[i], "--help") || !strcmp(argv[i], "-h")) - { - std::cout << std::endl; - std::cout << "Usage: ./gpu-stream-cuda [OPTIONS]" << std::endl << std::endl; - std::cout << "Options:" << std::endl; - std::cout << " -h --help Print the message" << std::endl; - std::cout << " --list List available devices" << std::endl; - std::cout << " --device INDEX Select device at INDEX" << std::endl; - std::cout << " -s --arraysize SIZE Use SIZE elements in the array" << std::endl; - std::cout << " -n --numtimes NUM Run the test NUM times (NUM >= 2)" << std::endl; - std::cout << " --float Use floats (rather than doubles)" << std::endl; - std::cout << std::endl; - exit(0); - } - else - { - std::cout << "Unrecognized argument '" << argv[i] << "' (try '--help')" - << std::endl; - exit(1); - } - } -} diff --git a/common.h b/common.h deleted file mode 100644 index a4dd886..0000000 --- a/common.h +++ /dev/null @@ -1,112 +0,0 @@ -/*============================================================================= -*------------------------------------------------------------------------------ -* Copyright 2015: Tom Deakin, Simon McIntosh-Smith, University of Bristol HPC -* Based on John D. McCalpin’s original STREAM benchmark for CPUs -*------------------------------------------------------------------------------ -* License: -* 1. You are free to use this program and/or to redistribute -* this program. -* 2. You are free to modify this program for your own use, -* including commercial use, subject to the publication -* restrictions in item 3. -* 3. 
You are free to publish results obtained from running this -* program, or from works that you derive from this program, -* with the following limitations: -* 3a. In order to be referred to as "GPU-STREAM benchmark results", -* published results must be in conformance to the GPU-STREAM -* Run Rules published at -* http://github.com/UoB-HPC/GPU-STREAM/wiki/Run-Rules -* and incorporated herein by reference. -* The copyright holders retain the -* right to determine conformity with the Run Rules. -* 3b. Results based on modified source code or on runs not in -* accordance with the GPU-STREAM Run Rules must be clearly -* labelled whenever they are published. Examples of -* proper labelling include: -* "tuned GPU-STREAM benchmark results" -* "based on a variant of the GPU-STREAM benchmark code" -* Other comparable, clear and reasonable labelling is -* acceptable. -* 3c. Submission of results to the GPU-STREAM benchmark web site -* is encouraged, but not required. -* 4. Use of this program or creation of derived works based on this -* program constitutes acceptance of these licensing restrictions. -* 5. Absolutely no warranty is expressed or implied. 
-*———————————————————————————————————-----------------------------------------*/ - -#include -#include -#include -#include -#include -#include - -#define VERSION_STRING "1.0" - -extern void parseArguments(int argc, char *argv[]); - -extern void listDevices(void); - -extern unsigned int ARRAY_SIZE; -extern unsigned int NTIMES; - -extern bool useFloat; - -extern unsigned int deviceIndex; - - -template < typename T > -void check_solution(void* a_in, void* b_in, void* c_in) -{ - // Generate correct solution - T golda = 1.0; - T goldb = 2.0; - T goldc = 0.0; - - T * a = static_cast(a_in); - T * b = static_cast(b_in); - T * c = static_cast(c_in); - - const T scalar = 3.0; - - for (unsigned int i = 0; i < NTIMES; i++) - { - // Double - goldc = golda; - goldb = scalar * goldc; - goldc = golda + goldb; - golda = goldb + scalar * goldc; - } - - // Calculate average error - double erra = 0.0; - double errb = 0.0; - double errc = 0.0; - - for (unsigned int i = 0; i < ARRAY_SIZE; i++) - { - erra += fabs(a[i] - golda); - errb += fabs(b[i] - goldb); - errc += fabs(c[i] - goldc); - } - - erra /= ARRAY_SIZE; - errb /= ARRAY_SIZE; - errc /= ARRAY_SIZE; - - double epsi = std::numeric_limits::epsilon() * 100; - - if (erra > epsi) - std::cout - << "Validation failed on a[]. Average error " << erra - << std::endl; - if (errb > epsi) - std::cout - << "Validation failed on b[]. Average error " << errb - << std::endl; - if (errc > epsi) - std::cout - << "Validation failed on c[]. Average error " << errc - << std::endl; -} - diff --git a/cuda-stream.cu b/cuda-stream.cu deleted file mode 100644 index 2ab3adb..0000000 --- a/cuda-stream.cu +++ /dev/null @@ -1,397 +0,0 @@ -/*============================================================================= -*------------------------------------------------------------------------------ -* Copyright 2015: Tom Deakin, Simon McIntosh-Smith, University of Bristol HPC -* Based on John D. 
McCalpin’s original STREAM benchmark for CPUs -*------------------------------------------------------------------------------ -* License: -* 1. You are free to use this program and/or to redistribute -* this program. -* 2. You are free to modify this program for your own use, -* including commercial use, subject to the publication -* restrictions in item 3. -* 3. You are free to publish results obtained from running this -* program, or from works that you derive from this program, -* with the following limitations: -* 3a. In order to be referred to as "GPU-STREAM benchmark results", -* published results must be in conformance to the GPU-STREAM -* Run Rules published at -* http://github.com/UoB-HPC/GPU-STREAM/wiki/Run-Rules -* and incorporated herein by reference. -* The copyright holders retain the -* right to determine conformity with the Run Rules. -* 3b. Results based on modified source code or on runs not in -* accordance with the GPU-STREAM Run Rules must be clearly -* labelled whenever they are published. Examples of -* proper labelling include: -* "tuned GPU-STREAM benchmark results" -* "based on a variant of the GPU-STREAM benchmark code" -* Other comparable, clear and reasonable labelling is -* acceptable. -* 3c. Submission of results to the GPU-STREAM benchmark web site -* is encouraged, but not required. -* 4. Use of this program or creation of derived works based on this -* program constitutes acceptance of these licensing restrictions. -* 5. Absolutely no warranty is expressed or implied. 
-*———————————————————————————————————-----------------------------------------*/ - - -#include -#include -#include -#include -#include -#include - -#include -#include "common.h" - -std::string getDeviceName(int device); -int getDriver(void); - -// Code to check CUDA errors -void check_cuda_error(void) -{ - cudaError_t err = cudaGetLastError(); - if (err != cudaSuccess) - { - std::cerr - << "Error: " - << cudaGetErrorString(err) - << std::endl; - exit(err); - } -} - -template -__global__ void copy(const T * a, T * c) -{ - const int i = blockDim.x * blockIdx.x + threadIdx.x; - c[i] = a[i]; -} - -template -__global__ void mul(T * b, const T * c) -{ - const T scalar = 3.0; - const int i = blockDim.x * blockIdx.x + threadIdx.x; - b[i] = scalar * c[i]; -} - -template -__global__ void add(const T * a, const T * b, T * c) -{ - const int i = blockDim.x * blockIdx.x + threadIdx.x; - c[i] = a[i] + b[i]; -} - -template -__global__ void triad(T * a, const T * b, const T * c) -{ - const T scalar = 3.0; - const int i = blockDim.x * blockIdx.x + threadIdx.x; - a[i] = b[i] + scalar * c[i]; -} - -int main(int argc, char *argv[]) -{ - - // Print out run information - std::cout - << "GPU-STREAM" << std::endl - << "Version: " << VERSION_STRING << std::endl - << "Implementation: CUDA" << std::endl; - - parseArguments(argc, argv); - - if (NTIMES < 2) - throw std::runtime_error("Chosen number of times is invalid, must be >= 2"); - - std::cout << "Precision: "; - if (useFloat) std::cout << "float"; - else std::cout << "double"; - std::cout << std::endl << std::endl; - - std::cout << "Running kernels " << NTIMES << " times" << std::endl; - - if (ARRAY_SIZE % 1024 != 0) - { - unsigned int OLD_ARRAY_SIZE = ARRAY_SIZE; - ARRAY_SIZE -= ARRAY_SIZE % 1024; - std::cout - << "Warning: array size must divide 1024" << std::endl - << "Resizing array from " << OLD_ARRAY_SIZE - << " to " << ARRAY_SIZE << std::endl; - if (ARRAY_SIZE == 0) - throw std::runtime_error("Array size must be >= 1024"); - } - - 
// Get precision (used to reset later) - std::streamsize ss = std::cout.precision(); - - size_t DATATYPE_SIZE; - - if (useFloat) - { - DATATYPE_SIZE = sizeof(float); - } - else - { - DATATYPE_SIZE = sizeof(double); - } - - // Display number of bytes in array - std::cout << std::setprecision(1) << std::fixed - << "Array size: " << ARRAY_SIZE*DATATYPE_SIZE/1024.0/1024.0 << " MB" - << " (=" << ARRAY_SIZE*DATATYPE_SIZE/1024.0/1024.0/1024.0 << " GB)" - << std::endl; - std::cout << "Total size: " << 3.0*ARRAY_SIZE*DATATYPE_SIZE/1024.0/1024.0 << " MB" - << " (=" << 3.0*ARRAY_SIZE*DATATYPE_SIZE/1024.0/1024.0/1024.0 << " GB)" - << std::endl; - - // Reset precision - std::cout.precision(ss); - - // Check device index is in range - int count; - cudaGetDeviceCount(&count); - check_cuda_error(); - if (deviceIndex >= count) - throw std::runtime_error("Chosen device index is invalid"); - cudaSetDevice(deviceIndex); - check_cuda_error(); - - // Print out device name - std::cout << "Using CUDA device " << getDeviceName(deviceIndex) << std::endl; - - // Print out device CUDA driver version - std::cout << "Driver: " << getDriver() << std::endl; - - // Check buffers fit on the device - cudaDeviceProp props; - cudaGetDeviceProperties(&props, deviceIndex); - if (props.totalGlobalMem < 3*DATATYPE_SIZE*ARRAY_SIZE) - throw std::runtime_error("Device does not have enough memory for all 3 buffers"); - - // Create host vectors - void * h_a = malloc(ARRAY_SIZE*DATATYPE_SIZE); - void * h_b = malloc(ARRAY_SIZE*DATATYPE_SIZE); - void * h_c = malloc(ARRAY_SIZE*DATATYPE_SIZE); - - // Initilise arrays - for (unsigned int i = 0; i < ARRAY_SIZE; i++) - { - if (useFloat) - { - ((float*)h_a)[i] = 1.0f; - ((float*)h_b)[i] = 2.0f; - ((float*)h_c)[i] = 0.0f; - } - else - { - ((double*)h_a)[i] = 1.0; - ((double*)h_b)[i] = 2.0; - ((double*)h_c)[i] = 0.0; - } - } - - // Create device buffers - void * d_a, * d_b, *d_c; - cudaMalloc(&d_a, ARRAY_SIZE*DATATYPE_SIZE); - check_cuda_error(); - cudaMalloc(&d_b, 
ARRAY_SIZE*DATATYPE_SIZE); - check_cuda_error(); - cudaMalloc(&d_c, ARRAY_SIZE*DATATYPE_SIZE); - check_cuda_error(); - - // Copy host memory to device - cudaMemcpy(d_a, h_a, ARRAY_SIZE*DATATYPE_SIZE, cudaMemcpyHostToDevice); - check_cuda_error(); - cudaMemcpy(d_b, h_b, ARRAY_SIZE*DATATYPE_SIZE, cudaMemcpyHostToDevice); - check_cuda_error(); - cudaMemcpy(d_c, h_c, ARRAY_SIZE*DATATYPE_SIZE, cudaMemcpyHostToDevice); - check_cuda_error(); - - // Make sure the copies are finished - cudaDeviceSynchronize(); - check_cuda_error(); - - // List of times - std::vector< std::vector > timings; - - // Declare timers - std::chrono::high_resolution_clock::time_point t1, t2; - - // Main loop - for (unsigned int k = 0; k < NTIMES; k++) - { - std::vector times; - t1 = std::chrono::high_resolution_clock::now(); - if (useFloat) - copy<<>>((float*)d_a, (float*)d_c); - else - copy<<>>((double*)d_a, (double*)d_c); - check_cuda_error(); - cudaDeviceSynchronize(); - check_cuda_error(); - t2 = std::chrono::high_resolution_clock::now(); - times.push_back(std::chrono::duration_cast >(t2 - t1).count()); - - - t1 = std::chrono::high_resolution_clock::now(); - if (useFloat) - mul<<>>((float*)d_b, (float*)d_c); - else - mul<<>>((double*)d_b, (double*)d_c); - check_cuda_error(); - cudaDeviceSynchronize(); - check_cuda_error(); - t2 = std::chrono::high_resolution_clock::now(); - times.push_back(std::chrono::duration_cast >(t2 - t1).count()); - - - t1 = std::chrono::high_resolution_clock::now(); - if (useFloat) - add<<>>((float*)d_a, (float*)d_b, (float*)d_c); - else - add<<>>((double*)d_a, (double*)d_b, (double*)d_c); - check_cuda_error(); - cudaDeviceSynchronize(); - check_cuda_error(); - t2 = std::chrono::high_resolution_clock::now(); - times.push_back(std::chrono::duration_cast >(t2 - t1).count()); - - - t1 = std::chrono::high_resolution_clock::now(); - if (useFloat) - triad<<>>((float*)d_a, (float*)d_b, (float*)d_c); - else - triad<<>>((double*)d_a, (double*)d_b, (double*)d_c); - 
check_cuda_error(); - cudaDeviceSynchronize(); - check_cuda_error(); - t2 = std::chrono::high_resolution_clock::now(); - times.push_back(std::chrono::duration_cast >(t2 - t1).count()); - - timings.push_back(times); - - } - - // Check solutions - cudaMemcpy(h_a, d_a, ARRAY_SIZE*DATATYPE_SIZE, cudaMemcpyDeviceToHost); - check_cuda_error(); - cudaMemcpy(h_b, d_b, ARRAY_SIZE*DATATYPE_SIZE, cudaMemcpyDeviceToHost); - check_cuda_error(); - cudaMemcpy(h_c, d_c, ARRAY_SIZE*DATATYPE_SIZE, cudaMemcpyDeviceToHost); - check_cuda_error(); - - if (useFloat) - { - check_solution(h_a, h_b, h_c); - } - else - { - check_solution(h_a, h_b, h_c); - } - - // Crunch results - size_t sizes[4] = { - 2 * DATATYPE_SIZE * ARRAY_SIZE, - 2 * DATATYPE_SIZE * ARRAY_SIZE, - 3 * DATATYPE_SIZE * ARRAY_SIZE, - 3 * DATATYPE_SIZE * ARRAY_SIZE - }; - double min[4] = {DBL_MAX, DBL_MAX, DBL_MAX, DBL_MAX}; - double max[4] = {0.0, 0.0, 0.0, 0.0}; - double avg[4] = {0.0, 0.0, 0.0, 0.0}; - - // Ignore first result - for (unsigned int i = 1; i < NTIMES; i++) - { - for (int j = 0; j < 4; j++) - { - avg[j] += timings[i][j]; - min[j] = std::min(min[j], timings[i][j]); - max[j] = std::max(max[j], timings[i][j]); - } - } - - for (int j = 0; j < 4; j++) - avg[j] /= (double)(NTIMES-1); - - // Display results - std::string labels[] = {"Copy", "Mul", "Add", "Triad"}; - std::cout - << std::left << std::setw(12) << "Function" - << std::left << std::setw(12) << "MBytes/sec" - << std::left << std::setw(12) << "Min (sec)" - << std::left << std::setw(12) << "Max" - << std::left << std::setw(12) << "Average" - << std::endl; - - for (int j = 0; j < 4; j++) - { - std::cout - << std::left << std::setw(12) << labels[j] - << std::left << std::setw(12) << std::setprecision(3) << 1.0E-06 * sizes[j]/min[j] - << std::left << std::setw(12) << std::setprecision(5) << min[j] - << std::left << std::setw(12) << std::setprecision(5) << max[j] - << std::left << std::setw(12) << std::setprecision(5) << avg[j] - << std::endl; - } - - // Free 
host vectors - free(h_a); - free(h_b); - free(h_c); - - // Free cuda buffers - cudaFree(d_a); - check_cuda_error(); - cudaFree(d_b); - check_cuda_error(); - cudaFree(d_c); - check_cuda_error(); - -} - -std::string getDeviceName(int device) -{ - struct cudaDeviceProp prop; - cudaGetDeviceProperties(&prop, device); - check_cuda_error(); - return std::string(prop.name); -} - -int getDriver(void) -{ - int driver; - cudaDriverGetVersion(&driver); - check_cuda_error(); - return driver; -} - -void listDevices(void) -{ - // Get number of devices - int count; - cudaGetDeviceCount(&count); - check_cuda_error(); - - // Print device names - if (count == 0) - { - std::cout << "No devices found." << std::endl; - } - else - { - std::cout << std::endl; - std::cout << "Devices:" << std::endl; - for (int i = 0; i < count; i++) - { - std::cout << i << ": " << getDeviceName(i) << std::endl; - check_cuda_error(); - } - std::cout << std::endl; - } -} - diff --git a/ocl-stream-kernels.cl b/ocl-stream-kernels.cl deleted file mode 100644 index e5af7ce..0000000 --- a/ocl-stream-kernels.cl +++ /dev/null @@ -1,70 +0,0 @@ -/*============================================================================= -*------------------------------------------------------------------------------ -* Copyright 2015: Tom Deakin, Simon McIntosh-Smith, University of Bristol HPC -* Based on John D. McCalpin’s original STREAM benchmark for CPUs -*------------------------------------------------------------------------------ -* License: -* 1. You are free to use this program and/or to redistribute -* this program. -* 2. You are free to modify this program for your own use, -* including commercial use, subject to the publication -* restrictions in item 3. -* 3. You are free to publish results obtained from running this -* program, or from works that you derive from this program, -* with the following limitations: -* 3a. 
In order to be referred to as "GPU-STREAM benchmark results", -* published results must be in conformance to the GPU-STREAM -* Run Rules published at -* http://github.com/UoB-HPC/GPU-STREAM/wiki/Run-Rules -* and incorporated herein by reference. -* The copyright holders retain the -* right to determine conformity with the Run Rules. -* 3b. Results based on modified source code or on runs not in -* accordance with the GPU-STREAM Run Rules must be clearly -* labelled whenever they are published. Examples of -* proper labelling include: -* "tuned GPU-STREAM benchmark results" -* "based on a variant of the GPU-STREAM benchmark code" -* Other comparable, clear and reasonable labelling is -* acceptable. -* 3c. Submission of results to the GPU-STREAM benchmark web site -* is encouraged, but not required. -* 4. Use of this program or creation of derived works based on this -* program constitutes acceptance of these licensing restrictions. -* 5. Absolutely no warranty is expressed or implied. -*———————————————————————————————————-----------------------------------------*/ - - -#ifdef FLOAT - #define DATATYPE float - constant DATATYPE scalar = 3.0f; -#else - #pragma OPENCL EXTENSION cl_khr_fp64 : enable - #define DATATYPE double - constant DATATYPE scalar = 3.0; -#endif - - -kernel void copy(global const DATATYPE * restrict a, global DATATYPE * restrict c) -{ - const size_t i = get_global_id(0); - c[i] = a[i]; -} - -kernel void mul(global DATATYPE * restrict b, global const DATATYPE * restrict c) -{ - const size_t i = get_global_id(0); - b[i] = scalar * c[i]; -} - -kernel void add(global const DATATYPE * restrict a, global const DATATYPE * restrict b, global DATATYPE * restrict c) -{ - const size_t i = get_global_id(0); - c[i] = a[i] + b[i]; -} - -kernel void triad(global DATATYPE * restrict a, global const DATATYPE * restrict b, global const DATATYPE * restrict c) -{ - const size_t i = get_global_id(0); - a[i] = b[i] + scalar * c[i]; -} diff --git a/ocl-stream.cpp 
b/ocl-stream.cpp deleted file mode 100644 index 1a46295..0000000 --- a/ocl-stream.cpp +++ /dev/null @@ -1,488 +0,0 @@ -/*============================================================================= -*------------------------------------------------------------------------------ -* Copyright 2015: Tom Deakin, Simon McIntosh-Smith, University of Bristol HPC -* Based on John D. McCalpin’s original STREAM benchmark for CPUs -*------------------------------------------------------------------------------ -* License: -* 1. You are free to use this program and/or to redistribute -* this program. -* 2. You are free to modify this program for your own use, -* including commercial use, subject to the publication -* restrictions in item 3. -* 3. You are free to publish results obtained from running this -* program, or from works that you derive from this program, -* with the following limitations: -* 3a. In order to be referred to as "GPU-STREAM benchmark results", -* published results must be in conformance to the GPU-STREAM -* Run Rules published at -* http://github.com/UoB-HPC/GPU-STREAM/wiki/Run-Rules -* and incorporated herein by reference. -* The copyright holders retain the -* right to determine conformity with the Run Rules. -* 3b. Results based on modified source code or on runs not in -* accordance with the GPU-STREAM Run Rules must be clearly -* labelled whenever they are published. Examples of -* proper labelling include: -* "tuned GPU-STREAM benchmark results" -* "based on a variant of the GPU-STREAM benchmark code" -* Other comparable, clear and reasonable labelling is -* acceptable. -* 3c. Submission of results to the GPU-STREAM benchmark web site -* is encouraged, but not required. -* 4. Use of this program or creation of derived works based on this -* program constitutes acceptance of these licensing restrictions. -* 5. Absolutely no warranty is expressed or implied. 
-*———————————————————————————————————-----------------------------------------*/ - - -#include -#include -#include -#include -#include -#include - -#define CL_HPP_ENABLE_EXCEPTIONS -#define CL_HPP_MINIMUM_OPENCL_VERSION 110 -#define CL_HPP_TARGET_OPENCL_VERSION 110 -#include "CL/cl2.hpp" -#include "common.h" - -std::string getDeviceName(const cl::Device& device); -std::string getDeviceDriver(const cl::Device& device); -unsigned getDeviceList(std::vector& devices); - - -// Print error and exit -void die(std::string msg, cl::Error& e) -{ - std::cerr - << "Error: " - << msg - << ": " << e.what() - << "(" << e.err() << ")" - << std::endl; - exit(e.err()); -} - - -int main(int argc, char *argv[]) -{ - - // Print out run information - std::cout - << "GPU-STREAM" << std::endl - << "Version: " << VERSION_STRING << std::endl - << "Implementation: OpenCL" << std::endl; - - std::string status; - - try - { - parseArguments(argc, argv); - if (NTIMES < 2) - throw std::runtime_error("Chosen number of times is invalid, must be >= 2"); - - - std::cout << "Precision: "; - if (useFloat) std::cout << "float"; - else std::cout << "double"; - std::cout << std::endl << std::endl; - - std::cout << "Running kernels " << NTIMES << " times" << std::endl; - - if (ARRAY_SIZE % 1024 != 0) - { - unsigned int OLD_ARRAY_SIZE = ARRAY_SIZE; - ARRAY_SIZE -= ARRAY_SIZE % 1024; - std::cout - << "Warning: array size must divide 1024" << std::endl - << "Resizing array from " << OLD_ARRAY_SIZE - << " to " << ARRAY_SIZE << std::endl; - if (ARRAY_SIZE == 0) - throw std::runtime_error("Array size must be >= 1024"); - } - - // Get precision (used to reset later) - std::streamsize ss = std::cout.precision(); - - size_t DATATYPE_SIZE; - - if (useFloat) - { - DATATYPE_SIZE = sizeof(float); - } - else - { - DATATYPE_SIZE = sizeof(double); - } - - // Display number of bytes in array - std::cout << std::setprecision(1) << std::fixed - << "Array size: " << ARRAY_SIZE*DATATYPE_SIZE/1024.0/1024.0 << " MB" - << " (=" 
<< ARRAY_SIZE*DATATYPE_SIZE/1024.0/1024.0/1024.0 << " GB)" - << std::endl; - std::cout << "Total size: " << 3.0*ARRAY_SIZE*DATATYPE_SIZE/1024.0/1024.0 << " MB" - << " (=" << 3.0*ARRAY_SIZE*DATATYPE_SIZE/1024.0/1024.0/1024.0 << " GB)" - << std::endl; - - // Reset precision - std::cout.precision(ss); - - // Open the Kernel source - std::string kernels; - { - std::ifstream in("ocl-stream-kernels.cl"); - if (!in.is_open()) - throw std::runtime_error("Cannot open kernel file"); - kernels = std::string (std::istreambuf_iterator(in), (std::istreambuf_iterator())); - } - - - // Setup OpenCL - - // Get list of devices - std::vector devices; - getDeviceList(devices); - - // Check device index is in range - if (deviceIndex >= devices.size()) - throw std::runtime_error("Chosen device index is invalid"); - - cl::Device device = devices[deviceIndex]; - - status = "Creating context"; - cl::Context context(device); - - status = "Creating queue"; - cl::CommandQueue queue(context); - - status = "Creating program"; - cl::Program program(context, kernels); - - // Print out device name - std::string name = getDeviceName(device); - std::cout << "Using OpenCL device " << name << std::endl; - - // Print out OpenCL driver version for this device - std::string driver = getDeviceDriver(device); - std::cout << "Driver: " << driver << std::endl; - - // Check device can do double precision if requested - if (!useFloat && !device.getInfo()) - throw std::runtime_error("Device does not support double precision, please use --float"); - - // Check buffers fit on the device - status = "Getting device memory sizes"; - cl_ulong totalmem = device.getInfo(); - cl_ulong maxbuffer = device.getInfo(); - if (maxbuffer < DATATYPE_SIZE*ARRAY_SIZE) - throw std::runtime_error("Device cannot allocate a buffer big enough"); - if (totalmem < 3*DATATYPE_SIZE*ARRAY_SIZE) - throw std::runtime_error("Device does not have enough memory for all 3 buffers"); - - try - { - std::string options = ""; - if (useFloat) - 
options = "-DFLOAT"; - program.build(options.c_str()); - } - catch (cl::Error& e) - { - std::vector devices = context.getInfo(); - std::string buildlog = program.getBuildInfo(devices[0]); - std::cerr - << "Build error:" - << buildlog - << std::endl; - throw e; - } - - status = "Making kernel copy"; - auto copy = cl::KernelFunctor(program, "copy"); - status = "Making kernel mul"; - auto mul = cl::KernelFunctor(program, "mul"); - status = "Making kernel add"; - auto add = cl::KernelFunctor(program, "add"); - status = "Making kernel triad"; - auto triad = cl::KernelFunctor(program, "triad"); - - // Create host vectors - void *h_a = malloc(ARRAY_SIZE * DATATYPE_SIZE); - void *h_b = malloc(ARRAY_SIZE * DATATYPE_SIZE); - void *h_c = malloc(ARRAY_SIZE * DATATYPE_SIZE); - - // Initilise arrays - for (unsigned int i = 0; i < ARRAY_SIZE; i++) - { - if (useFloat) - { - ((float*)h_a)[i] = 1.0f; - ((float*)h_b)[i] = 2.0f; - ((float*)h_c)[i] = 0.0f; - } - else - { - ((double*)h_a)[i] = 1.0; - ((double*)h_b)[i] = 2.0; - ((double*)h_c)[i] = 0.0; - } - } - - // Create device buffers - status = "Creating buffers"; - cl::Buffer d_a(context, CL_MEM_READ_WRITE, DATATYPE_SIZE * ARRAY_SIZE); - cl::Buffer d_b(context, CL_MEM_READ_WRITE, DATATYPE_SIZE * ARRAY_SIZE); - cl::Buffer d_c(context, CL_MEM_READ_WRITE, DATATYPE_SIZE * ARRAY_SIZE); - - - // Copy host memory to device - status = "Copying buffers"; - queue.enqueueWriteBuffer(d_a, CL_FALSE, 0, ARRAY_SIZE*DATATYPE_SIZE, h_a); - queue.enqueueWriteBuffer(d_b, CL_FALSE, 0, ARRAY_SIZE*DATATYPE_SIZE, h_b); - queue.enqueueWriteBuffer(d_c, CL_FALSE, 0, ARRAY_SIZE*DATATYPE_SIZE, h_c); - - // Make sure the copies are finished - queue.finish(); - - - // List of times - std::vector< std::vector > timings; - - // Declare timers - std::chrono::high_resolution_clock::time_point t1, t2; - - // Main loop - for (unsigned int k = 0; k < NTIMES; k++) - { - status = "Executing copy"; - std::vector times; - t1 = std::chrono::high_resolution_clock::now(); - 
copy( - cl::EnqueueArgs( - queue, - cl::NDRange(ARRAY_SIZE)), - d_a, d_c); - queue.finish(); - t2 = std::chrono::high_resolution_clock::now(); - times.push_back(std::chrono::duration_cast >(t2 - t1).count()); - - - status = "Executing mul"; - t1 = std::chrono::high_resolution_clock::now(); - mul( - cl::EnqueueArgs( - queue, - cl::NDRange(ARRAY_SIZE)), - d_b, d_c); - queue.finish(); - t2 = std::chrono::high_resolution_clock::now(); - times.push_back(std::chrono::duration_cast >(t2 - t1).count()); - - - status = "Executing add"; - t1 = std::chrono::high_resolution_clock::now(); - add( - cl::EnqueueArgs( - queue, - cl::NDRange(ARRAY_SIZE)), - d_a, d_b, d_c); - queue.finish(); - t2 = std::chrono::high_resolution_clock::now(); - times.push_back(std::chrono::duration_cast >(t2 - t1).count()); - - - status = "Executing triad"; - t1 = std::chrono::high_resolution_clock::now(); - triad( - cl::EnqueueArgs( - queue, - cl::NDRange(ARRAY_SIZE)), - d_a, d_b, d_c); - queue.finish(); - t2 = std::chrono::high_resolution_clock::now(); - times.push_back(std::chrono::duration_cast >(t2 - t1).count()); - - timings.push_back(times); - - } - - // Check solutions - status = "Copying back buffers"; - queue.enqueueReadBuffer(d_a, CL_FALSE, 0, ARRAY_SIZE*DATATYPE_SIZE, h_a); - queue.enqueueReadBuffer(d_b, CL_FALSE, 0, ARRAY_SIZE*DATATYPE_SIZE, h_b); - queue.enqueueReadBuffer(d_c, CL_FALSE, 0, ARRAY_SIZE*DATATYPE_SIZE, h_c); - queue.finish(); - - - if (useFloat) - { - check_solution(h_a, h_b, h_c); - } - else - { - check_solution(h_a, h_b, h_c); - } - - // Crunch results - size_t sizes[4] = { - 2 * DATATYPE_SIZE * ARRAY_SIZE, - 2 * DATATYPE_SIZE * ARRAY_SIZE, - 3 * DATATYPE_SIZE * ARRAY_SIZE, - 3 * DATATYPE_SIZE * ARRAY_SIZE - }; - double min[4] = {DBL_MAX, DBL_MAX, DBL_MAX, DBL_MAX}; - double max[4] = {0.0, 0.0, 0.0, 0.0}; - double avg[4] = {0.0, 0.0, 0.0, 0.0}; - // Ignore first result - for (unsigned int i = 1; i < NTIMES; i++) - { - for (int j = 0; j < 4; j++) - { - avg[j] += 
timings[i][j]; - min[j] = std::min(min[j], timings[i][j]); - max[j] = std::max(max[j], timings[i][j]); - } - } - for (int j = 0; j < 4; j++) - avg[j] /= (double)(NTIMES-1); - - // Display results - std::string labels[] = {"Copy", "Mul", "Add", "Triad"}; - std::cout - << std::left << std::setw(12) << "Function" - << std::left << std::setw(12) << "MBytes/sec" - << std::left << std::setw(12) << "Min (sec)" - << std::left << std::setw(12) << "Max" - << std::left << std::setw(12) << "Average" - << std::endl; - for (int j = 0; j < 4; j++) - { - std::cout - << std::left << std::setw(12) << labels[j] - << std::left << std::setw(12) << std::setprecision(3) << 1.0E-06 * sizes[j]/min[j] - << std::left << std::setw(12) << std::setprecision(5) << min[j] - << std::left << std::setw(12) << std::setprecision(5) << max[j] - << std::left << std::setw(12) << std::setprecision(5) << avg[j] - << std::endl; - } - - // Free host vectors - free(h_a); - free(h_b); - free(h_c); - - } - catch (cl::Error &e) - { - die(status, e); - } - catch (std::exception& e) - { - std::cerr - << "Error: " - << e.what() - << std::endl; - exit(EXIT_FAILURE); - } - -} - - -unsigned getDeviceList(std::vector& devices) -{ - // Get list of platforms - std::vector platforms; - try - { - cl::Platform::get(&platforms); - } - catch (cl::Error &e) - { - die("Getting platforms", e); - } - - // Enumerate devices - for (unsigned int i = 0; i < platforms.size(); i++) - { - std::vector plat_devices; - try - { - platforms[i].getDevices(CL_DEVICE_TYPE_ALL, &plat_devices); - } - catch (cl::Error &e) - { - die("Getting devices", e); - } - devices.insert(devices.end(), plat_devices.begin(), plat_devices.end()); - } - - return devices.size(); -} - - -std::string getDeviceName(const cl::Device& device) -{ - std::string name; - cl_device_info info = CL_DEVICE_NAME; - - try - { - - // Special case for AMD -#ifdef CL_DEVICE_BOARD_NAME_AMD - device.getInfo(CL_DEVICE_VENDOR, &name); - if (strstr(name.c_str(), "Advanced Micro 
Devices")) - info = CL_DEVICE_BOARD_NAME_AMD; -#endif - - device.getInfo(info, &name); - } - catch (cl::Error &e) - { - die("Getting device name", e); - } - - return name; -} - -std::string getDeviceDriver(const cl::Device& device) -{ - std::string driver; - try - { - device.getInfo(CL_DRIVER_VERSION, &driver); - } - catch (cl::Error &e) - { - die("Getting device driver", e); - } - - return driver; -} - - -void listDevices(void) -{ - // Get list of devices - std::vector devices; - getDeviceList(devices); - - // Print device names - if (devices.size() == 0) - { - std::cout << "No devices found." << std::endl; - } - else - { - std::cout << std::endl; - std::cout << "Devices:" << std::endl; - for (unsigned i = 0; i < devices.size(); i++) - { - std::cout << i << ": " << getDeviceName(devices[i]) << std::endl; - } - std::cout << std::endl; - } -} - From 83516ae35279d78a1833572b2a66c0e0a6a5942e Mon Sep 17 00:00:00 2001 From: Tom Deakin Date: Tue, 3 May 2016 11:41:00 +0100 Subject: [PATCH 051/117] Update cl2.hpp --- CL/cl2.hpp | 209 ++++++++++++++++++++++++++++++++--------------------- 1 file changed, 127 insertions(+), 82 deletions(-) diff --git a/CL/cl2.hpp b/CL/cl2.hpp index ad0c7c4..e0f55fe 100644 --- a/CL/cl2.hpp +++ b/CL/cl2.hpp @@ -28,11 +28,11 @@ /*! \file * - * \brief C++ bindings for OpenCL 1.0 (rev 48), OpenCL 1.1 (rev 33), + * \brief C++ bindings for OpenCL 1.0 (rev 48), OpenCL 1.1 (rev 33), * OpenCL 1.2 (rev 15) and OpenCL 2.0 (rev 29) * \author Lee Howes and Bruce Merry - * - * Derived from the OpenCL 1.x C++ bindings written by + * + * Derived from the OpenCL 1.x C++ bindings written by * Benedict R. 
Gaster, Laurent Morichetti and Lee Howes * With additions and fixes from: * Brian Cole, March 3rd 2010 and April 2012 @@ -52,6 +52,18 @@ * #define CL_HPP_USE_DX_INTEROP * cl_khr_sub_groups * #define CL_HPP_USE_CL_SUB_GROUPS_KHR + * + * Doxygen documentation for this header is available here: + * + * http://khronosgroup.github.io/OpenCL-CLHPP/ + * + * The latest version of this header can be found on the GitHub releases page: + * + * https://github.com/KhronosGroup/OpenCL-CLHPP/releases + * + * Bugs and patches can be submitted to the GitHub repository: + * + * https://github.com/KhronosGroup/OpenCL-CLHPP */ /*! \mainpage @@ -134,41 +146,64 @@ * * \section parameterization Parameters * This header may be parameterized by a set of preprocessor macros. - * CL_HPP_TARGET_OPENCL_VERSION - * - Defines the target OpenCL runtime version to build the header against. - * Defaults to 200, representing OpenCL 2.0. - * CL_HPP_NO_STD_STRING - * - Do not use the standard library string class. - * cl::string is not defined and may be defined by the user before - * cl2.hpp is included. - * CL_HPP_NO_STD_VECTOR - * - Do not use the standard library vector class. - * cl::vector is not defined and may be defined by the user before - * cl2.hpp is included. - * CL_HPP_NO_STD_ARRAY - * - Do not use the standard library array class. - * cl::array is not defined and may be defined by the user before - * cl2.hpp is included. - * CL_HPP_NO_STD_UNIQUE_PTR - * - Do not use the standard library unique_ptr class. - * cl::pointer and the cl::allocate_pointer function are not defined - * and may be defined by the user before cl2.hpp is included. - * CL_HPP_ENABLE_DEVICE_FISSION - * - Enables device fission for OpenCL 1.2 platforms - * CL_HPP_ENABLE_EXCEPTIONS - * - Enable exceptions for use in the C++ bindings header. - * This is the preferred error handling mechanism but is not required. 
- * CL_HPP_ENABLE_SIZE_T_COMPATIBILITY - * - Backward compatibility option to support cl.hpp-style size_t class. - * Replaces the updated std::array derived version and removal of size_t - * from the namespace. Note that in this case the new size_t class - * is placed in the cl::compatibility namespace and thus requires - * an additional using declaration for direct backward compatibility. - * CL_HPP_ENABLE_PROGRAM_CONSTRUCTION_FROM_ARRAY_COMPATIBILITY - * - Enable older vector of pairs interface for construction of programs. - * CL_HPP_CL_1_2_DEFAULT_BUILD - * - Default to OpenCL C 1.2 compilation rather than OpenCL C 2.0 - * - applies to use of cl::Program construction and other program build variants. + * + * - CL_HPP_TARGET_OPENCL_VERSION + * + * Defines the target OpenCL runtime version to build the header + * against. Defaults to 200, representing OpenCL 2.0. + * + * - CL_HPP_NO_STD_STRING + * + * Do not use the standard library string class. cl::string is not + * defined and may be defined by the user before cl2.hpp is + * included. + * + * - CL_HPP_NO_STD_VECTOR + * + * Do not use the standard library vector class. cl::vector is not + * defined and may be defined by the user before cl2.hpp is + * included. + * + * - CL_HPP_NO_STD_ARRAY + * + * Do not use the standard library array class. cl::array is not + * defined and may be defined by the user before cl2.hpp is + * included. + * + * - CL_HPP_NO_STD_UNIQUE_PTR + * + * Do not use the standard library unique_ptr class. cl::pointer and + * the cl::allocate_pointer functions are not defined and may be + * defined by the user before cl2.hpp is included. + * + * - CL_HPP_ENABLE_DEVICE_FISSION + * + * Enables device fission for OpenCL 1.2 platforms. + * + * - CL_HPP_ENABLE_EXCEPTIONS + * + * Enable exceptions for use in the C++ bindings header. This is the + * preferred error handling mechanism but is not required. 
+ * + * - CL_HPP_ENABLE_SIZE_T_COMPATIBILITY + * + * Backward compatibility option to support cl.hpp-style size_t + * class. Replaces the updated std::array derived version and + * removal of size_t from the namespace. Note that in this case the + * new size_t class is placed in the cl::compatibility namespace and + * thus requires an additional using declaration for direct backward + * compatibility. + * + * - CL_HPP_ENABLE_PROGRAM_CONSTRUCTION_FROM_ARRAY_COMPATIBILITY + * + * Enable older vector of pairs interface for construction of + * programs. + * + * - CL_HPP_CL_1_2_DEFAULT_BUILD + * + * Default to OpenCL C 1.2 compilation rather than OpenCL C 2.0 + * applies to use of cl::Program construction and other program + * build variants. * * * \section example Example @@ -177,19 +212,19 @@ * bindings, including support for the optional exception feature and * also the supplied vector and string classes, see following sections for * decriptions of these features. - * + * * \code #define CL_HPP_ENABLE_EXCEPTIONS #define CL_HPP_TARGET_OPENCL_VERSION 200 - + #include #include #include #include #include - + const int numElements = 32; - + int main(void) { // Filter for a 2.0 platform and set it as the default @@ -212,35 +247,45 @@ std::cout << "Error setting default platform."; return -1; } - - std::string kernel1{ - "global int globalA;" - "kernel void updateGlobal(){" - " globalA = 75;" - "}"}; - std::string kernel2{ - "typedef struct { global int *bar; } Foo; kernel void vectorAdd(global const Foo* aNum, global const int *inputA, global const int *inputB, global int *output, int val, write_only pipe int outPipe, queue_t childQueue){" - " output[get_global_id(0)] = inputA[get_global_id(0)] + inputB[get_global_id(0)] + val + *(aNum->bar);" - " write_pipe(outPipe, &val);" - " queue_t default_queue = get_default_queue(); " - " ndrange_t ndrange = ndrange_1D(get_global_size(0)/2, get_global_size(0)/2); " - // Have a child kernel write into third quarter of output - " 
enqueue_kernel(default_queue, CLK_ENQUEUE_FLAGS_WAIT_KERNEL, ndrange, " - " ^{" - " output[get_global_size(0)*2 + get_global_id(0)] = inputA[get_global_size(0)*2+get_global_id(0)] + inputB[get_global_size(0)*2+get_global_id(0)] + globalA;" - " });" - // Have a child kernel write into last quarter of output - " enqueue_kernel(childQueue, CLK_ENQUEUE_FLAGS_WAIT_KERNEL, ndrange, " - " ^{" - " output[get_global_size(0)*3 + get_global_id(0)] = inputA[get_global_size(0)*3 + get_global_id(0)] + inputB[get_global_size(0)*3 + get_global_id(0)] + globalA + 2;" - " });" - "}" }; + + // Use C++11 raw string literals for kernel source code + std::string kernel1{R"CLC( + global int globalA; + kernel void updateGlobal() + { + globalA = 75; + } + )CLC"}; + std::string kernel2{R"CLC( + typedef struct { global int *bar; } Foo; + kernel void vectorAdd(global const Foo* aNum, global const int *inputA, global const int *inputB, + global int *output, int val, write_only pipe int outPipe, queue_t childQueue) + { + output[get_global_id(0)] = inputA[get_global_id(0)] + inputB[get_global_id(0)] + val + *(aNum->bar); + write_pipe(outPipe, &val); + queue_t default_queue = get_default_queue(); + ndrange_t ndrange = ndrange_1D(get_global_size(0)/2, get_global_size(0)/2); + + // Have a child kernel write into third quarter of output + enqueue_kernel(default_queue, CLK_ENQUEUE_FLAGS_WAIT_KERNEL, ndrange, + ^{ + output[get_global_size(0)*2 + get_global_id(0)] = + inputA[get_global_size(0)*2 + get_global_id(0)] + inputB[get_global_size(0)*2 + get_global_id(0)] + globalA; + }); + + // Have a child kernel write into last quarter of output + enqueue_kernel(childQueue, CLK_ENQUEUE_FLAGS_WAIT_KERNEL, ndrange, + ^{ + output[get_global_size(0)*3 + get_global_id(0)] = + inputA[get_global_size(0)*3 + get_global_id(0)] + inputB[get_global_size(0)*3 + get_global_id(0)] + globalA + 2; + }); + } + )CLC"}; // New simpler string interface style std::vector programStrings {kernel1, kernel2}; - cl::Program 
vectorAddProgram( - programStrings); + cl::Program vectorAddProgram(programStrings); try { vectorAddProgram.build("-cl-std=CL2.0"); } @@ -251,7 +296,7 @@ for (auto &pair : buildInfo) { std::cerr << pair.second << std::endl << std::endl; } - + return 1; } @@ -264,17 +309,17 @@ program2Kernel( cl::EnqueueArgs( cl::NDRange(1))); - + ////////////////// // SVM allocations - - cl::pointer anSVMInt = cl::allocate_svm>(); + + auto anSVMInt = cl::allocate_svm>(); *anSVMInt = 5; - cl::SVMAllocator>> svmAllocReadOnly; + cl::SVMAllocator>> svmAllocReadOnly; auto fooPointer = cl::allocate_pointer(svmAllocReadOnly); fooPointer->bar = anSVMInt.get(); cl::SVMAllocator> svmAlloc; - std::vector>> inputA(numElements, 1, svmAlloc); + std::vector>> inputA(numElements, 1, svmAlloc); cl::coarse_svm_vector inputB(numElements, 2, svmAlloc); // @@ -284,7 +329,7 @@ std::vector output(numElements, 0xdeadbeef); cl::Buffer outputBuffer(begin(output), end(output), false); cl::Pipe aPipe(sizeof(cl_int), numElements / 2); - + // Default command queue, also passed in as a parameter cl::DeviceCommandQueue defaultDeviceQueue = cl::DeviceCommandQueue::makeDefault( cl::Context::getDefault(), cl::Device::getDefault()); @@ -339,7 +384,7 @@ return 0; } - * + * * \endcode * */ @@ -3538,7 +3583,7 @@ template cl::pointer> allocate_pointer(const Alloc &alloc_, Args&&... 
args) { Alloc alloc(alloc_); - static const size_t copies = 1; + static const size_type copies = 1; // Ensure that creation of the management block and the // object are dealt with separately such that we only provide a deleter @@ -6520,7 +6565,7 @@ inline cl_int cl::Program::getInfo(cl_program_info name, vectorresize(numBinaries); - for (int i = 0; i < numBinaries; ++i) { + for (size_type i = 0; i < numBinaries; ++i) { (*param)[i].resize(sizes[i]); } @@ -7107,7 +7152,7 @@ public: size_type buffer_slice_pitch, size_type host_row_pitch, size_type host_slice_pitch, - void *ptr, + const void *ptr, const vector* events = NULL, Event* event = NULL) const { @@ -7255,7 +7300,7 @@ public: const array& region, size_type row_pitch, size_type slice_pitch, - void* ptr, + const void* ptr, const vector* events = NULL, Event* event = NULL) const { @@ -7845,7 +7890,7 @@ public: CL_EXT_PREFIX__VERSION_1_2_DEPRECATED cl_int enqueueTask( const Kernel& kernel, const vector* events = NULL, - Event* event = NULL) CL_EXT_SUFFIX__VERSION_1_2_DEPRECATED const + Event* event = NULL) const CL_EXT_SUFFIX__VERSION_1_2_DEPRECATED { cl_event tmp; cl_int err = detail::errHandler( @@ -8873,7 +8918,7 @@ inline cl_int enqueueWriteBufferRect( size_type buffer_slice_pitch, size_type host_row_pitch, size_type host_slice_pitch, - void *ptr, + const void *ptr, const vector* events = NULL, Event* event = NULL) { @@ -8971,7 +9016,7 @@ inline cl_int enqueueWriteImage( const array& region, size_type row_pitch, size_type slice_pitch, - void* ptr, + const void* ptr, const vector* events = NULL, Event* event = NULL) { From fcc9588c947b0becc587d3f021a6afbaa76a8744 Mon Sep 17 00:00:00 2001 From: Tom Deakin Date: Tue, 3 May 2016 11:41:40 +0100 Subject: [PATCH 052/117] Change cl2.hpp include --- src/OCLStream.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/OCLStream.h b/src/OCLStream.h index 3fd144a..79a213f 100644 --- a/src/OCLStream.h +++ b/src/OCLStream.h @@ -8,7 +8,7 @@ #define 
CL_HPP_TARGET_OPENCL_VERSION 120 #define CL_HPP_MINIMUM_OPENCL_VERSION 120 -#include "cl2.hpp" +#include "CL/cl2.hpp" #include "Stream.h" From a355acf2eee57047644ee3021557dd42d8a640b2 Mon Sep 17 00:00:00 2001 From: Tom Deakin Date: Tue, 3 May 2016 11:43:25 +0100 Subject: [PATCH 053/117] Move source files to top level directory --- src/CUDAStream.cu => CUDAStream.cu | 0 src/CUDAStream.h => CUDAStream.h | 0 src/OCLStream.cpp => OCLStream.cpp | 0 src/OCLStream.h => OCLStream.h | 0 src/Stream.h => Stream.h | 0 src/common.h.in => common.h.in | 0 src/main.cpp => main.cpp | 0 7 files changed, 0 insertions(+), 0 deletions(-) rename src/CUDAStream.cu => CUDAStream.cu (100%) rename src/CUDAStream.h => CUDAStream.h (100%) rename src/OCLStream.cpp => OCLStream.cpp (100%) rename src/OCLStream.h => OCLStream.h (100%) rename src/Stream.h => Stream.h (100%) rename src/common.h.in => common.h.in (100%) rename src/main.cpp => main.cpp (100%) diff --git a/src/CUDAStream.cu b/CUDAStream.cu similarity index 100% rename from src/CUDAStream.cu rename to CUDAStream.cu diff --git a/src/CUDAStream.h b/CUDAStream.h similarity index 100% rename from src/CUDAStream.h rename to CUDAStream.h diff --git a/src/OCLStream.cpp b/OCLStream.cpp similarity index 100% rename from src/OCLStream.cpp rename to OCLStream.cpp diff --git a/src/OCLStream.h b/OCLStream.h similarity index 100% rename from src/OCLStream.h rename to OCLStream.h diff --git a/src/Stream.h b/Stream.h similarity index 100% rename from src/Stream.h rename to Stream.h diff --git a/src/common.h.in b/common.h.in similarity index 100% rename from src/common.h.in rename to common.h.in diff --git a/src/main.cpp b/main.cpp similarity index 100% rename from src/main.cpp rename to main.cpp From 8ce15a28aa4a2bb1b086ce9fdde3b0954885518a Mon Sep 17 00:00:00 2001 From: Tom Deakin Date: Tue, 3 May 2016 11:45:25 +0100 Subject: [PATCH 054/117] Update CMake with better binary name and source location --- CMakeLists.txt | 12 ++++++------ 1 file changed, 
6 insertions(+), 6 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 8822353..d71ea52 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -8,21 +8,21 @@ set(gpu-stream_VERSION_MINOR 0) list(APPEND CMAKE_CXX_FLAGS --std=c++11) -configure_file(src/common.h.in src/common.h) +configure_file(common.h.in common.h) find_package(CUDA QUIET) if (${CUDA_FOUND}) - cuda_add_executable(cuda.exe src/main.cpp src/CUDAStream.cu) - target_compile_definitions(cuda.exe PUBLIC CUDA) + cuda_add_executable(gpu-stream-cuda main.cpp CUDAStream.cu) + target_compile_definitions(gpu-stream-cuda PUBLIC CUDA) else (${CUDA_FOUND}) message("Skipping CUDA...") endif (${CUDA_FOUND}) find_package(OpenCL QUIET) if (${OpenCL_FOUND}) - add_executable(ocl.exe src/main.cpp src/OCLStream.cpp) - target_compile_definitions(ocl.exe PUBLIC OCL) - target_link_libraries(ocl.exe ${OpenCL_LIBRARY}) + add_executable(gpu-stream-ocl main.cpp OCLStream.cpp) + target_compile_definitions(gpu-stream-ocl PUBLIC OCL) + target_link_libraries(gpu-stream-ocl ${OpenCL_LIBRARY}) else (${OpenCL_FOUND}) message("Skipping OpenCL...") endif (${OpenCL_FOUND}) From 1bd27428bd421b2baef47ca94e344dabe0f824a4 Mon Sep 17 00:00:00 2001 From: Tom Deakin Date: Tue, 3 May 2016 12:17:21 +0100 Subject: [PATCH 055/117] Require CUDA 7 for C++11 support --- CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index d71ea52..a165b49 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -10,7 +10,7 @@ list(APPEND CMAKE_CXX_FLAGS --std=c++11) configure_file(common.h.in common.h) -find_package(CUDA QUIET) +find_package(CUDA 7.0 QUIET) if (${CUDA_FOUND}) cuda_add_executable(gpu-stream-cuda main.cpp CUDAStream.cu) target_compile_definitions(gpu-stream-cuda PUBLIC CUDA) From 57ea4b8caefcb7d2ffc241a31bb71d0a65f1c227 Mon Sep 17 00:00:00 2001 From: Tom Deakin Date: Tue, 3 May 2016 12:17:33 +0100 Subject: [PATCH 056/117] Require CMake 3.2 so can check for C++11 nicely --- 
CMakeLists.txt | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index a165b49..6aa6aee 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -1,13 +1,14 @@ -cmake_minimum_required(VERSION 2.8) +cmake_minimum_required(VERSION 3.2) + +set(CMAKE_CXX_STANDARD 11) +set(CMAKE_CXX_STANDARD_REQUIRED ON) project(gpu-stream) set(gpu-stream_VERSION_MAJOR 2) set(gpu-stream_VERSION_MINOR 0) -list(APPEND CMAKE_CXX_FLAGS --std=c++11) - configure_file(common.h.in common.h) find_package(CUDA 7.0 QUIET) From 662fcaf4b55bc7ea275f7a5f2ea73ed3eb830c5b Mon Sep 17 00:00:00 2001 From: Tom Deakin Date: Tue, 3 May 2016 12:18:41 +0100 Subject: [PATCH 057/117] Add CMake things to gitignore --- .gitignore | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/.gitignore b/.gitignore index a989b77..0fa7243 100644 --- a/.gitignore +++ b/.gitignore @@ -1,8 +1,16 @@ +common.h + gpu-stream-cuda gpu-stream-ocl + *.o - *.tar - *.gz + +.DS_Store + +CMakeCache.txt +CMakeFiles/ +cmake_install.cmake +Makefile From 21c9022a3f5a435de8c880d1ea51eb949e921a60 Mon Sep 17 00:00:00 2001 From: Tom Deakin Date: Tue, 3 May 2016 12:24:33 +0100 Subject: [PATCH 058/117] Keep C++11 flag explicitely defined in Cmake --- CMakeLists.txt | 2 ++ 1 file changed, 2 insertions(+) diff --git a/CMakeLists.txt b/CMakeLists.txt index 6aa6aee..9dda6cd 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -9,6 +9,8 @@ project(gpu-stream) set(gpu-stream_VERSION_MAJOR 2) set(gpu-stream_VERSION_MINOR 0) +list(APPEND CMAKE_CXX_FLAGS --std=c++11) + configure_file(common.h.in common.h) find_package(CUDA 7.0 QUIET) From 95a10511ec121620244bccc19785a07ae4fb6a5f Mon Sep 17 00:00:00 2001 From: Tom Deakin Date: Tue, 3 May 2016 12:28:44 +0100 Subject: [PATCH 059/117] Update LICENSE date --- LICENSE | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/LICENSE b/LICENSE index 70cebc1..b812c0c 100644 --- a/LICENSE +++ b/LICENSE @@ -1,6 +1,6 @@ 
*============================================================================== *------------------------------------------------------------------------------ -* Copyright 2015: Tom Deakin, Simon McIntosh-Smith, University of Bristol HPC +* Copyright 2016: Tom Deakin, Simon McIntosh-Smith, University of Bristol HPC * Based on John D. McCalpin’s original STREAM benchmark for CPUs *------------------------------------------------------------------------------ * License: From 530b2adda2e0066e20850c97e7fede9795b65944 Mon Sep 17 00:00:00 2001 From: Tom Deakin Date: Tue, 3 May 2016 12:32:03 +0100 Subject: [PATCH 060/117] Add License text to all files --- CUDAStream.cu | 7 +++++++ CUDAStream.h | 8 ++++++++ LICENSE | 2 +- OCLStream.cpp | 6 ++++++ OCLStream.h | 6 ++++++ Stream.h | 6 ++++++ common.h.in | 6 ++++++ main.cpp | 6 ++++++ 8 files changed, 46 insertions(+), 1 deletion(-) diff --git a/CUDAStream.cu b/CUDAStream.cu index 956be7d..3c10e8d 100644 --- a/CUDAStream.cu +++ b/CUDAStream.cu @@ -1,4 +1,11 @@ +// Copyright (c) 2015-16 Tom Deakin, Simon McIntosh-Smith, +// University of Bristol HPC +// +// For full license terms please see the LICENSE file distributed with this +// source code + + #include "CUDAStream.h" void check_error(void) diff --git a/CUDAStream.h b/CUDAStream.h index 61e4882..9c436d6 100644 --- a/CUDAStream.h +++ b/CUDAStream.h @@ -1,4 +1,12 @@ +// Copyright (c) 2015-16 Tom Deakin, Simon McIntosh-Smith, +// University of Bristol HPC +// +// For full license terms please see the LICENSE file distributed with this +// source code + +#pragma once + #include #include diff --git a/LICENSE b/LICENSE index b812c0c..1bc1114 100644 --- a/LICENSE +++ b/LICENSE @@ -1,6 +1,6 @@ *============================================================================== *------------------------------------------------------------------------------ -* Copyright 2016: Tom Deakin, Simon McIntosh-Smith, University of Bristol HPC +* Copyright 2015-16: Tom Deakin, Simon 
McIntosh-Smith, University of Bristol HPC * Based on John D. McCalpin’s original STREAM benchmark for CPUs *------------------------------------------------------------------------------ * License: diff --git a/OCLStream.cpp b/OCLStream.cpp index 5a72b3c..f7c538e 100644 --- a/OCLStream.cpp +++ b/OCLStream.cpp @@ -1,4 +1,10 @@ +// Copyright (c) 2015-16 Tom Deakin, Simon McIntosh-Smith, +// University of Bristol HPC +// +// For full license terms please see the LICENSE file distributed with this +// source code + #include "OCLStream.h" // Cache list of devices diff --git a/OCLStream.h b/OCLStream.h index 79a213f..cb48da5 100644 --- a/OCLStream.h +++ b/OCLStream.h @@ -1,4 +1,10 @@ +// Copyright (c) 2015-16 Tom Deakin, Simon McIntosh-Smith, +// University of Bristol HPC +// +// For full license terms please see the LICENSE file distributed with this +// source code + #pragma once #include diff --git a/Stream.h b/Stream.h index e02c953..671289e 100644 --- a/Stream.h +++ b/Stream.h @@ -1,4 +1,10 @@ +// Copyright (c) 2015-16 Tom Deakin, Simon McIntosh-Smith, +// University of Bristol HPC +// +// For full license terms please see the LICENSE file distributed with this +// source code + #pragma once #include diff --git a/common.h.in b/common.h.in index fbf953c..1b0f38b 100644 --- a/common.h.in +++ b/common.h.in @@ -1,3 +1,9 @@ +// Copyright (c) 2015-16 Tom Deakin, Simon McIntosh-Smith, +// University of Bristol HPC +// +// For full license terms please see the LICENSE file distributed with this +// source code + #define VERSION_STRING "@gpu-stream_VERSION_MAJOR@.@gpu-stream_VERSION_MINOR@" diff --git a/main.cpp b/main.cpp index 457ca9a..96f4e5c 100644 --- a/main.cpp +++ b/main.cpp @@ -1,4 +1,10 @@ +// Copyright (c) 2015-16 Tom Deakin, Simon McIntosh-Smith, +// University of Bristol HPC +// +// For full license terms please see the LICENSE file distributed with this +// source code + #include #include #include From 1a38b189542a97aff9e0b062e803ec5838364255 Mon Sep 17 00:00:00 
2001 From: Tom Deakin Date: Tue, 3 May 2016 14:36:08 +0100 Subject: [PATCH 061/117] Add OpenACC version --- ACCStream.cpp | 113 ++++++++++++++++++++++++++++++++++++++++++++++++++ ACCStream.h | 39 +++++++++++++++++ main.cpp | 7 ++++ 3 files changed, 159 insertions(+) create mode 100644 ACCStream.cpp create mode 100644 ACCStream.h diff --git a/ACCStream.cpp b/ACCStream.cpp new file mode 100644 index 0000000..ca5ef7b --- /dev/null +++ b/ACCStream.cpp @@ -0,0 +1,113 @@ + +#include "ACCStream.h" + +template +ACCStream::ACCStream(const unsigned int ARRAY_SIZE, T *a, T *b, T *c) +{ + array_size = ARRAY_SIZE; + + // Set up data region on device + this->a = a; + this->b = b; + this->c = c; + #pragma acc enter data create(a[0:array_size], b[0:array_size], c[0:array_size]) + {} +} + +template +ACCStream::~ACCStream() +{ + // End data region on device + unsigned int array_size = this->array_size; + T *a = this->a; + T *b = this->b; + T *c = this->c; + #pragma acc exit data delete(a[0:array_size], b[0:array_size], c[0:array_size]) + {} +} + +template +void ACCStream::write_arrays(const std::vector& h_a, const std::vector& h_b, const std::vector& h_c) +{ + T *a = this->a; + T *b = this->b; + T *c = this->c; + #pragma acc update device(a[0:array_size], b[0:array_size], c[0:array_size]) + {} +} + +template +void ACCStream::read_arrays(std::vector& h_a, std::vector& h_b, std::vector& h_c) +{ + T *a = this->a; + T *b = this->b; + T *c = this->c; + #pragma acc update host(a[0:array_size], b[0:array_size], c[0:array_size]) + {} +} + +template +void ACCStream::copy() +{ + unsigned int array_size = this->array_size; + T *a = this->a; + T *c = this->c; + #pragma acc kernels present(a[0:array_size], c[0:array_size]) wait + for (int i = 0; i < array_size; i++) + { + c[i] = a[i]; + } +} + +template +void ACCStream::mul() +{ + const T scalar = 3.0; + + unsigned int array_size = this->array_size; + T *b = this->b; + T *c = this->c; + #pragma acc kernels present(b[0:array_size], 
c[0:array_size]) wait + for (int i = 0; i < array_size; i++) + { + b[i] = scalar * c[i]; + } +} + +template +void ACCStream::add() +{ + unsigned int array_size = this->array_size; + T *a = this->a; + T *b = this->b; + T *c = this->c; + #pragma acc kernels present(a[0:array_size], b[0:array_size], c[0:array_size]) wait + for (int i = 0; i < array_size; i++) + { + c[i] = a[i] + b[i]; + } +} + +template +void ACCStream::triad() +{ + const T scalar = 3.0; + + unsigned int array_size = this->array_size; + T *a = this->a; + T *b = this->b; + T *c = this->c; + #pragma acc kernels present(a[0:array_size], b[0:array_size], c[0:array_size]) wait + for (int i = 0; i < array_size; i++) + { + a[i] = b[i] + scalar * c[i]; + } +} +void listDevices(void) +{ + // Get number of devices +} + +template class ACCStream; +template class ACCStream; + diff --git a/ACCStream.h b/ACCStream.h new file mode 100644 index 0000000..4c69986 --- /dev/null +++ b/ACCStream.h @@ -0,0 +1,39 @@ + +#pragma once + +#include +#include + +#include "Stream.h" + +#include + +#define IMPLEMENTATION_STRING "OpenACC" + +template +class ACCStream : public Stream +{ + protected: + // Size of arrays + unsigned int array_size; + // Device side pointers + T *a; + T *b; + T *c; + + public: + ACCStream(const unsigned int, T*, T*, T*); + ~ACCStream(); + + virtual void copy() override; + virtual void add() override; + virtual void mul() override; + virtual void triad() override; + + virtual void write_arrays(const std::vector& a, const std::vector& b, const std::vector& c) override; + virtual void read_arrays(std::vector& a, std::vector& b, std::vector& c) override; + + + +}; + diff --git a/main.cpp b/main.cpp index 96f4e5c..209fc98 100644 --- a/main.cpp +++ b/main.cpp @@ -22,6 +22,9 @@ #include "CUDAStream.h" #elif defined(OCL) #include "OCLStream.h" +#elif defined(ACC) +#include "ACCStream.h" + #endif @@ -80,6 +83,10 @@ void run() // Use the OpenCL implementation stream = new OCLStream(ARRAY_SIZE, deviceIndex); +#elif 
defined(ACC) + // Use the OpenACC implementation + stream = new ACCStream(ARRAY_SIZE, a.data(), b.data(), c.data()); + #endif stream->write_arrays(a, b, c); From da4f91878834f3e2e3b907003eefcb77538e0f9b Mon Sep 17 00:00:00 2001 From: James Price Date: Tue, 3 May 2016 14:45:13 +0100 Subject: [PATCH 062/117] Add initial SYCL implementation --- CMakeLists.txt | 8 ++++++++ main.cpp | 8 ++++++-- 2 files changed, 14 insertions(+), 2 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 9dda6cd..d43a22a 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -30,3 +30,11 @@ else (${OpenCL_FOUND}) message("Skipping OpenCL...") endif (${OpenCL_FOUND}) +# TODO: Find SYCL implementations somehow +if (true) + add_executable(gpu-stream-sycl main.cpp SYCLStream.cpp) + target_compile_definitions(gpu-stream-sycl PUBLIC SYCL) + set_property(TARGET gpu-stream-sycl PROPERTY CXX_STANDARD 14) +else () + message("Skipping SYCL...") +endif () diff --git a/main.cpp b/main.cpp index 209fc98..f12a7e1 100644 --- a/main.cpp +++ b/main.cpp @@ -24,7 +24,8 @@ #include "OCLStream.h" #elif defined(ACC) #include "ACCStream.h" - +#elif defined(SYCL) +#include "SYCLStream.h" #endif @@ -87,6 +88,10 @@ void run() // Use the OpenACC implementation stream = new ACCStream(ARRAY_SIZE, a.data(), b.data(), c.data()); +#elif defined(SYCL) + // Use the SYCL implementation + stream = new SYCLStream(ARRAY_SIZE, deviceIndex); + #endif stream->write_arrays(a, b, c); @@ -285,4 +290,3 @@ void parseArguments(int argc, char *argv[]) } } } - From 40a0a6551dabbe7a810639275aa60ca9e64ae343 Mon Sep 17 00:00:00 2001 From: James Price Date: Tue, 3 May 2016 14:46:08 +0100 Subject: [PATCH 063/117] Remove extra -std=c++11 from CMake build --- CMakeLists.txt | 2 -- 1 file changed, 2 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index d43a22a..fd9a10d 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -9,8 +9,6 @@ project(gpu-stream) set(gpu-stream_VERSION_MAJOR 2) set(gpu-stream_VERSION_MINOR 0) -list(APPEND 
CMAKE_CXX_FLAGS --std=c++11) - configure_file(common.h.in common.h) find_package(CUDA 7.0 QUIET) From b45f311e0d4e6f3303e78c5327d86b59f9827dea Mon Sep 17 00:00:00 2001 From: James Price Date: Tue, 3 May 2016 14:48:35 +0100 Subject: [PATCH 064/117] Add missing SYCL source files --- SYCLStream.cpp | 144 +++++++++++++++++++++++++++++++++++++++++++++++++ SYCLStream.h | 45 ++++++++++++++++ 2 files changed, 189 insertions(+) create mode 100644 SYCLStream.cpp create mode 100644 SYCLStream.h diff --git a/SYCLStream.cpp b/SYCLStream.cpp new file mode 100644 index 0000000..1314aa4 --- /dev/null +++ b/SYCLStream.cpp @@ -0,0 +1,144 @@ + +// Copyright (c) 2015-16 Tom Deakin, Simon McIntosh-Smith, +// University of Bristol HPC +// +// For full license terms please see the LICENSE file distributed with this +// source code + +#include "SYCLStream.h" + +#include + +using namespace cl::sycl; + +template +SYCLStream::SYCLStream(const unsigned int ARRAY_SIZE, const int device_index) +{ + array_size = ARRAY_SIZE; + + // Create buffers + d_a = buffer(array_size); + d_b = buffer(array_size); + d_c = buffer(array_size); +} + +template +SYCLStream::~SYCLStream() +{ +} + +template +void SYCLStream::copy() +{ + queue.submit([&](handler &cgh) + { + auto ka = d_a.template get_access(cgh); + auto kc = d_c.template get_access(cgh); + cgh.parallel_for(range<1>{array_size}, [=](id<1> index) + { + kc[index] = ka[index]; + }); + }); + queue.wait(); +} + +template +void SYCLStream::mul() +{ + const T scalar = 3.0; + queue.submit([&](handler &cgh) + { + auto kb = d_b.template get_access(cgh); + auto kc = d_c.template get_access(cgh); + cgh.parallel_for(range<1>{array_size}, [=](id<1> index) + { + kb[index] = scalar * kc[index]; + }); + }); + queue.wait(); +} + +template +void SYCLStream::add() +{ + queue.submit([&](handler &cgh) + { + auto ka = d_a.template get_access(cgh); + auto kb = d_b.template get_access(cgh); + auto kc = d_c.template get_access(cgh); + cgh.parallel_for(range<1>{array_size}, 
[=](id<1> index) + { + kc[index] = ka[index] + kb[index]; + }); + }); + queue.wait(); +} + +template +void SYCLStream::triad() +{ + const T scalar = 3.0; + queue.submit([&](handler &cgh) + { + auto ka = d_a.template get_access(cgh); + auto kb = d_b.template get_access(cgh); + auto kc = d_c.template get_access(cgh); + cgh.parallel_for(range<1>{array_size}, [=](id<1> index){ + ka[index] = kb[index] + scalar * kc[index]; + }); + }); + queue.wait(); +} + +template +void SYCLStream::write_arrays(const std::vector& a, const std::vector& b, const std::vector& c) +{ + auto _a = d_a.template get_access(); + auto _b = d_b.template get_access(); + auto _c = d_c.template get_access(); + for (int i = 0; i < array_size; i++) + { + _a[i] = a[i]; + _b[i] = b[i]; + _c[i] = c[i]; + } +} + +template +void SYCLStream::read_arrays(std::vector& a, std::vector& b, std::vector& c) +{ + auto _a = d_a.template get_access(); + auto _b = d_b.template get_access(); + auto _c = d_c.template get_access(); + for (int i = 0; i < array_size; i++) + { + a[i] = _a[i]; + b[i] = _b[i]; + c[i] = _c[i]; + } +} + +void listDevices(void) +{ + // TODO: Get actual list of devices + std::cout << std::endl; + std::cout << "Devices:" << std::endl; + std::cout << "0: " << "triSYCL" << std::endl; + std::cout << std::endl; +} + +std::string getDeviceName(const int device) +{ + // TODO: Implement properly + return "triSYCL"; +} + +std::string getDeviceDriver(const int device) +{ + // TODO: Implement properly + return "triSCYL"; +} + + +template class SYCLStream; +template class SYCLStream; diff --git a/SYCLStream.h b/SYCLStream.h new file mode 100644 index 0000000..e10535e --- /dev/null +++ b/SYCLStream.h @@ -0,0 +1,45 @@ + +// Copyright (c) 2015-16 Tom Deakin, Simon McIntosh-Smith, +// University of Bristol HPC +// +// For full license terms please see the LICENSE file distributed with this +// source code + +#pragma once + +#include "Stream.h" + +#include "CL/sycl.hpp" + +#define IMPLEMENTATION_STRING "SYCL" + 
+template +class SYCLStream : public Stream +{ + protected: + // Size of arrays + unsigned int array_size; + + // SYCL objects + cl::sycl::queue queue; + cl::sycl::buffer d_a; + cl::sycl::buffer d_b; + cl::sycl::buffer d_c; + + public: + + SYCLStream(const unsigned int, const int); + ~SYCLStream(); + + virtual void copy() override; + virtual void add() override; + virtual void mul() override; + virtual void triad() override; + + virtual void write_arrays(const std::vector& a, const std::vector& b, const std::vector& c) override; + virtual void read_arrays(std::vector& a, std::vector& b, std::vector& c) override; + +}; + +// Populate the devices list +void getDeviceList(void); From 0b0de4e0c3c17079d9b9cac3ab64ac0bb09326ff Mon Sep 17 00:00:00 2001 From: Tom Deakin Date: Tue, 3 May 2016 14:50:09 +0100 Subject: [PATCH 065/117] Implement the OpenACC device string functions, and device selector --- ACCStream.cpp | 25 ++++++++++++++++++++++++- ACCStream.h | 2 +- main.cpp | 2 +- 3 files changed, 26 insertions(+), 3 deletions(-) diff --git a/ACCStream.cpp b/ACCStream.cpp index ca5ef7b..85bf600 100644 --- a/ACCStream.cpp +++ b/ACCStream.cpp @@ -2,8 +2,11 @@ #include "ACCStream.h" template -ACCStream::ACCStream(const unsigned int ARRAY_SIZE, T *a, T *b, T *c) +ACCStream::ACCStream(const unsigned int ARRAY_SIZE, T *a, T *b, T *c, int device) { + + acc_set_device_num(device, acc_device_nvidia); + array_size = ARRAY_SIZE; // Set up data region on device @@ -106,8 +109,28 @@ void ACCStream::triad() void listDevices(void) { // Get number of devices + int count = acc_get_num_devices(acc_device_nvidia); + + // Print device list + if (count == 0) + { + std::cerr << "No devices found." << std::endl; + } + else + { + std::cout << "There are " << count << " devices." 
<< std::endl; + } } +std::string getDeviceName(const int) +{ + return std::string("Device name unavailable"); +} + +std::string getDeviceDriver(const int) +{ + return std::string("Device driver unavailable"); +} template class ACCStream; template class ACCStream; diff --git a/ACCStream.h b/ACCStream.h index 4c69986..d6e5728 100644 --- a/ACCStream.h +++ b/ACCStream.h @@ -22,7 +22,7 @@ class ACCStream : public Stream T *c; public: - ACCStream(const unsigned int, T*, T*, T*); + ACCStream(const unsigned int, T*, T*, T*, int); ~ACCStream(); virtual void copy() override; diff --git a/main.cpp b/main.cpp index f12a7e1..59411b8 100644 --- a/main.cpp +++ b/main.cpp @@ -86,7 +86,7 @@ void run() #elif defined(ACC) // Use the OpenACC implementation - stream = new ACCStream(ARRAY_SIZE, a.data(), b.data(), c.data()); + stream = new ACCStream(ARRAY_SIZE, a.data(), b.data(), c.data(), deviceIndex); #elif defined(SYCL) // Use the SYCL implementation From 31819b7778fd4fd7f99c3450b73e6b5f13630629 Mon Sep 17 00:00:00 2001 From: Tom Deakin Date: Tue, 3 May 2016 15:07:51 +0100 Subject: [PATCH 066/117] Add bones of OpenACC in CMake config --- CMakeLists.txt | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/CMakeLists.txt b/CMakeLists.txt index fd9a10d..1a15bc0 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -28,6 +28,15 @@ else (${OpenCL_FOUND}) message("Skipping OpenCL...") endif (${OpenCL_FOUND}) +# TODO: Find OpenACC implementations somehow +if (true) + add_executable(gpu-stream-acc main.cpp ACCStream.cpp) + target_compile_definitions(gpu-stream-acc PUBLIC ACC) + target_compile_options(gpu-stream-acc PUBLIC "-hstd=c++11") +else () + message("Skipping OpenACC...") +endif () + # TODO: Find SYCL implementations somehow if (true) add_executable(gpu-stream-sycl main.cpp SYCLStream.cpp) From 48cae0cbc1bd0bc667f87f43f19978214b81efe1 Mon Sep 17 00:00:00 2001 From: Tom Deakin Date: Tue, 3 May 2016 15:20:26 +0100 Subject: [PATCH 067/117] Make sure CUDA nvcc builds with C++11 --- 
CMakeLists.txt | 1 + 1 file changed, 1 insertion(+) diff --git a/CMakeLists.txt b/CMakeLists.txt index 1a15bc0..71a95d6 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -13,6 +13,7 @@ configure_file(common.h.in common.h) find_package(CUDA 7.0 QUIET) if (${CUDA_FOUND}) + list(APPEND CUDA_NVCC_FLAGS --std=c++11) cuda_add_executable(gpu-stream-cuda main.cpp CUDAStream.cu) target_compile_definitions(gpu-stream-cuda PUBLIC CUDA) else (${CUDA_FOUND}) From f0afa0c1e43c43729dd26cca719d1db21844d279 Mon Sep 17 00:00:00 2001 From: Tom Deakin Date: Wed, 4 May 2016 10:41:41 +0100 Subject: [PATCH 068/117] Add reference OpenMP 3.0 version --- OMP3Stream.cpp | 106 +++++++++++++++++++++++++++++++++++++++++++++++++ OMP3Stream.h | 35 ++++++++++++++++ main.cpp | 7 ++++ 3 files changed, 148 insertions(+) create mode 100644 OMP3Stream.cpp create mode 100644 OMP3Stream.h diff --git a/OMP3Stream.cpp b/OMP3Stream.cpp new file mode 100644 index 0000000..8899cff --- /dev/null +++ b/OMP3Stream.cpp @@ -0,0 +1,106 @@ + +#include "OMP3Stream.h" + +template +OMP3Stream::OMP3Stream(const unsigned int ARRAY_SIZE, T *a, T *b, T *c) +{ + array_size = ARRAY_SIZE; + this->a = (T*)malloc(sizeof(T)*array_size); + this->b = (T*)malloc(sizeof(T)*array_size); + this->c = (T*)malloc(sizeof(T)*array_size); +} + +template +OMP3Stream::~OMP3Stream() +{ + free(a); + free(b); + free(c); +} + + +template +void OMP3Stream::write_arrays(const std::vector& h_a, const std::vector& h_b, const std::vector& h_c) +{ + #pragma omp parallel for + for (int i = 0; i < array_size; i++) + { + a[i] = h_a[i]; + b[i] = h_b[i]; + c[i] = h_c[i]; + } +} + +template +void OMP3Stream::read_arrays(std::vector& h_a, std::vector& h_b, std::vector& h_c) +{ + #pragma omp parallel for + for (int i = 0; i < array_size; i++) + { + h_a[i] = a[i]; + h_b[i] = b[i]; + h_c[i] = c[i]; + } +} + +template +void OMP3Stream::copy() +{ + #pragma omp parallel for + for (int i = 0; i < array_size; i++) + { + c[i] = a[i]; + } +} + +template +void 
OMP3Stream::mul() +{ + const T scalar = 3.0; + #pragma omp parallel for + for (int i = 0; i < array_size; i++) + { + b[i] = scalar * c[i]; + } +} + +template +void OMP3Stream::add() +{ + #pragma omp parallel for + for (int i = 0; i < array_size; i++) + { + c[i] = a[i] + b[i]; + } +} + +template +void OMP3Stream::triad() +{ + const T scalar = 3.0; + #pragma omp parallel for + for (int i = 0; i < array_size; i++) + { + a[i] = b[i] + scalar * c[i]; + } +} + +void listDevices(void) +{ + std::cout << "0: CPU" << std::endl; +} + +std::string getDeviceName(const int) +{ + return std::string("Device name unavailable"); +} + +std::string getDeviceDriver(const int) +{ + return std::string("Device driver unavailable"); +} + + +template class OMP3Stream; +template class OMP3Stream; + diff --git a/OMP3Stream.h b/OMP3Stream.h new file mode 100644 index 0000000..15172c3 --- /dev/null +++ b/OMP3Stream.h @@ -0,0 +1,35 @@ + +#pragma once + +#include +#include + +#include "Stream.h" + +#define IMPLEMENTATION_STRING "Reference OpenMP" + +template +class OMP3Stream : public Stream +{ + protected: + // Size of arrays + unsigned int array_size; + // Device side pointers + T *a; + T *b; + T *c; + + public: + OMP3Stream(const unsigned int, T*, T*, T*); + ~OMP3Stream(); + + virtual void copy() override; + virtual void add() override; + virtual void mul() override; + virtual void triad() override; + + virtual void write_arrays(const std::vector& a, const std::vector& b, const std::vector& c) override; + virtual void read_arrays(std::vector& a, std::vector& b, std::vector& c) override; + +}; + diff --git a/main.cpp b/main.cpp index 59411b8..18b8c72 100644 --- a/main.cpp +++ b/main.cpp @@ -26,6 +26,8 @@ #include "ACCStream.h" #elif defined(SYCL) #include "SYCLStream.h" +#elif defined(OMP3) +#include "OMP3Stream.h" #endif @@ -92,6 +94,11 @@ void run() // Use the SYCL implementation stream = new SYCLStream(ARRAY_SIZE, deviceIndex); +#elif defined(OMP3) + // Use the "reference" OpenMP 3 
implementation + stream = new OMP3Stream(ARRAY_SIZE, a.data(), b.data(), c.data()); + + #endif stream->write_arrays(a, b, c); From 7c28a6386b1905e154815afcfc0f26e1458aaa3d Mon Sep 17 00:00:00 2001 From: Matthew Martineau Date: Thu, 5 May 2016 17:22:29 +0100 Subject: [PATCH 069/117] Added the Kokkos and RAJA implementations --- main.cpp | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) diff --git a/main.cpp b/main.cpp index 96f4e5c..1a92ebc 100644 --- a/main.cpp +++ b/main.cpp @@ -22,9 +22,12 @@ #include "CUDAStream.h" #elif defined(OCL) #include "OCLStream.h" +#elif defined(RAJA) +#include "RAJAStream.hpp" +#elif defined(KOKKOS) +#include "KOKKOSStream.hpp" #endif - unsigned int ARRAY_SIZE = 52428800; unsigned int num_times = 10; unsigned int deviceIndex = 0; @@ -80,6 +83,14 @@ void run() // Use the OpenCL implementation stream = new OCLStream(ARRAY_SIZE, deviceIndex); +#elif defined(RAJA) + // Use the RAJA implementation + stream = new RAJAStream(ARRAY_SIZE, deviceIndex); + +#elif defined(KOKKOS) + // Use the Kokkos implementation + stream = new KOKKOSStream(ARRAY_SIZE, deviceIndex); + #endif stream->write_arrays(a, b, c); From d4b3b3533ce2594eec92d85a937081ba15a17734 Mon Sep 17 00:00:00 2001 From: James Price Date: Fri, 6 May 2016 00:38:30 +0100 Subject: [PATCH 070/117] Update SYCL version to work with ComputeCpp Still needs proper CMake rules and kernel names need to be fixed for multiple template instantiations. 
--- CMakeLists.txt | 15 ++++++++++++--- SYCLStream.cpp | 52 +++++++++++++++++++++++++++----------------------- SYCLStream.h | 6 +++--- main.cpp | 3 +++ 4 files changed, 46 insertions(+), 30 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 71a95d6..b078ab5 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -40,9 +40,18 @@ endif () # TODO: Find SYCL implementations somehow if (true) - add_executable(gpu-stream-sycl main.cpp SYCLStream.cpp) - target_compile_definitions(gpu-stream-sycl PUBLIC SYCL) - set_property(TARGET gpu-stream-sycl PROPERTY CXX_STANDARD 14) + # ComputeCpp + # TODO: Sort this out properly! + add_custom_target(gpu-stream-sycl + COMMAND compute++ ${CMAKE_CURRENT_SOURCE_DIR}/SYCLStream.cpp -sycl -O2 -emit-llvm -o SYCLStream.bc -c + COMMAND /usr/bin/c++ ${CMAKE_CURRENT_SOURCE_DIR}/SYCLStream.cpp -O2 -std=c++11 -include SYCLStream.sycl -o SYCLStream.o -c + COMMAND /usr/bin/c++ ${CMAKE_CURRENT_SOURCE_DIR}/main.cpp -O2 -std=c++11 SYCLStream.o -include SYCLStream.sycl -lSYCL -lOpenCL -o gpu-stream-sycl -DSYCL + ) + + # triSYCL + #add_executable(gpu-stream-sycl main.cpp SYCLStream.cpp) + #target_compile_definitions(gpu-stream-sycl PUBLIC SYCL) + #set_property(TARGET gpu-stream-sycl PROPERTY CXX_STANDARD 14) else () message("Skipping SYCL...") endif () diff --git a/SYCLStream.cpp b/SYCLStream.cpp index 1314aa4..1c4ed17 100644 --- a/SYCLStream.cpp +++ b/SYCLStream.cpp @@ -17,14 +17,17 @@ SYCLStream::SYCLStream(const unsigned int ARRAY_SIZE, const int device_index) array_size = ARRAY_SIZE; // Create buffers - d_a = buffer(array_size); - d_b = buffer(array_size); - d_c = buffer(array_size); + d_a = new buffer(array_size); + d_b = new buffer(array_size); + d_c = new buffer(array_size); } template SYCLStream::~SYCLStream() { + delete d_a; + delete d_b; + delete d_c; } template @@ -32,9 +35,9 @@ void SYCLStream::copy() { queue.submit([&](handler &cgh) { - auto ka = d_a.template get_access(cgh); - auto kc = d_c.template get_access(cgh); - 
cgh.parallel_for(range<1>{array_size}, [=](id<1> index) + auto ka = d_a->template get_access(cgh); + auto kc = d_c->template get_access(cgh); + cgh.parallel_for(range<1>{array_size}, [=](id<1> index) { kc[index] = ka[index]; }); @@ -48,9 +51,9 @@ void SYCLStream::mul() const T scalar = 3.0; queue.submit([&](handler &cgh) { - auto kb = d_b.template get_access(cgh); - auto kc = d_c.template get_access(cgh); - cgh.parallel_for(range<1>{array_size}, [=](id<1> index) + auto kb = d_b->template get_access(cgh); + auto kc = d_c->template get_access(cgh); + cgh.parallel_for(range<1>{array_size}, [=](id<1> index) { kb[index] = scalar * kc[index]; }); @@ -63,10 +66,10 @@ void SYCLStream::add() { queue.submit([&](handler &cgh) { - auto ka = d_a.template get_access(cgh); - auto kb = d_b.template get_access(cgh); - auto kc = d_c.template get_access(cgh); - cgh.parallel_for(range<1>{array_size}, [=](id<1> index) + auto ka = d_a->template get_access(cgh); + auto kb = d_b->template get_access(cgh); + auto kc = d_c->template get_access(cgh); + cgh.parallel_for(range<1>{array_size}, [=](id<1> index) { kc[index] = ka[index] + kb[index]; }); @@ -80,10 +83,10 @@ void SYCLStream::triad() const T scalar = 3.0; queue.submit([&](handler &cgh) { - auto ka = d_a.template get_access(cgh); - auto kb = d_b.template get_access(cgh); - auto kc = d_c.template get_access(cgh); - cgh.parallel_for(range<1>{array_size}, [=](id<1> index){ + auto ka = d_a->template get_access(cgh); + auto kb = d_b->template get_access(cgh); + auto kc = d_c->template get_access(cgh); + cgh.parallel_for(range<1>{array_size}, [=](id<1> index){ ka[index] = kb[index] + scalar * kc[index]; }); }); @@ -93,9 +96,9 @@ void SYCLStream::triad() template void SYCLStream::write_arrays(const std::vector& a, const std::vector& b, const std::vector& c) { - auto _a = d_a.template get_access(); - auto _b = d_b.template get_access(); - auto _c = d_c.template get_access(); + auto _a = d_a->template get_access(); + auto _b = d_b->template 
get_access(); + auto _c = d_c->template get_access(); for (int i = 0; i < array_size; i++) { _a[i] = a[i]; @@ -107,9 +110,9 @@ void SYCLStream::write_arrays(const std::vector& a, const std::vector& template void SYCLStream::read_arrays(std::vector& a, std::vector& b, std::vector& c) { - auto _a = d_a.template get_access(); - auto _b = d_b.template get_access(); - auto _c = d_c.template get_access(); + auto _a = d_a->template get_access(); + auto _b = d_b->template get_access(); + auto _c = d_c->template get_access(); for (int i = 0; i < array_size; i++) { a[i] = _a[i]; @@ -140,5 +143,6 @@ std::string getDeviceDriver(const int device) } -template class SYCLStream; +// TODO: Fix kernel names to allow multiple template specializations +//template class SYCLStream; template class SYCLStream; diff --git a/SYCLStream.h b/SYCLStream.h index e10535e..f4d79d5 100644 --- a/SYCLStream.h +++ b/SYCLStream.h @@ -22,9 +22,9 @@ class SYCLStream : public Stream // SYCL objects cl::sycl::queue queue; - cl::sycl::buffer d_a; - cl::sycl::buffer d_b; - cl::sycl::buffer d_c; + cl::sycl::buffer *d_a; + cl::sycl::buffer *d_b; + cl::sycl::buffer *d_c; public: diff --git a/main.cpp b/main.cpp index 18b8c72..e0911d2 100644 --- a/main.cpp +++ b/main.cpp @@ -54,9 +54,12 @@ int main(int argc, char *argv[]) parseArguments(argc, argv); + // TODO: Fix SYCL to allow multiple template specializations +#ifndef SYCL if (use_float) run(); else +#endif run(); } From 45381da0b2cc473ad6b562394c422a3176b21f83 Mon Sep 17 00:00:00 2001 From: Matthew Martineau Date: Fri, 6 May 2016 10:46:35 +0100 Subject: [PATCH 071/117] Initial commit of in progress developments of RAJA and KOKKOS stream --- KOKKOSStream.cpp | 123 +++++++++++++++++++++++++++++++++++++++++++++++ KOKKOSStream.hpp | 53 ++++++++++++++++++++ RAJAStream.cpp | 105 ++++++++++++++++++++++++++++++++++++++++ RAJAStream.hpp | 58 ++++++++++++++++++++++ 4 files changed, 339 insertions(+) create mode 100644 KOKKOSStream.cpp create mode 100644 
KOKKOSStream.hpp create mode 100644 RAJAStream.cpp create mode 100644 RAJAStream.hpp diff --git a/KOKKOSStream.cpp b/KOKKOSStream.cpp new file mode 100644 index 0000000..c4d548b --- /dev/null +++ b/KOKKOSStream.cpp @@ -0,0 +1,123 @@ +// Copyright (c) 2015-16 Tom Deakin, Simon McIntosh-Smith, +// University of Bristol HPC +// +// For full license terms please see the LICENSE file distributed with this +// source code + + +#include "KOKKOSStream.hpp" + +using Kokkos::parallel_for; + +template +KOKKOSStream::KOKKOSStream( + const unsigned int ARRAY_SIZE, const int device_index) + : array_size(ARRAY_SIZE) +{ + Kokkos::initialize(); + + new(d_a) Kokkos::View("d_a", ARRAY_SIZE); + new(d_b) Kokkos::View("d_b", ARRAY_SIZE); + new(d_c) Kokkos::View("d_c", ARRAY_SIZE); + new(hm_a) Kokkos::View::HostMirror(); + new(hm_b) Kokkos::View::HostMirror(); + new(hm_c) Kokkos::View::HostMirror(); + hm_a = Kokkos::create_mirror_view(d_a); + hm_b = Kokkos::create_mirror_view(d_b); + hm_c = Kokkos::create_mirror_view(d_c); +} + +template +KOKKOSStream::~KOKKOSStream() +{ + Kokkos::finalize(); +} + +template +void KOKKOSStream::write_arrays( + const std::vector& a, const std::vector& b, const std::vector& c) +{ + for(int ii = 0; ii < array_size; ++ii) + { + hm_a(ii) = a[ii]; + hm_b(ii) = b[ii]; + hm_c(ii) = c[ii]; + } + Kokkos::deep_copy(hm_a, d_a); + Kokkos::deep_copy(hm_b, d_b); + Kokkos::deep_copy(hm_c, d_c); +} + +template +void KOKKOSStream::read_arrays( + std::vector& a, std::vector& b, std::vector& c) +{ + Kokkos::deep_copy(d_a, hm_a); + Kokkos::deep_copy(d_a, hm_b); + Kokkos::deep_copy(d_a, hm_c); + for(int ii = 0; ii < array_size; ++ii) + { + a[ii] = hm_a(ii); + b[ii] = hm_b(ii); + c[ii] = hm_c(ii); + } +} + +template +void KOKKOSStream::copy() +{ + Kokkos::parallel_for(array_size, KOKKOS_LAMBDA (const int index) + { + d_c[index] = d_a[index]; + }); +} + +template +void KOKKOSStream::mul() +{ + const T scalar = 3.0; + parallel_for(array_size, KOKKOS_LAMBDA (const int index) + { + 
d_b[index] = scalar*d_c[index]; + }); +} + +template +void KOKKOSStream::add() +{ + parallel_for(array_size, KOKKOS_LAMBDA (const int index) + { + d_c[index] = d_a[index] + d_b[index]; + }); +} + +template +void KOKKOSStream::triad() +{ + const T scalar = 3.0; + parallel_for(array_size, KOKKOS_LAMBDA (const int index) + { + d_a[index] = d_b[index] + scalar*d_c[index]; + }); +} + +void listDevices(void) +{ + std::cout << "This is not the device you are looking for."; +} + + +std::string getDeviceName(const int device) +{ + return "Kokkos"; +} + + +std::string getDeviceDriver(const int device) +{ + return "Kokkos"; +} + +template class KOKKOSStream; +template class KOKKOSStream; + diff --git a/KOKKOSStream.hpp b/KOKKOSStream.hpp new file mode 100644 index 0000000..632ca20 --- /dev/null +++ b/KOKKOSStream.hpp @@ -0,0 +1,53 @@ +// Copyright (c) 2015-16 Tom Deakin, Simon McIntosh-Smith, +// University of Bristol HPC +// +// For full license terms please see the LICENSE file distributed with this +// source code + +#pragma once + +#include +#include + +#include +#include +#include + +#include "Stream.h" + +#define IMPLEMENTATION_STRING "KOKKOS" + +#define DEVICE Kokkos::OpenMP + + +template +class KOKKOSStream : public Stream +{ + protected: + // Size of arrays + unsigned int array_size; + + // Device side pointers to arrays + Kokkos::View d_a; + Kokkos::View d_b; + Kokkos::View d_c; + Kokkos::View::HostMirror hm_a; + Kokkos::View::HostMirror hm_b; + Kokkos::View::HostMirror hm_c; + + public: + + KOKKOSStream(const unsigned int, const int); + ~KOKKOSStream(); + + virtual void copy() override; + virtual void add() override; + virtual void mul() override; + virtual void triad() override; + + virtual void write_arrays( + const std::vector& a, const std::vector& b, const std::vector& c) override; + virtual void read_arrays( + std::vector& a, std::vector& b, std::vector& c) override; +}; + diff --git a/RAJAStream.cpp b/RAJAStream.cpp new file mode 100644 index 
0000000..5b1c980 --- /dev/null +++ b/RAJAStream.cpp @@ -0,0 +1,105 @@ + +// Copyright (c) 2015-16 Tom Deakin, Simon McIntosh-Smith, +// University of Bristol HPC +// +// For full license terms please see the LICENSE file distributed with this +// source code + +#include "RAJAStream.hpp" + +using RAJA::forall; +using RAJA::RangeSegment; + +template +RAJAStream::RAJAStream(const unsigned int ARRAY_SIZE, const int device_index) + : array_size(ARRAY_SIZE) +{ + RangeSegment seg(0, ARRAY_SIZE); + index_set.push_back(seg); + d_a = new T[ARRAY_SIZE]; + d_b = new T[ARRAY_SIZE]; + d_c = new T[ARRAY_SIZE]; +} + +template +RAJAStream::~RAJAStream() +{ + delete[] d_a; + delete[] d_b; + delete[] d_c; +} + +template +void RAJAStream::write_arrays(const std::vector& a, const std::vector& b, const std::vector& c) +{ + std::copy(a.begin(), a.end(), d_a); + std::copy(b.begin(), b.end(), d_b); + std::copy(c.begin(), c.end(), d_c); +} + +template +void RAJAStream::read_arrays(std::vector& a, std::vector& b, std::vector& c) +{ + std::copy(d_a, d_a + array_size - 1, a.data()); + std::copy(d_b, d_b + array_size - 1, b.data()); + std::copy(d_c, d_c + array_size - 1, c.data()); +} + +template +void RAJAStream::copy() +{ + forall(index_set, [=] RAJA_DEVICE (int index) + { + d_c[index] = d_a[index]; + }); +} + +template +void RAJAStream::mul() +{ + const T scalar = 3.0; + forall(index_set, [=] RAJA_DEVICE (int index) + { + d_b[index] = scalar*d_c[index]; + }); +} + +template +void RAJAStream::add() +{ + forall(index_set, [=] RAJA_DEVICE (int index) + { + d_c[index] = d_a[index] + d_b[index]; + }); +} + +template +void RAJAStream::triad() +{ + const T scalar = 3.0; + forall(index_set, [=] RAJA_DEVICE (int index) + { + d_a[index] = d_b[index] + scalar*d_c[index]; + }); +} + +void listDevices(void) +{ + std::cout << "This is not the device you are looking for."; +} + + +std::string getDeviceName(const int device) +{ + return "RAJA"; +} + + +std::string getDeviceDriver(const int device) +{ + 
return "RAJA"; +} + +template class RAJAStream; +template class RAJAStream; + diff --git a/RAJAStream.hpp b/RAJAStream.hpp new file mode 100644 index 0000000..a41c60e --- /dev/null +++ b/RAJAStream.hpp @@ -0,0 +1,58 @@ +// Copyright (c) 2015-16 Tom Deakin, Simon McIntosh-Smith, +// University of Bristol HPC +// +// For full license terms please see the LICENSE file distributed with this +// source code + +#pragma once + +#include +#include +#include "RAJA/RAJA.hxx" + +#include "Stream.h" + +#define IMPLEMENTATION_STRING "RAJA" + +#ifdef RAJA_USE_CUDA +const size_t block_size = 128; +typedef RAJA::IndexSet::ExecPolicy< + RAJA::seq_segit, + RAJA::cuda_exec_async> policy; +#else +typedef RAJA::IndexSet::ExecPolicy< + RAJA::seq_segit, + RAJA::omp_parallel_for_exec> policy; +#endif + +template +class RAJAStream : public Stream +{ + protected: + // Size of arrays + unsigned int array_size; + + // Contains iteration space + RAJA::IndexSet index_set; + + // Device side pointers to arrays + T* d_a; + T* d_b; + T* d_c; + + public: + + RAJAStream(const unsigned int, const int); + ~RAJAStream(); + + virtual void copy() override; + virtual void add() override; + virtual void mul() override; + virtual void triad() override; + + virtual void write_arrays( + const std::vector& a, const std::vector& b, const std::vector& c) override; + virtual void read_arrays( + std::vector& a, std::vector& b, std::vector& c) override; +}; + From 3b266b826610abbc8dd3ea3f94bf076a30a758ef Mon Sep 17 00:00:00 2001 From: Matthew Martineau Date: Fri, 6 May 2016 10:51:35 +0100 Subject: [PATCH 072/117] Fix for namespace collision with #define RAJA --- main.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/main.cpp b/main.cpp index 5d3a559..f67fdd6 100644 --- a/main.cpp +++ b/main.cpp @@ -22,7 +22,7 @@ #include "CUDAStream.h" #elif defined(OCL) #include "OCLStream.h" -#elif defined(RAJA) +#elif defined(USE_RAJA) #include "RAJAStream.hpp" #elif defined(KOKKOS) #include 
"KOKKOSStream.hpp" @@ -89,7 +89,7 @@ void run() // Use the OpenCL implementation stream = new OCLStream(ARRAY_SIZE, deviceIndex); -#elif defined(RAJA) +#elif defined(USE_RAJA) // Use the RAJA implementation stream = new RAJAStream(ARRAY_SIZE, deviceIndex); From 1a60f130eba160091f5cd5915caeca9ed4e536f9 Mon Sep 17 00:00:00 2001 From: Matthew Martineau Date: Fri, 6 May 2016 13:17:04 +0100 Subject: [PATCH 073/117] Fixed memory management for GPU, now working with OpenMP and CUDA --- RAJAStream.cpp | 44 +++++++++++++++++++++++++++++++++++--------- RAJAStream.hpp | 12 ++++++------ 2 files changed, 41 insertions(+), 15 deletions(-) diff --git a/RAJAStream.cpp b/RAJAStream.cpp index 5b1c980..eb98d54 100644 --- a/RAJAStream.cpp +++ b/RAJAStream.cpp @@ -16,21 +16,36 @@ RAJAStream::RAJAStream(const unsigned int ARRAY_SIZE, const int device_index) { RangeSegment seg(0, ARRAY_SIZE); index_set.push_back(seg); + +#ifdef RAJA_TARGET_CPU d_a = new T[ARRAY_SIZE]; d_b = new T[ARRAY_SIZE]; d_c = new T[ARRAY_SIZE]; +#else + cudaMallocManaged((void**)&d_a, sizeof(T)*ARRAY_SIZE, cudaMemAttachGlobal); + cudaMallocManaged((void**)&d_b, sizeof(T)*ARRAY_SIZE, cudaMemAttachGlobal); + cudaMallocManaged((void**)&d_c, sizeof(T)*ARRAY_SIZE, cudaMemAttachGlobal); + cudaDeviceSynchronize(); +#endif } template RAJAStream::~RAJAStream() { +#ifdef RAJA_TARGET_CPU delete[] d_a; delete[] d_b; delete[] d_c; +#else + cudaFree(d_a); + cudaFree(d_b); + cudaFree(d_c); +#endif } template -void RAJAStream::write_arrays(const std::vector& a, const std::vector& b, const std::vector& c) +void RAJAStream::write_arrays( + const std::vector& a, const std::vector& b, const std::vector& c) { std::copy(a.begin(), a.end(), d_a); std::copy(b.begin(), b.end(), d_b); @@ -38,48 +53,59 @@ void RAJAStream::write_arrays(const std::vector& a, const std::vector& } template -void RAJAStream::read_arrays(std::vector& a, std::vector& b, std::vector& c) +void RAJAStream::read_arrays( + std::vector& a, std::vector& b, std::vector& c) 
{ - std::copy(d_a, d_a + array_size - 1, a.data()); - std::copy(d_b, d_b + array_size - 1, b.data()); - std::copy(d_c, d_c + array_size - 1, c.data()); + std::copy(d_a, d_a + array_size, a.data()); + std::copy(d_b, d_b + array_size, b.data()); + std::copy(d_c, d_c + array_size, c.data()); } template void RAJAStream::copy() { + T* a = d_a; + T* c = d_c; forall(index_set, [=] RAJA_DEVICE (int index) { - d_c[index] = d_a[index]; + c[index] = a[index]; }); } template void RAJAStream::mul() { + T* b = d_b; + T* c = d_c; const T scalar = 3.0; forall(index_set, [=] RAJA_DEVICE (int index) { - d_b[index] = scalar*d_c[index]; + b[index] = scalar*c[index]; }); } template void RAJAStream::add() { + T* a = d_a; + T* b = d_b; + T* c = d_c; forall(index_set, [=] RAJA_DEVICE (int index) { - d_c[index] = d_a[index] + d_b[index]; + c[index] = a[index] + b[index]; }); } template void RAJAStream::triad() { + T* a = d_a; + T* b = d_b; + T* c = d_c; const T scalar = 3.0; forall(index_set, [=] RAJA_DEVICE (int index) { - d_a[index] = d_b[index] + scalar*d_c[index]; + a[index] = b[index] + scalar*c[index]; }); } diff --git a/RAJAStream.hpp b/RAJAStream.hpp index a41c60e..454e20e 100644 --- a/RAJAStream.hpp +++ b/RAJAStream.hpp @@ -14,15 +14,15 @@ #define IMPLEMENTATION_STRING "RAJA" -#ifdef RAJA_USE_CUDA -const size_t block_size = 128; -typedef RAJA::IndexSet::ExecPolicy< - RAJA::seq_segit, - RAJA::cuda_exec_async> policy; -#else +#ifdef RAJA_TARGET_CPU typedef RAJA::IndexSet::ExecPolicy< RAJA::seq_segit, RAJA::omp_parallel_for_exec> policy; +#else +const size_t block_size = 128; +typedef RAJA::IndexSet::ExecPolicy< + RAJA::seq_segit, + RAJA::cuda_exec> policy; #endif template From 894829cb05143c908923a831e9f0d13072ef640a Mon Sep 17 00:00:00 2001 From: Matthew Martineau Date: Fri, 6 May 2016 21:02:44 +0100 Subject: [PATCH 074/117] Adjusted the Kokkos implementation to fix view initialisation, and store local copies of views for lambda scoping --- KOKKOSStream.cpp | 74 
+++++++++++++++++++++++++++++------------------- KOKKOSStream.hpp | 19 +++++++------ main.cpp | 2 ++ 3 files changed, 58 insertions(+), 37 deletions(-) diff --git a/KOKKOSStream.cpp b/KOKKOSStream.cpp index c4d548b..3834081 100644 --- a/KOKKOSStream.cpp +++ b/KOKKOSStream.cpp @@ -7,7 +7,7 @@ #include "KOKKOSStream.hpp" -using Kokkos::parallel_for; +using namespace Kokkos; template KOKKOSStream::KOKKOSStream( @@ -16,21 +16,21 @@ KOKKOSStream::KOKKOSStream( { Kokkos::initialize(); - new(d_a) Kokkos::View("d_a", ARRAY_SIZE); - new(d_b) Kokkos::View("d_b", ARRAY_SIZE); - new(d_c) Kokkos::View("d_c", ARRAY_SIZE); - new(hm_a) Kokkos::View::HostMirror(); - new(hm_b) Kokkos::View::HostMirror(); - new(hm_c) Kokkos::View::HostMirror(); - hm_a = Kokkos::create_mirror_view(d_a); - hm_b = Kokkos::create_mirror_view(d_b); - hm_c = Kokkos::create_mirror_view(d_c); + d_a = new View("d_a", ARRAY_SIZE); + d_b = new View("d_b", ARRAY_SIZE); + d_c = new View("d_c", ARRAY_SIZE); + hm_a = new View::HostMirror(); + hm_b = new View::HostMirror(); + hm_c = new View::HostMirror(); + *hm_a = create_mirror_view(*d_a); + *hm_b = create_mirror_view(*d_b); + *hm_c = create_mirror_view(*d_c); } template KOKKOSStream::~KOKKOSStream() { - Kokkos::finalize(); + finalize(); } template @@ -39,65 +39,81 @@ void KOKKOSStream::write_arrays( { for(int ii = 0; ii < array_size; ++ii) { - hm_a(ii) = a[ii]; - hm_b(ii) = b[ii]; - hm_c(ii) = c[ii]; + (*hm_a)(ii) = a[ii]; + (*hm_b)(ii) = b[ii]; + (*hm_c)(ii) = c[ii]; } - Kokkos::deep_copy(hm_a, d_a); - Kokkos::deep_copy(hm_b, d_b); - Kokkos::deep_copy(hm_c, d_c); + deep_copy(*hm_a, *d_a); + deep_copy(*hm_b, *d_b); + deep_copy(*hm_c, *d_c); } template void KOKKOSStream::read_arrays( std::vector& a, std::vector& b, std::vector& c) { - Kokkos::deep_copy(d_a, hm_a); - Kokkos::deep_copy(d_a, hm_b); - Kokkos::deep_copy(d_a, hm_c); + deep_copy(*d_a, *hm_a); + deep_copy(*d_b, *hm_b); + deep_copy(*d_c, *hm_c); for(int ii = 0; ii < array_size; ++ii) { - a[ii] = hm_a(ii); 
- b[ii] = hm_b(ii); - c[ii] = hm_c(ii); + a[ii] = (*hm_a)(ii); + b[ii] = (*hm_b)(ii); + c[ii] = (*hm_c)(ii); } } template void KOKKOSStream::copy() { - Kokkos::parallel_for(array_size, KOKKOS_LAMBDA (const int index) + View a(*d_a); + View b(*d_b); + View c(*d_c); + + parallel_for(array_size, KOKKOS_LAMBDA (const int index) { - d_c[index] = d_a[index]; + c[index] = a[index]; }); } template void KOKKOSStream::mul() { + View a(*d_a); + View b(*d_b); + View c(*d_c); + const T scalar = 3.0; parallel_for(array_size, KOKKOS_LAMBDA (const int index) { - d_b[index] = scalar*d_c[index]; + b[index] = scalar*c[index]; }); } template void KOKKOSStream::add() { + View a(*d_a); + View b(*d_b); + View c(*d_c); + parallel_for(array_size, KOKKOS_LAMBDA (const int index) { - d_c[index] = d_a[index] + d_b[index]; + c[index] = a[index] + b[index]; }); } template void KOKKOSStream::triad() { + View a(*d_a); + View b(*d_b); + View c(*d_c); + const T scalar = 3.0; parallel_for(array_size, KOKKOS_LAMBDA (const int index) { - d_a[index] = d_b[index] + scalar*d_c[index]; + a[index] = b[index] + scalar*c[index]; }); } @@ -118,6 +134,6 @@ std::string getDeviceDriver(const int device) return "Kokkos"; } -template class KOKKOSStream; +//template class KOKKOSStream; template class KOKKOSStream; diff --git a/KOKKOSStream.hpp b/KOKKOSStream.hpp index 632ca20..d2b9665 100644 --- a/KOKKOSStream.hpp +++ b/KOKKOSStream.hpp @@ -17,8 +17,11 @@ #define IMPLEMENTATION_STRING "KOKKOS" -#define DEVICE Kokkos::OpenMP - +#ifdef KOKKOS_TARGET_CPU + #define DEVICE Kokkos::OpenMP +#else + #define DEVICE Kokkos::Cuda +#endif template class KOKKOSStream : public Stream @@ -28,12 +31,12 @@ class KOKKOSStream : public Stream unsigned int array_size; // Device side pointers to arrays - Kokkos::View d_a; - Kokkos::View d_b; - Kokkos::View d_c; - Kokkos::View::HostMirror hm_a; - Kokkos::View::HostMirror hm_b; - Kokkos::View::HostMirror hm_c; + Kokkos::View* d_a; + Kokkos::View* d_b; + Kokkos::View* d_c; + 
Kokkos::View::HostMirror* hm_a; + Kokkos::View::HostMirror* hm_b; + Kokkos::View::HostMirror* hm_c; public: diff --git a/main.cpp b/main.cpp index 4794f9b..007ab7f 100644 --- a/main.cpp +++ b/main.cpp @@ -59,9 +59,11 @@ int main(int argc, char *argv[]) // TODO: Fix SYCL to allow multiple template specializations #ifndef SYCL +#ifndef KOKKOS if (use_float) run(); else +#endif #endif run(); From 0f0454ec295b3c9fe2ada150534fb5d2e332ccce Mon Sep 17 00:00:00 2001 From: Matthew Martineau Date: Fri, 6 May 2016 21:05:20 +0100 Subject: [PATCH 075/117] Added CUDA device syncs to force proper timing --- KOKKOSStream.cpp | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/KOKKOSStream.cpp b/KOKKOSStream.cpp index 3834081..0c3f44c 100644 --- a/KOKKOSStream.cpp +++ b/KOKKOSStream.cpp @@ -74,6 +74,7 @@ void KOKKOSStream::copy() { c[index] = a[index]; }); + cudaDeviceSynchronize(); } template @@ -88,6 +89,7 @@ void KOKKOSStream::mul() { b[index] = scalar*c[index]; }); + cudaDeviceSynchronize(); } template @@ -101,6 +103,8 @@ void KOKKOSStream::add() { c[index] = a[index] + b[index]; }); + + cudaDeviceSynchronize(); } template @@ -115,6 +119,8 @@ void KOKKOSStream::triad() { a[index] = b[index] + scalar*c[index]; }); + + cudaDeviceSynchronize(); } void listDevices(void) From 6e9b85bb26221114c18f53606aed5ae6fd387c0a Mon Sep 17 00:00:00 2001 From: Matthew Martineau Date: Fri, 6 May 2016 21:08:23 +0100 Subject: [PATCH 076/117] Fixed deep copy ordering, which was reversed --- KOKKOSStream.cpp | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/KOKKOSStream.cpp b/KOKKOSStream.cpp index 0c3f44c..d93b6d7 100644 --- a/KOKKOSStream.cpp +++ b/KOKKOSStream.cpp @@ -43,18 +43,18 @@ void KOKKOSStream::write_arrays( (*hm_b)(ii) = b[ii]; (*hm_c)(ii) = c[ii]; } - deep_copy(*hm_a, *d_a); - deep_copy(*hm_b, *d_b); - deep_copy(*hm_c, *d_c); + deep_copy(*d_a, *hm_a); + deep_copy(*d_b, *hm_b); + deep_copy(*d_c, *hm_c); } template void KOKKOSStream::read_arrays( 
std::vector& a, std::vector& b, std::vector& c) { - deep_copy(*d_a, *hm_a); - deep_copy(*d_b, *hm_b); - deep_copy(*d_c, *hm_c); + deep_copy(*hm_a, *d_a); + deep_copy(*hm_b, *d_b); + deep_copy(*hm_c, *d_c); for(int ii = 0; ii < array_size; ++ii) { a[ii] = (*hm_a)(ii); From fb8f06e683ab995e686e8300b17e55fad57ceb51 Mon Sep 17 00:00:00 2001 From: James Price Date: Fri, 6 May 2016 22:33:18 +0100 Subject: [PATCH 077/117] [SYCL] Pass -no-serial-memop to compute++ to squelch warning --- CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index b078ab5..64bfd2e 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -43,7 +43,7 @@ if (true) # ComputeCpp # TODO: Sort this out properly! add_custom_target(gpu-stream-sycl - COMMAND compute++ ${CMAKE_CURRENT_SOURCE_DIR}/SYCLStream.cpp -sycl -O2 -emit-llvm -o SYCLStream.bc -c + COMMAND compute++ ${CMAKE_CURRENT_SOURCE_DIR}/SYCLStream.cpp -sycl -no-serial-memop -O2 -emit-llvm -o SYCLStream.bc -c COMMAND /usr/bin/c++ ${CMAKE_CURRENT_SOURCE_DIR}/SYCLStream.cpp -O2 -std=c++11 -include SYCLStream.sycl -o SYCLStream.o -c COMMAND /usr/bin/c++ ${CMAKE_CURRENT_SOURCE_DIR}/main.cpp -O2 -std=c++11 SYCLStream.o -include SYCLStream.sycl -lSYCL -lOpenCL -o gpu-stream-sycl -DSYCL ) From 54834e05f4b855451b3f266aafd21420441f17c7 Mon Sep 17 00:00:00 2001 From: James Price Date: Fri, 6 May 2016 22:41:10 +0100 Subject: [PATCH 078/117] [SYCL] Use nd_range instead of range to specify work-group size --- SYCLStream.cpp | 19 +++++++++++-------- 1 file changed, 11 insertions(+), 8 deletions(-) diff --git a/SYCLStream.cpp b/SYCLStream.cpp index 1c4ed17..fc76a82 100644 --- a/SYCLStream.cpp +++ b/SYCLStream.cpp @@ -11,6 +11,8 @@ using namespace cl::sycl; +#define WGSIZE 64 + template SYCLStream::SYCLStream(const unsigned int ARRAY_SIZE, const int device_index) { @@ -37,9 +39,9 @@ void SYCLStream::copy() { auto ka = d_a->template get_access(cgh); auto kc = d_c->template get_access(cgh); - 
cgh.parallel_for(range<1>{array_size}, [=](id<1> index) + cgh.parallel_for(nd_range<1>{array_size, WGSIZE}, [=](nd_item<1> item) { - kc[index] = ka[index]; + kc[item.get_global()] = ka[item.get_global()]; }); }); queue.wait(); @@ -53,9 +55,9 @@ void SYCLStream::mul() { auto kb = d_b->template get_access(cgh); auto kc = d_c->template get_access(cgh); - cgh.parallel_for(range<1>{array_size}, [=](id<1> index) + cgh.parallel_for(nd_range<1>{array_size, WGSIZE}, [=](nd_item<1> item) { - kb[index] = scalar * kc[index]; + kb[item.get_global()] = scalar * kc[item.get_global()]; }); }); queue.wait(); @@ -69,9 +71,9 @@ void SYCLStream::add() auto ka = d_a->template get_access(cgh); auto kb = d_b->template get_access(cgh); auto kc = d_c->template get_access(cgh); - cgh.parallel_for(range<1>{array_size}, [=](id<1> index) + cgh.parallel_for(nd_range<1>{array_size, WGSIZE}, [=](nd_item<1> item) { - kc[index] = ka[index] + kb[index]; + kc[item.get_global()] = ka[item.get_global()] + kb[item.get_global()]; }); }); queue.wait(); @@ -86,8 +88,9 @@ void SYCLStream::triad() auto ka = d_a->template get_access(cgh); auto kb = d_b->template get_access(cgh); auto kc = d_c->template get_access(cgh); - cgh.parallel_for(range<1>{array_size}, [=](id<1> index){ - ka[index] = kb[index] + scalar * kc[index]; + cgh.parallel_for(nd_range<1>{array_size, WGSIZE}, [=](nd_item<1> item) + { + ka[item.get_global()] = kb[item.get_global()] + scalar * kc[item.get_global()]; }); }); queue.wait(); From 3b3f6dfc260f9fb0928753dd699435ae5f45cc80 Mon Sep 17 00:00:00 2001 From: James Price Date: Sun, 8 May 2016 19:22:09 +0100 Subject: [PATCH 079/117] [SYCL] Implement device list/selection functionality --- SYCLStream.cpp | 80 ++++++++++++++++++++++++++++++++++++++++++++------ 1 file changed, 71 insertions(+), 9 deletions(-) diff --git a/SYCLStream.cpp b/SYCLStream.cpp index fc76a82..c6947e7 100644 --- a/SYCLStream.cpp +++ b/SYCLStream.cpp @@ -13,11 +13,20 @@ using namespace cl::sycl; #define WGSIZE 64 +// Cache 
list of devices +bool cached = false; +std::vector devices; +void getDeviceList(void); + template SYCLStream::SYCLStream(const unsigned int ARRAY_SIZE, const int device_index) { array_size = ARRAY_SIZE; + // Print out device information + std::cout << "Using SYCL device " << getDeviceName(device_index) << std::endl; + std::cout << "Driver: " << getDeviceDriver(device_index) << std::endl; + // Create buffers d_a = new buffer(array_size); d_b = new buffer(array_size); @@ -124,25 +133,78 @@ void SYCLStream::read_arrays(std::vector& a, std::vector& b, std::vecto } } +void getDeviceList(void) +{ + // Get list of platforms + std::vector platforms = platform::get_platforms(); + + // Enumerate devices + for (unsigned i = 0; i < platforms.size(); i++) + { + std::vector plat_devices = platforms[i].get_devices(); + devices.insert(devices.end(), plat_devices.begin(), plat_devices.end()); + } + cached = true; +} + void listDevices(void) { - // TODO: Get actual list of devices - std::cout << std::endl; - std::cout << "Devices:" << std::endl; - std::cout << "0: " << "triSYCL" << std::endl; - std::cout << std::endl; + getDeviceList(); + + // Print device names + if (devices.size() == 0) + { + std::cerr << "No devices found." 
<< std::endl; + } + else + { + std::cout << std::endl; + std::cout << "Devices:" << std::endl; + for (int i = 0; i < devices.size(); i++) + { + std::cout << i << ": " << getDeviceName(i) << std::endl; + } + std::cout << std::endl; + } } std::string getDeviceName(const int device) { - // TODO: Implement properly - return "triSYCL"; + if (!cached) + getDeviceList(); + + std::string name; + cl_device_info info = CL_DEVICE_NAME; + + if (device < devices.size()) + { + name = devices[device].get_info(); + } + else + { + throw std::runtime_error("Error asking for name for non-existant device"); + } + + return name; } std::string getDeviceDriver(const int device) { - // TODO: Implement properly - return "triSCYL"; + if (!cached) + getDeviceList(); + + std::string driver; + + if (device < devices.size()) + { + driver = devices[device].get_info(); + } + else + { + throw std::runtime_error("Error asking for driver for non-existant device"); + } + + return driver; } From 6d913bab4b16c688fbe3c3cf804af4a374810543 Mon Sep 17 00:00:00 2001 From: James Price Date: Sun, 8 May 2016 21:35:24 +0100 Subject: [PATCH 080/117] [SYCL] Actually use device_index to select device --- SYCLStream.cpp | 27 +++++++++++++++++++-------- SYCLStream.h | 2 +- 2 files changed, 20 insertions(+), 9 deletions(-) diff --git a/SYCLStream.cpp b/SYCLStream.cpp index c6947e7..0e00c53 100644 --- a/SYCLStream.cpp +++ b/SYCLStream.cpp @@ -21,12 +21,21 @@ void getDeviceList(void); template SYCLStream::SYCLStream(const unsigned int ARRAY_SIZE, const int device_index) { + if (!cached) + getDeviceList(); + array_size = ARRAY_SIZE; + if (device_index >= devices.size()) + throw std::runtime_error("Invalid device index"); + device dev = devices[device_index]; + // Print out device information std::cout << "Using SYCL device " << getDeviceName(device_index) << std::endl; std::cout << "Driver: " << getDeviceDriver(device_index) << std::endl; + queue = new cl::sycl::queue(dev); + // Create buffers d_a = new 
buffer(array_size); d_b = new buffer(array_size); @@ -39,12 +48,14 @@ SYCLStream::~SYCLStream() delete d_a; delete d_b; delete d_c; + + delete queue; } template void SYCLStream::copy() { - queue.submit([&](handler &cgh) + queue->submit([&](handler &cgh) { auto ka = d_a->template get_access(cgh); auto kc = d_c->template get_access(cgh); @@ -53,14 +64,14 @@ void SYCLStream::copy() kc[item.get_global()] = ka[item.get_global()]; }); }); - queue.wait(); + queue->wait(); } template void SYCLStream::mul() { const T scalar = 3.0; - queue.submit([&](handler &cgh) + queue->submit([&](handler &cgh) { auto kb = d_b->template get_access(cgh); auto kc = d_c->template get_access(cgh); @@ -69,13 +80,13 @@ void SYCLStream::mul() kb[item.get_global()] = scalar * kc[item.get_global()]; }); }); - queue.wait(); + queue->wait(); } template void SYCLStream::add() { - queue.submit([&](handler &cgh) + queue->submit([&](handler &cgh) { auto ka = d_a->template get_access(cgh); auto kb = d_b->template get_access(cgh); @@ -85,14 +96,14 @@ void SYCLStream::add() kc[item.get_global()] = ka[item.get_global()] + kb[item.get_global()]; }); }); - queue.wait(); + queue->wait(); } template void SYCLStream::triad() { const T scalar = 3.0; - queue.submit([&](handler &cgh) + queue->submit([&](handler &cgh) { auto ka = d_a->template get_access(cgh); auto kb = d_b->template get_access(cgh); @@ -102,7 +113,7 @@ void SYCLStream::triad() ka[item.get_global()] = kb[item.get_global()] + scalar * kc[item.get_global()]; }); }); - queue.wait(); + queue->wait(); } template diff --git a/SYCLStream.h b/SYCLStream.h index f4d79d5..4c0c681 100644 --- a/SYCLStream.h +++ b/SYCLStream.h @@ -21,7 +21,7 @@ class SYCLStream : public Stream unsigned int array_size; // SYCL objects - cl::sycl::queue queue; + cl::sycl::queue *queue; cl::sycl::buffer *d_a; cl::sycl::buffer *d_b; cl::sycl::buffer *d_c; From 084d7417b9cd961683a5c9aae608366c4421b234 Mon Sep 17 00:00:00 2001 From: James Price Date: Mon, 9 May 2016 15:20:11 +0100 
Subject: [PATCH 081/117] [SYCL] Remove unneeded cl_device_info line --- SYCLStream.cpp | 1 - 1 file changed, 1 deletion(-) diff --git a/SYCLStream.cpp b/SYCLStream.cpp index 0e00c53..e8eee42 100644 --- a/SYCLStream.cpp +++ b/SYCLStream.cpp @@ -185,7 +185,6 @@ std::string getDeviceName(const int device) getDeviceList(); std::string name; - cl_device_info info = CL_DEVICE_NAME; if (device < devices.size()) { From 5c8b07262bb8c1813903846cb4a211c37b6d3dd9 Mon Sep 17 00:00:00 2001 From: Tom Deakin Date: Wed, 11 May 2016 11:49:44 +0100 Subject: [PATCH 082/117] Default to 100 iterations to get over any warm up times --- main.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/main.cpp b/main.cpp index 007ab7f..4fb35c3 100644 --- a/main.cpp +++ b/main.cpp @@ -35,7 +35,7 @@ #endif unsigned int ARRAY_SIZE = 52428800; -unsigned int num_times = 10; +unsigned int num_times = 100; unsigned int deviceIndex = 0; bool use_float = false; From 3227e5dbf00377f6c104bd5c8e5d7af3d4820ef0 Mon Sep 17 00:00:00 2001 From: Tom Deakin Date: Wed, 11 May 2016 11:52:17 +0100 Subject: [PATCH 083/117] Print out data type for float or double --- main.cpp | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/main.cpp b/main.cpp index 4fb35c3..c31df9c 100644 --- a/main.cpp +++ b/main.cpp @@ -72,6 +72,11 @@ int main(int argc, char *argv[]) template void run() { + if (sizeof(T) == sizeof(float)) + std::cout << "Precision: float" << std::endl; + else + std::cout << "Precision: double" << std::endl; + // Create host vectors std::vector a(ARRAY_SIZE, 1.0); std::vector b(ARRAY_SIZE, 2.0); From 75ef78495c6c7d2b872a2e3ea4a60b3e58e4d796 Mon Sep 17 00:00:00 2001 From: Tom Deakin Date: Wed, 11 May 2016 11:53:51 +0100 Subject: [PATCH 084/117] Add print out of number of iterations --- main.cpp | 2 ++ 1 file changed, 2 insertions(+) diff --git a/main.cpp b/main.cpp index c31df9c..3a516de 100644 --- a/main.cpp +++ b/main.cpp @@ -72,6 +72,8 @@ int main(int argc, char *argv[]) template void run() { 
+ std::cout << "Running kernels " << num_times << " times" << std::endl; + if (sizeof(T) == sizeof(float)) std::cout << "Precision: float" << std::endl; else From 0f8f191d0e573d3f453b9d46b4c8363e5bb52659 Mon Sep 17 00:00:00 2001 From: Tom Deakin Date: Wed, 11 May 2016 11:55:33 +0100 Subject: [PATCH 085/117] Require number of iterations to be at least 2 --- main.cpp | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/main.cpp b/main.cpp index 3a516de..43cfb7c 100644 --- a/main.cpp +++ b/main.cpp @@ -290,6 +290,11 @@ void parseArguments(int argc, char *argv[]) std::cerr << "Invalid number of times." << std::endl; exit(EXIT_FAILURE); } + if (num_times < 2) + { + std::cerr << "Number of times must be 2 or more" << std::endl; + exit(EXIT_FAILURE); + } } else if (!std::string("--float").compare(argv[i])) { From 207fd8f784072e137dccd9e3ea8ac435dd9d907c Mon Sep 17 00:00:00 2001 From: Tom Deakin Date: Wed, 11 May 2016 12:04:19 +0100 Subject: [PATCH 086/117] Default to power of two array size --- main.cpp | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/main.cpp b/main.cpp index 43cfb7c..832c657 100644 --- a/main.cpp +++ b/main.cpp @@ -34,7 +34,8 @@ #include "OMP3Stream.h" #endif -unsigned int ARRAY_SIZE = 52428800; +// Default size of 2^26 +unsigned int ARRAY_SIZE = 67108864; unsigned int num_times = 100; unsigned int deviceIndex = 0; bool use_float = false; From 2462023ed9f8ecdbda1c70074a6b18337693ee51 Mon Sep 17 00:00:00 2001 From: Tom Deakin Date: Wed, 11 May 2016 12:21:29 +0100 Subject: [PATCH 087/117] Set thread block size in CUDA with a #define, and check that array size is multiple of it --- CUDAStream.cu | 19 ++++++++++++++----- CUDAStream.h | 2 +- 2 files changed, 15 insertions(+), 6 deletions(-) diff --git a/CUDAStream.cu b/CUDAStream.cu index 3c10e8d..2a28f9c 100644 --- a/CUDAStream.cu +++ b/CUDAStream.cu @@ -8,6 +8,8 @@ #include "CUDAStream.h" +#define TBSIZE 1024 + void check_error(void) { cudaError_t err = cudaGetLastError(); @@ -22,6 
+24,14 @@ template CUDAStream::CUDAStream(const unsigned int ARRAY_SIZE, const int device_index) { + // The array size must be divisible by TBSIZE for kernel launches + if (ARRAY_SIZE % TBSIZE != 0) + { + std::stringstream ss; + ss << "Array size must be a multiple of " << TBSIZE; + throw std::runtime_error(ss.str()); + } + // Set device int count; cudaGetDeviceCount(&count); @@ -99,7 +109,7 @@ __global__ void copy_kernel(const T * a, T * c) template void CUDAStream::copy() { - copy_kernel<<>>(d_a, d_c); + copy_kernel<<>>(d_a, d_c); check_error(); cudaDeviceSynchronize(); check_error(); @@ -116,7 +126,7 @@ __global__ void mul_kernel(T * b, const T * c) template void CUDAStream::mul() { - mul_kernel<<>>(d_b, d_c); + mul_kernel<<>>(d_b, d_c); check_error(); cudaDeviceSynchronize(); check_error(); @@ -132,7 +142,7 @@ __global__ void add_kernel(const T * a, const T * b, T * c) template void CUDAStream::add() { - add_kernel<<>>(d_a, d_b, d_c); + add_kernel<<>>(d_a, d_b, d_c); check_error(); cudaDeviceSynchronize(); check_error(); @@ -149,7 +159,7 @@ __global__ void triad_kernel(T * a, const T * b, const T * c) template void CUDAStream::triad() { - triad_kernel<<>>(d_a, d_b, d_c); + triad_kernel<<>>(d_a, d_b, d_c); check_error(); cudaDeviceSynchronize(); check_error(); @@ -202,4 +212,3 @@ std::string getDeviceDriver(const int device) template class CUDAStream; template class CUDAStream; - diff --git a/CUDAStream.h b/CUDAStream.h index 9c436d6..6904a86 100644 --- a/CUDAStream.h +++ b/CUDAStream.h @@ -9,6 +9,7 @@ #include #include +#include #include "Stream.h" @@ -40,4 +41,3 @@ class CUDAStream : public Stream virtual void read_arrays(std::vector& a, std::vector& b, std::vector& c) override; }; - From 81fa9e19222592d420bc5af847dd7bc440d72ffe Mon Sep 17 00:00:00 2001 From: Tom Deakin Date: Wed, 11 May 2016 12:23:21 +0100 Subject: [PATCH 088/117] Require SYCL array size to be multiple of WGSIZE --- SYCLStream.cpp | 8 ++++++++ SYCLStream.h | 2 ++ 2 files changed, 10 
insertions(+) diff --git a/SYCLStream.cpp b/SYCLStream.cpp index e8eee42..4f14590 100644 --- a/SYCLStream.cpp +++ b/SYCLStream.cpp @@ -24,6 +24,14 @@ SYCLStream::SYCLStream(const unsigned int ARRAY_SIZE, const int device_index) if (!cached) getDeviceList(); + // The array size must be divisible by WGSIZE + if (ARRAY_SIZE % WGSIZE != 0) + { + std::stringstream ss; + ss << "Array size must be a multiple of " << WGSIZE; + throw std::runtime_error(ss.str()); + } + array_size = ARRAY_SIZE; if (device_index >= devices.size()) diff --git a/SYCLStream.h b/SYCLStream.h index 4c0c681..8bc515d 100644 --- a/SYCLStream.h +++ b/SYCLStream.h @@ -7,6 +7,8 @@ #pragma once +#include + #include "Stream.h" #include "CL/sycl.hpp" From 1a9225ca9557569e01278c8cafe3c48b6fafebed Mon Sep 17 00:00:00 2001 From: Tom Deakin Date: Wed, 11 May 2016 12:54:12 +0100 Subject: [PATCH 089/117] If building CUDA on Darwin with Xcode 7.3.1 skip becuase CUDA doesn't work this version --- CMakeLists.txt | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 64bfd2e..964668e 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -12,13 +12,21 @@ set(gpu-stream_VERSION_MINOR 0) configure_file(common.h.in common.h) find_package(CUDA 7.0 QUIET) -if (${CUDA_FOUND}) +set(FLAG True) +if ("${CMAKE_SYSTEM_NAME}" MATCHES "Darwin") + execute_process(COMMAND xcodebuild -version COMMAND head -n 1 OUTPUT_VARIABLE XCODE_VERSION) + if ("${XCODE_VERSION}" MATCHES "Xcode 7.3.1") + message("Xcode version not supported by CUDA") + set(FLAG False) + endif ("${XCODE_VERSION}" MATCHES "Xcode 7.3.1") +endif ("${CMAKE_SYSTEM_NAME}" MATCHES "Darwin") +if (${FLAG} AND ${CUDA_FOUND}) list(APPEND CUDA_NVCC_FLAGS --std=c++11) cuda_add_executable(gpu-stream-cuda main.cpp CUDAStream.cu) target_compile_definitions(gpu-stream-cuda PUBLIC CUDA) -else (${CUDA_FOUND}) +else (${FLAG} AND ${CUDA_FOUND}) message("Skipping CUDA...") -endif (${CUDA_FOUND}) +endif (${FLAG} AND 
${CUDA_FOUND}) find_package(OpenCL QUIET) if (${OpenCL_FOUND}) From 8d45e61f6c421d8e8f7514cd10ed7c979d205b1a Mon Sep 17 00:00:00 2001 From: Tom Deakin Date: Wed, 11 May 2016 13:20:15 +0100 Subject: [PATCH 090/117] Check for OpenACC support by checking the various compiler flags --- CMakeLists.txt | 15 ++++++++++++--- 1 file changed, 12 insertions(+), 3 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 964668e..cd8988d 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -37,11 +37,20 @@ else (${OpenCL_FOUND}) message("Skipping OpenCL...") endif (${OpenCL_FOUND}) -# TODO: Find OpenACC implementations somehow -if (true) +# Check compiler supports an OpenACC flag +include(CheckCXXCompilerFlag) +message("${CMAKE_CXX_COMPILER_ID}") +if ("${CMAKE_CXX_COMPILER_ID}" MATCHES "GNU") + CHECK_CXX_COMPILER_FLAG(-fopenacc OPENACC) +elseif ("${CMAKE_CXX_COMPILER_ID}" MATCHES "PGI") + CHECK_CXX_COMPILER_FLAG(-acc OPENACC) +elseif ("${CMAKE_CXX_COMPILER_ID}" MATCHES "Cray") + CHECK_CXX_COMPILER_FLAG(-hacc=openacc OPENACC) +endif() +if (OPENACC) add_executable(gpu-stream-acc main.cpp ACCStream.cpp) target_compile_definitions(gpu-stream-acc PUBLIC ACC) - target_compile_options(gpu-stream-acc PUBLIC "-hstd=c++11") + #target_compile_options(gpu-stream-acc PUBLIC "-hstd=c++11") else () message("Skipping OpenACC...") endif () From e0ca56bd67ff7444da9c62ec078bce1c0d123387 Mon Sep 17 00:00:00 2001 From: Tom Deakin Date: Wed, 11 May 2016 13:33:01 +0100 Subject: [PATCH 091/117] Set the C++11 flag when using the Cray compiler --- CMakeLists.txt | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/CMakeLists.txt b/CMakeLists.txt index cd8988d..dbf8c64 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -11,6 +11,13 @@ set(gpu-stream_VERSION_MINOR 0) configure_file(common.h.in common.h) +# If using the Cray compiler, manually add the C++11 flag because setting the +# standard through CMake as above doesn't set this flag with Cray +if ("${CMAKE_CXX_COMPILER_ID}" MATCHES "Cray") 
+ list(APPEND CMAKE_CXX_FLAGS -hstd=c++11) +endif() + + find_package(CUDA 7.0 QUIET) set(FLAG True) if ("${CMAKE_SYSTEM_NAME}" MATCHES "Darwin") From 8a195b64164c5e80c4c0833cad9361ccb9cb3fed Mon Sep 17 00:00:00 2001 From: Tom Deakin Date: Wed, 11 May 2016 13:35:12 +0100 Subject: [PATCH 092/117] Remove printout of compiler id in cmake --- CMakeLists.txt | 1 - 1 file changed, 1 deletion(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index dbf8c64..a7437e5 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -46,7 +46,6 @@ endif (${OpenCL_FOUND}) # Check compiler supports an OpenACC flag include(CheckCXXCompilerFlag) -message("${CMAKE_CXX_COMPILER_ID}") if ("${CMAKE_CXX_COMPILER_ID}" MATCHES "GNU") CHECK_CXX_COMPILER_FLAG(-fopenacc OPENACC) elseif ("${CMAKE_CXX_COMPILER_ID}" MATCHES "PGI") From bf29b02d357842a8cfe15fd50533d6047ab98a8d Mon Sep 17 00:00:00 2001 From: Tom Deakin Date: Wed, 11 May 2016 13:35:24 +0100 Subject: [PATCH 093/117] Add banners in CMakeLists file so easy to spot build rules for versions --- CMakeLists.txt | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index a7437e5..c37b270 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -17,7 +17,9 @@ if ("${CMAKE_CXX_COMPILER_ID}" MATCHES "Cray") list(APPEND CMAKE_CXX_FLAGS -hstd=c++11) endif() - +#------------------------------------------------------------------------------- +# CUDA +#------------------------------------------------------------------------------- find_package(CUDA 7.0 QUIET) set(FLAG True) if ("${CMAKE_SYSTEM_NAME}" MATCHES "Darwin") @@ -35,6 +37,9 @@ else (${FLAG} AND ${CUDA_FOUND}) message("Skipping CUDA...") endif (${FLAG} AND ${CUDA_FOUND}) +#------------------------------------------------------------------------------- +# OpenCL +#------------------------------------------------------------------------------- find_package(OpenCL QUIET) if (${OpenCL_FOUND}) add_executable(gpu-stream-ocl main.cpp OCLStream.cpp) @@ -44,6 
+49,9 @@ else (${OpenCL_FOUND}) message("Skipping OpenCL...") endif (${OpenCL_FOUND}) +#------------------------------------------------------------------------------- +# OpenACC +#------------------------------------------------------------------------------- # Check compiler supports an OpenACC flag include(CheckCXXCompilerFlag) if ("${CMAKE_CXX_COMPILER_ID}" MATCHES "GNU") @@ -61,6 +69,9 @@ else () message("Skipping OpenACC...") endif () +#------------------------------------------------------------------------------- +# SYCL +#------------------------------------------------------------------------------- # TODO: Find SYCL implementations somehow if (true) # ComputeCpp From e095cb67f8a65135a58d9cfc622738951fc163df Mon Sep 17 00:00:00 2001 From: Tom Deakin Date: Wed, 11 May 2016 13:37:12 +0100 Subject: [PATCH 094/117] Remove ugly CMake endif text in parenthesis --- CMakeLists.txt | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index c37b270..7196066 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -15,7 +15,7 @@ configure_file(common.h.in common.h) # standard through CMake as above doesn't set this flag with Cray if ("${CMAKE_CXX_COMPILER_ID}" MATCHES "Cray") list(APPEND CMAKE_CXX_FLAGS -hstd=c++11) -endif() +endif () #------------------------------------------------------------------------------- # CUDA @@ -27,15 +27,15 @@ if ("${CMAKE_SYSTEM_NAME}" MATCHES "Darwin") if ("${XCODE_VERSION}" MATCHES "Xcode 7.3.1") message("Xcode version not supported by CUDA") set(FLAG False) - endif ("${XCODE_VERSION}" MATCHES "Xcode 7.3.1") -endif ("${CMAKE_SYSTEM_NAME}" MATCHES "Darwin") + endif () +endif () if (${FLAG} AND ${CUDA_FOUND}) list(APPEND CUDA_NVCC_FLAGS --std=c++11) cuda_add_executable(gpu-stream-cuda main.cpp CUDAStream.cu) target_compile_definitions(gpu-stream-cuda PUBLIC CUDA) -else (${FLAG} AND ${CUDA_FOUND}) +else () message("Skipping CUDA...") -endif (${FLAG} AND ${CUDA_FOUND}) +endif () 
#------------------------------------------------------------------------------- # OpenCL @@ -45,9 +45,9 @@ if (${OpenCL_FOUND}) add_executable(gpu-stream-ocl main.cpp OCLStream.cpp) target_compile_definitions(gpu-stream-ocl PUBLIC OCL) target_link_libraries(gpu-stream-ocl ${OpenCL_LIBRARY}) -else (${OpenCL_FOUND}) +else () message("Skipping OpenCL...") -endif (${OpenCL_FOUND}) +endif () #------------------------------------------------------------------------------- # OpenACC @@ -60,7 +60,8 @@ elseif ("${CMAKE_CXX_COMPILER_ID}" MATCHES "PGI") CHECK_CXX_COMPILER_FLAG(-acc OPENACC) elseif ("${CMAKE_CXX_COMPILER_ID}" MATCHES "Cray") CHECK_CXX_COMPILER_FLAG(-hacc=openacc OPENACC) -endif() +endif () + if (OPENACC) add_executable(gpu-stream-acc main.cpp ACCStream.cpp) target_compile_definitions(gpu-stream-acc PUBLIC ACC) From eb10c716f2b25caec7b3442ce0df98b7c8d596df Mon Sep 17 00:00:00 2001 From: Tom Deakin Date: Wed, 11 May 2016 15:08:08 +0100 Subject: [PATCH 095/117] First attempt at OpenMP 4.5 --- OMP45Stream.cpp | 135 ++++++++++++++++++++++++++++++++++++++++++++++++ OMP45Stream.h | 40 ++++++++++++++ main.cpp | 6 +++ 3 files changed, 181 insertions(+) create mode 100644 OMP45Stream.cpp create mode 100644 OMP45Stream.h diff --git a/OMP45Stream.cpp b/OMP45Stream.cpp new file mode 100644 index 0000000..779b217 --- /dev/null +++ b/OMP45Stream.cpp @@ -0,0 +1,135 @@ + +#include "OMP45Stream.h" + +template +OMP45Stream::OMP45Stream(const unsigned int ARRAY_SIZE, T *a, T *b, T *c, int device) +{ + omp_set_default_device(device); + + array_size = ARRAY_SIZE; + + // Set up data region on device + this->a = a; + this->b = b; + this->c = c; + #pragma omp target enter data map(alloc: a[0:array_size], b[0:array_size], c[0:array_size]) + {} +} + +template +OMP45Stream::~OMP45Stream() +{ + // End data region on device + unsigned int array_size = this->array_size; + T *a = this->a; + T *b = this->b; + T *c = this->c; + #pragma omp target exit data map(release: a[0:array_size], 
b[0:array_size], c[0:array_size]) + {} +} + +template +void OMP45Stream::write_arrays(const std::vector& h_a, const std::vector& h_b, const std::vector& h_c) +{ + T *a = this->a; + T *b = this->b; + T *c = this->c; + #pragma omp target update to(a[0:array_size], b[0:array_size], c[0:array_size]) + {} +} + +template +void OMP45Stream::read_arrays(std::vector& h_a, std::vector& h_b, std::vector& h_c) +{ + T *a = this->a; + T *b = this->b; + T *c = this->c; + #pragma omp target update from(a[0:array_size], b[0:array_size], c[0:array_size]) + {} +} + +template +void OMP45Stream::copy() +{ + unsigned int array_size = this->array_size; + T *a = this->a; + T *c = this->c; + #pragma omp target teams distribute parallel for simd + for (int i = 0; i < array_size; i++) + { + c[i] = a[i]; + } +} + +template +void OMP45Stream::mul() +{ + const T scalar = 3.0; + + unsigned int array_size = this->array_size; + T *b = this->b; + T *c = this->c; + #pragma omp target teams distribute parallel for simd + for (int i = 0; i < array_size; i++) + { + b[i] = scalar * c[i]; + } +} + +template +void OMP45Stream::add() +{ + unsigned int array_size = this->array_size; + T *a = this->a; + T *b = this->b; + T *c = this->c; + #pragma omp target teams distribute parallel for simd + for (int i = 0; i < array_size; i++) + { + c[i] = a[i] + b[i]; + } +} + +template +void OMP45Stream::triad() +{ + const T scalar = 3.0; + + unsigned int array_size = this->array_size; + T *a = this->a; + T *b = this->b; + T *c = this->c; + #pragma omp target teams distribute parallel for simd + for (int i = 0; i < array_size; i++) + { + a[i] = b[i] + scalar * c[i]; + } +} +void listDevices(void) +{ + // Get number of devices + int count = omp_get_num_devices(); + + // Print device list + if (count == 0) + { + std::cerr << "No devices found." << std::endl; + } + else + { + std::cout << "There are " << count << " devices." 
<< std::endl; + } +} + +std::string getDeviceName(const int) +{ + return std::string("Device name unavailable"); +} + +std::string getDeviceDriver(const int) +{ + return std::string("Device driver unavailable"); +} +template class OMP45Stream; +template class OMP45Stream; + diff --git a/OMP45Stream.h b/OMP45Stream.h new file mode 100644 index 0000000..98c9705 --- /dev/null +++ b/OMP45Stream.h @@ -0,0 +1,40 @@ + +#pragma once + +#include +#include + +#include "Stream.h" + +#include + +#define IMPLEMENTATION_STRING "OpenMP 4.5" + +template +class OMP45Stream : public Stream +{ + protected: + // Size of arrays + unsigned int array_size; + + // Device side pointers + T *a; + T *b; + T *c; + + public: + OMP45Stream(const unsigned int, T*, T*, T*, int); + ~OMP45Stream(); + + virtual void copy() override; + virtual void add() override; + virtual void mul() override; + virtual void triad() override; + + virtual void write_arrays(const std::vector& a, const std::vector& b, const std::vector& c) override; + virtual void read_arrays(std::vector& a, std::vector& b, std::vector& c) override; + + + +}; + diff --git a/main.cpp b/main.cpp index 832c657..2f0d6fc 100644 --- a/main.cpp +++ b/main.cpp @@ -32,6 +32,8 @@ #include "SYCLStream.h" #elif defined(OMP3) #include "OMP3Stream.h" +#elif defined(OMP45) +#include "OMP45Stream.h" #endif // Default size of 2^26 @@ -122,6 +124,10 @@ void run() // Use the "reference" OpenMP 3 implementation stream = new OMP3Stream(ARRAY_SIZE, a.data(), b.data(), c.data()); +#elif defined(OMP45) + // Use the "reference" OpenMP 3 implementation + stream = new OMP45Stream(ARRAY_SIZE, a.data(), b.data(), c.data(), deviceIndex); + #endif stream->write_arrays(a, b, c); From 4954ef7cf00e0f09065146b82e8bfcbef749a461 Mon Sep 17 00:00:00 2001 From: Tom Deakin Date: Wed, 11 May 2016 15:17:06 +0100 Subject: [PATCH 096/117] Add map clauses to OpenMP 4.5 kernels --- OMP45Stream.cpp | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git 
a/OMP45Stream.cpp b/OMP45Stream.cpp index 779b217..f44f66f 100644 --- a/OMP45Stream.cpp +++ b/OMP45Stream.cpp @@ -12,7 +12,7 @@ OMP45Stream::OMP45Stream(const unsigned int ARRAY_SIZE, T *a, T *b, T *c, int this->a = a; this->b = b; this->c = c; - #pragma omp target enter data map(alloc: a[0:array_size], b[0:array_size], c[0:array_size]) + #pragma omp target enter data map(to: a[0:array_size], b[0:array_size], c[0:array_size]) {} } @@ -54,7 +54,7 @@ void OMP45Stream::copy() unsigned int array_size = this->array_size; T *a = this->a; T *c = this->c; - #pragma omp target teams distribute parallel for simd + #pragma omp target teams distribute parallel for simd map(to: a[0:array_size], c[0:array_size]) for (int i = 0; i < array_size; i++) { c[i] = a[i]; @@ -69,7 +69,7 @@ void OMP45Stream::mul() unsigned int array_size = this->array_size; T *b = this->b; T *c = this->c; - #pragma omp target teams distribute parallel for simd + #pragma omp target teams distribute parallel for simd map(to: b[0:array_size], c[0:array_size]) for (int i = 0; i < array_size; i++) { b[i] = scalar * c[i]; @@ -83,7 +83,7 @@ void OMP45Stream::add() T *a = this->a; T *b = this->b; T *c = this->c; - #pragma omp target teams distribute parallel for simd + #pragma omp target teams distribute parallel for simd map(to: a[0:array_size], b[0:array_size], c[0:array_size]) for (int i = 0; i < array_size; i++) { c[i] = a[i] + b[i]; @@ -99,7 +99,7 @@ void OMP45Stream::triad() T *a = this->a; T *b = this->b; T *c = this->c; - #pragma omp target teams distribute parallel for simd + #pragma omp target teams distribute parallel for simd map(to: a[0:array_size], b[0:array_size], c[0:array_size]) for (int i = 0; i < array_size; i++) { a[i] = b[i] + scalar * c[i]; From 55a858e0c0a0bc0f30fdb222d1f8961cabc009c3 Mon Sep 17 00:00:00 2001 From: Tom Deakin Date: Wed, 11 May 2016 15:43:52 +0100 Subject: [PATCH 097/117] Use 2^25 as default size because 2^26 gives too many thread blocks for CUDA --- main.cpp | 4 ++-- 1 file 
changed, 2 insertions(+), 2 deletions(-) diff --git a/main.cpp b/main.cpp index 2f0d6fc..80791d6 100644 --- a/main.cpp +++ b/main.cpp @@ -36,8 +36,8 @@ #include "OMP45Stream.h" #endif -// Default size of 2^26 -unsigned int ARRAY_SIZE = 67108864; +// Default size of 2^25 +unsigned int ARRAY_SIZE = 33554432; unsigned int num_times = 100; unsigned int deviceIndex = 0; bool use_float = false; From 31cb567e211dc21be720d660d1792778b4231572 Mon Sep 17 00:00:00 2001 From: Tom Deakin Date: Wed, 11 May 2016 15:51:19 +0100 Subject: [PATCH 098/117] Switch data from 1.0, 2.0 and 3.0 to 0.1, 0.2, and 0.3 resp. Using integers for maths gets unstable past 38 interations even in double precision. Using the original values/10 is safe up to the default 100 iterations. --- ACCStream.cpp | 5 ++--- CUDAStream.cu | 4 ++-- KOKKOSStream.cpp | 13 ++++++------- OCLStream.cpp | 3 +-- OMP3Stream.cpp | 5 ++--- OMP45Stream.cpp | 5 ++--- RAJAStream.cpp | 13 ++++++------- SYCLStream.cpp | 4 ++-- main.cpp | 10 +++++----- 9 files changed, 28 insertions(+), 34 deletions(-) diff --git a/ACCStream.cpp b/ACCStream.cpp index 85bf600..8e3bb32 100644 --- a/ACCStream.cpp +++ b/ACCStream.cpp @@ -65,7 +65,7 @@ void ACCStream::copy() template void ACCStream::mul() { - const T scalar = 3.0; + const T scalar = 0.3; unsigned int array_size = this->array_size; T *b = this->b; @@ -94,7 +94,7 @@ void ACCStream::add() template void ACCStream::triad() { - const T scalar = 3.0; + const T scalar = 0.3; unsigned int array_size = this->array_size; T *a = this->a; @@ -133,4 +133,3 @@ std::string getDeviceDriver(const int) } template class ACCStream; template class ACCStream; - diff --git a/CUDAStream.cu b/CUDAStream.cu index 2a28f9c..21d36fa 100644 --- a/CUDAStream.cu +++ b/CUDAStream.cu @@ -118,7 +118,7 @@ void CUDAStream::copy() template __global__ void mul_kernel(T * b, const T * c) { - const T scalar = 3.0; + const T scalar = 0.3; const int i = blockDim.x * blockIdx.x + threadIdx.x; b[i] = scalar * c[i]; } @@ -151,7 
+151,7 @@ void CUDAStream::add() template __global__ void triad_kernel(T * a, const T * b, const T * c) { - const T scalar = 3.0; + const T scalar = 0.3; const int i = blockDim.x * blockIdx.x + threadIdx.x; a[i] = b[i] + scalar * c[i]; } diff --git a/KOKKOSStream.cpp b/KOKKOSStream.cpp index d93b6d7..a9bfcd9 100644 --- a/KOKKOSStream.cpp +++ b/KOKKOSStream.cpp @@ -70,7 +70,7 @@ void KOKKOSStream::copy() View b(*d_b); View c(*d_c); - parallel_for(array_size, KOKKOS_LAMBDA (const int index) + parallel_for(array_size, KOKKOS_LAMBDA (const int index) { c[index] = a[index]; }); @@ -84,8 +84,8 @@ void KOKKOSStream::mul() View b(*d_b); View c(*d_c); - const T scalar = 3.0; - parallel_for(array_size, KOKKOS_LAMBDA (const int index) + const T scalar = 0.3; + parallel_for(array_size, KOKKOS_LAMBDA (const int index) { b[index] = scalar*c[index]; }); @@ -99,7 +99,7 @@ void KOKKOSStream::add() View b(*d_b); View c(*d_c); - parallel_for(array_size, KOKKOS_LAMBDA (const int index) + parallel_for(array_size, KOKKOS_LAMBDA (const int index) { c[index] = a[index] + b[index]; }); @@ -114,8 +114,8 @@ void KOKKOSStream::triad() View b(*d_b); View c(*d_c); - const T scalar = 3.0; - parallel_for(array_size, KOKKOS_LAMBDA (const int index) + const T scalar = 0.3; + parallel_for(array_size, KOKKOS_LAMBDA (const int index) { a[index] = b[index] + scalar*c[index]; }); @@ -142,4 +142,3 @@ std::string getDeviceDriver(const int device) //template class KOKKOSStream; template class KOKKOSStream; - diff --git a/OCLStream.cpp b/OCLStream.cpp index f7c538e..50ad543 100644 --- a/OCLStream.cpp +++ b/OCLStream.cpp @@ -14,7 +14,7 @@ void getDeviceList(void); std::string kernels{R"CLC( - constant TYPE scalar = 3.0; + constant TYPE scalar = 0.3; kernel void copy( global const TYPE * restrict a, @@ -253,4 +253,3 @@ std::string getDeviceDriver(const int device) template class OCLStream; template class OCLStream; - diff --git a/OMP3Stream.cpp b/OMP3Stream.cpp index 8899cff..78b0124 100644 --- 
a/OMP3Stream.cpp +++ b/OMP3Stream.cpp @@ -56,7 +56,7 @@ void OMP3Stream::copy() template void OMP3Stream::mul() { - const T scalar = 3.0; + const T scalar = 0.3; #pragma omp parallel for for (int i = 0; i < array_size; i++) { @@ -77,7 +77,7 @@ void OMP3Stream::add() template void OMP3Stream::triad() { - const T scalar = 3.0; + const T scalar = 0.3; #pragma omp parallel for for (int i = 0; i < array_size; i++) { @@ -103,4 +103,3 @@ std::string getDeviceDriver(const int) template class OMP3Stream; template class OMP3Stream; - diff --git a/OMP45Stream.cpp b/OMP45Stream.cpp index f44f66f..7f6e66a 100644 --- a/OMP45Stream.cpp +++ b/OMP45Stream.cpp @@ -64,7 +64,7 @@ void OMP45Stream::copy() template void OMP45Stream::mul() { - const T scalar = 3.0; + const T scalar = 0.3; unsigned int array_size = this->array_size; T *b = this->b; @@ -93,7 +93,7 @@ void OMP45Stream::add() template void OMP45Stream::triad() { - const T scalar = 3.0; + const T scalar = 0.3; unsigned int array_size = this->array_size; T *a = this->a; @@ -132,4 +132,3 @@ std::string getDeviceDriver(const int) } template class OMP45Stream; template class OMP45Stream; - diff --git a/RAJAStream.cpp b/RAJAStream.cpp index eb98d54..e418f09 100644 --- a/RAJAStream.cpp +++ b/RAJAStream.cpp @@ -66,7 +66,7 @@ void RAJAStream::copy() { T* a = d_a; T* c = d_c; - forall(index_set, [=] RAJA_DEVICE (int index) + forall(index_set, [=] RAJA_DEVICE (int index) { c[index] = a[index]; }); @@ -77,8 +77,8 @@ void RAJAStream::mul() { T* b = d_b; T* c = d_c; - const T scalar = 3.0; - forall(index_set, [=] RAJA_DEVICE (int index) + const T scalar = 0.3; + forall(index_set, [=] RAJA_DEVICE (int index) { b[index] = scalar*c[index]; }); @@ -90,7 +90,7 @@ void RAJAStream::add() T* a = d_a; T* b = d_b; T* c = d_c; - forall(index_set, [=] RAJA_DEVICE (int index) + forall(index_set, [=] RAJA_DEVICE (int index) { c[index] = a[index] + b[index]; }); @@ -102,8 +102,8 @@ void RAJAStream::triad() T* a = d_a; T* b = d_b; T* c = d_c; - const T 
scalar = 3.0; - forall(index_set, [=] RAJA_DEVICE (int index) + const T scalar = 0.3; + forall(index_set, [=] RAJA_DEVICE (int index) { a[index] = b[index] + scalar*c[index]; }); @@ -128,4 +128,3 @@ std::string getDeviceDriver(const int device) template class RAJAStream; template class RAJAStream; - diff --git a/SYCLStream.cpp b/SYCLStream.cpp index 4f14590..d039d70 100644 --- a/SYCLStream.cpp +++ b/SYCLStream.cpp @@ -78,7 +78,7 @@ void SYCLStream::copy() template void SYCLStream::mul() { - const T scalar = 3.0; + const T scalar = 0.3; queue->submit([&](handler &cgh) { auto kb = d_b->template get_access(cgh); @@ -110,7 +110,7 @@ void SYCLStream::add() template void SYCLStream::triad() { - const T scalar = 3.0; + const T scalar = 0.3; queue->submit([&](handler &cgh) { auto ka = d_a->template get_access(cgh); diff --git a/main.cpp b/main.cpp index 80791d6..933c9a9 100644 --- a/main.cpp +++ b/main.cpp @@ -83,8 +83,8 @@ void run() std::cout << "Precision: double" << std::endl; // Create host vectors - std::vector a(ARRAY_SIZE, 1.0); - std::vector b(ARRAY_SIZE, 2.0); + std::vector a(ARRAY_SIZE, 0.1); + std::vector b(ARRAY_SIZE, 0.2); std::vector c(ARRAY_SIZE, 0.0); std::streamsize ss = std::cout.precision(); std::cout << std::setprecision(1) << std::fixed @@ -216,11 +216,11 @@ template void check_solution(const unsigned int ntimes, std::vector& a, std::vector& b, std::vector& c) { // Generate correct solution - T goldA = 1.0; - T goldB = 2.0; + T goldA = 0.1; + T goldB = 0.2; T goldC = 0.0; - const T scalar = 3.0; + const T scalar = 0.3; for (unsigned int i = 0; i < ntimes; i++) { From eae8da57ac1216c620c66658ec8e13db4b888ed1 Mon Sep 17 00:00:00 2001 From: Tom Deakin Date: Wed, 11 May 2016 15:57:20 +0100 Subject: [PATCH 099/117] Delete commented out C++ flag for OpenACC as no longer needed --- CMakeLists.txt | 1 - 1 file changed, 1 deletion(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 7196066..0441b4a 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -65,7 
+65,6 @@ endif () if (OPENACC) add_executable(gpu-stream-acc main.cpp ACCStream.cpp) target_compile_definitions(gpu-stream-acc PUBLIC ACC) - #target_compile_options(gpu-stream-acc PUBLIC "-hstd=c++11") else () message("Skipping OpenACC...") endif () From 9b2a586e08fd5087d5d84091de76ea0d468f9259 Mon Sep 17 00:00:00 2001 From: Tom Deakin Date: Wed, 11 May 2016 15:57:39 +0100 Subject: [PATCH 100/117] Add rule to build OMP4.5 on Cray --- CMakeLists.txt | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/CMakeLists.txt b/CMakeLists.txt index 0441b4a..c375072 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -69,6 +69,22 @@ else () message("Skipping OpenACC...") endif () +#------------------------------------------------------------------------------- +# OpenMP 3.0 +#------------------------------------------------------------------------------- + +# TODO + +#------------------------------------------------------------------------------- +# OpenMP 4.5 +#------------------------------------------------------------------------------- +if ("${CMAKE_CXX_COMPILER_ID}" MATCHES "Cray") + if (CMAKE_CXX_COMPILER_VERSION VERSION_GREATER 4.5) + add_executable(gpu-stream-omp45 main.cpp OMP45Stream.cpp) + target_compile_definitions(gpu-stream-omp45 PUBLIC OMP45) + endif () +endif () + #------------------------------------------------------------------------------- # SYCL #------------------------------------------------------------------------------- From 494e89d16bb0b5d42e8cbcce9bab18e34ff28644 Mon Sep 17 00:00:00 2001 From: Tom Deakin Date: Wed, 11 May 2016 16:02:34 +0100 Subject: [PATCH 101/117] Add placeholder banners for CMake build systems to fix --- CMakeLists.txt | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/CMakeLists.txt b/CMakeLists.txt index c375072..43328d0 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -85,6 +85,18 @@ if ("${CMAKE_CXX_COMPILER_ID}" MATCHES "Cray") endif () endif () 
+#------------------------------------------------------------------------------- +# RAJA +#------------------------------------------------------------------------------- + +# TODO + +#------------------------------------------------------------------------------- +# Kokkos +#------------------------------------------------------------------------------- + +# TODO + #------------------------------------------------------------------------------- # SYCL #------------------------------------------------------------------------------- From 9449e0888637ba7e14262d1e11038f4cf4b81f16 Mon Sep 17 00:00:00 2001 From: Tom Deakin Date: Wed, 11 May 2016 16:23:14 +0100 Subject: [PATCH 102/117] update readme --- README.md | 24 ++++++++++++++++++++---- 1 file changed, 20 insertions(+), 4 deletions(-) diff --git a/README.md b/README.md index 5a84b50..fee2ef6 100644 --- a/README.md +++ b/README.md @@ -6,15 +6,30 @@ This benchmark is similar in spirit, and based on, the STREAM benchmark [1] for Unlike other GPU memory bandwidth benchmarks this does *not* include the PCIe transfer time. +There are multiple implementations of this benchmark in a variety of programming models. +Currently implemented are: + - OpenCL + - CUDA + - OpenACC + - OpenMP 3 and 4.5 + - Kokkos + - RAJA + - SYCL + Usage ----- -Build the OpenCL and CUDA binaries with `make` (CUDA version requires CUDA >= v6.5) +CMake 3.2 or above is required. +Drivers, compiler and software applicable to whichever implementation you would like to build against. Our build system is designed to only build implementations in programming models that your system supports. -Run the OpenCL version with `./gpu-stream-ocl` and the CUDA version with `./gpu-stream-cuda` +Generate the Makefile with `cmake .` -Android -------- +Build the various binaries with `make` + +This will generate binaries of the form `./gpu-stream-*` where `*` identifies the programming model. 
+ +Android (outdated instructions) +------------------ Assuming you have a recent Android NDK available, you can use the toolchain that it provides to build GPU-STREAM. You should first @@ -48,6 +63,7 @@ Run GPU-STREAM from an adb shell: # Use float if device doesn't support double, and reduce array size ./gpu-stream-ocl --float -n 6 -s 10000000 + Results ------- From d420032c662d8580b81be81d247b732c7366aa80 Mon Sep 17 00:00:00 2001 From: Tom Deakin Date: Wed, 11 May 2016 17:15:43 +0100 Subject: [PATCH 103/117] Remove warning about iteration count when using floats as new data values work for 100 iterations --- main.cpp | 1 - 1 file changed, 1 deletion(-) diff --git a/main.cpp b/main.cpp index 933c9a9..5379bb9 100644 --- a/main.cpp +++ b/main.cpp @@ -306,7 +306,6 @@ void parseArguments(int argc, char *argv[]) else if (!std::string("--float").compare(argv[i])) { use_float = true; - std::cout << "Warning: If number of iterations set >= 8, expect rounding errors with single precision" << std::endl; } else if (!std::string("--help").compare(argv[i]) || !std::string("-h").compare(argv[i])) From bf9c6fb6cd373b7f6233deba5e6fea90bf543df3 Mon Sep 17 00:00:00 2001 From: Tom Deakin Date: Wed, 11 May 2016 17:21:52 +0100 Subject: [PATCH 104/117] Add -fopenacc flag on linking with GCC compiler --- CMakeLists.txt | 3 +++ 1 file changed, 3 insertions(+) diff --git a/CMakeLists.txt b/CMakeLists.txt index 43328d0..ed048c5 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -56,6 +56,9 @@ endif () include(CheckCXXCompilerFlag) if ("${CMAKE_CXX_COMPILER_ID}" MATCHES "GNU") CHECK_CXX_COMPILER_FLAG(-fopenacc OPENACC) + if (OPENACC) + list (APPEND CMAKE_EXE_LINKER_FLAGS -fopenacc) + endif () elseif ("${CMAKE_CXX_COMPILER_ID}" MATCHES "PGI") CHECK_CXX_COMPILER_FLAG(-acc OPENACC) elseif ("${CMAKE_CXX_COMPILER_ID}" MATCHES "Cray") From 5638cbb2837d234419e2eae0c71f1d2ad1c2d486 Mon Sep 17 00:00:00 2001 From: Tom Deakin Date: Wed, 11 May 2016 17:49:48 +0100 Subject: [PATCH 105/117] Check for 
OpenMP support and build OMP3 version --- CMakeLists.txt | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index ed048c5..4bcd688 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -75,8 +75,13 @@ endif () #------------------------------------------------------------------------------- # OpenMP 3.0 #------------------------------------------------------------------------------- - -# TODO +find_package(OpenMP QUIET) +if (${OpenMP_FOUND}) + add_executable(gpu-stream-omp3 main.cpp OMP3Stream.cpp) + target_compile_definitions(gpu-stream-omp3 PUBLIC OMP3) +else () + message("Skipping OpenMP 3...") +endif () #------------------------------------------------------------------------------- # OpenMP 4.5 From d4e74a88e9b3c870256d7d1b0c513ceb92286252 Mon Sep 17 00:00:00 2001 From: Tom Deakin Date: Wed, 11 May 2016 17:53:33 +0100 Subject: [PATCH 106/117] Add binary names to gitignore --- .gitignore | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/.gitignore b/.gitignore index 0fa7243..4d2865c 100644 --- a/.gitignore +++ b/.gitignore @@ -3,6 +3,10 @@ common.h gpu-stream-cuda gpu-stream-ocl +gpu-stream-acc +gpu-stream-omp3 +gpu-stream-omp45 +gpu-stream-sycl *.o *.tar From 7cd14f480d675d0b53dd38cf88eea3de7961ee49 Mon Sep 17 00:00:00 2001 From: James Price Date: Wed, 11 May 2016 22:00:04 +0100 Subject: [PATCH 107/117] [SYCL] Auto-detect presence of CL/sycl.hpp and ComputeCpp --- CMakeLists.txt | 36 +++++++++++++++++++++--------------- 1 file changed, 21 insertions(+), 15 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 4bcd688..34847ba 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -6,6 +6,8 @@ set(CMAKE_CXX_STANDARD_REQUIRED ON) project(gpu-stream) +include(CheckIncludeFileCXX) + set(gpu-stream_VERSION_MAJOR 2) set(gpu-stream_VERSION_MINOR 0) @@ -108,20 +110,24 @@ endif () #------------------------------------------------------------------------------- # SYCL 
#------------------------------------------------------------------------------- -# TODO: Find SYCL implementations somehow -if (true) - # ComputeCpp - # TODO: Sort this out properly! - add_custom_target(gpu-stream-sycl - COMMAND compute++ ${CMAKE_CURRENT_SOURCE_DIR}/SYCLStream.cpp -sycl -no-serial-memop -O2 -emit-llvm -o SYCLStream.bc -c - COMMAND /usr/bin/c++ ${CMAKE_CURRENT_SOURCE_DIR}/SYCLStream.cpp -O2 -std=c++11 -include SYCLStream.sycl -o SYCLStream.o -c - COMMAND /usr/bin/c++ ${CMAKE_CURRENT_SOURCE_DIR}/main.cpp -O2 -std=c++11 SYCLStream.o -include SYCLStream.sycl -lSYCL -lOpenCL -o gpu-stream-sycl -DSYCL - ) - - # triSYCL - #add_executable(gpu-stream-sycl main.cpp SYCLStream.cpp) - #target_compile_definitions(gpu-stream-sycl PUBLIC SYCL) - #set_property(TARGET gpu-stream-sycl PROPERTY CXX_STANDARD 14) +set(CMAKE_REQUIRED_FLAGS "-std=c++11") +check_include_file_cxx("CL/sycl.hpp" HAS_SYCL) +if (HAS_SYCL) + find_program(COMPUTECPP "compute++") + if (COMPUTECPP) + message(STATUS "Using ComputeCpp for SYCL compilation") + add_custom_target(SYCLStream.sycl COMMAND ${COMPUTECPP} ${CMAKE_CURRENT_SOURCE_DIR}/SYCLStream.cpp -sycl -no-serial-memop -O2 -emit-llvm -c) + add_executable(gpu-stream-sycl main.cpp SYCLStream.cpp) + add_dependencies(gpu-stream-sycl SYCLStream.sycl) + target_compile_definitions(gpu-stream-sycl PUBLIC SYCL) + target_compile_options(gpu-stream-sycl PUBLIC -include SYCLStream.sycl) + target_link_libraries(gpu-stream-sycl SYCL OpenCL) + else() + message(STATUS "Using header-only SYCL implementation") + add_executable(gpu-stream-sycl main.cpp SYCLStream.cpp) + target_compile_definitions(gpu-stream-sycl PUBLIC SYCL) + set_property(TARGET gpu-stream-sycl PROPERTY CXX_STANDARD 14) + endif(COMPUTECPP) else () message("Skipping SYCL...") -endif () +endif (HAS_SYCL) From 3ebad06bd4ef95530b10ea47c0b26f9c43aeceba Mon Sep 17 00:00:00 2001 From: James Price Date: Wed, 11 May 2016 22:22:20 +0100 Subject: [PATCH 108/117] [SYCL] Fix detection of CL/sycl.hpp 
for C++14 versions --- CMakeLists.txt | 20 +++++++++++++++----- 1 file changed, 15 insertions(+), 5 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 34847ba..8d10c40 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -7,6 +7,7 @@ set(CMAKE_CXX_STANDARD_REQUIRED ON) project(gpu-stream) include(CheckIncludeFileCXX) +include(CheckCXXCompilerFlag) set(gpu-stream_VERSION_MAJOR 2) set(gpu-stream_VERSION_MINOR 0) @@ -110,22 +111,31 @@ endif () #------------------------------------------------------------------------------- # SYCL #------------------------------------------------------------------------------- -set(CMAKE_REQUIRED_FLAGS "-std=c++11") +if ("${CMAKE_CXX_COMPILER_ID}" MATCHES "Clang" OR + "${CMAKE_CXX_COMPILER_ID}" MATCHES "GNU") + # Use C++14 if available, otherwise drop back to C++11 + check_cxx_compiler_flag("-std=c++14" CXX14) + if (CXX14) + set(CMAKE_REQUIRED_FLAGS "-std=c++14") + else() + set(CMAKE_REQUIRED_FLAGS "-std=c++11") + endif() +endif() + check_include_file_cxx("CL/sycl.hpp" HAS_SYCL) if (HAS_SYCL) + add_executable(gpu-stream-sycl main.cpp SYCLStream.cpp) + target_compile_definitions(gpu-stream-sycl PUBLIC SYCL) + find_program(COMPUTECPP "compute++") if (COMPUTECPP) message(STATUS "Using ComputeCpp for SYCL compilation") add_custom_target(SYCLStream.sycl COMMAND ${COMPUTECPP} ${CMAKE_CURRENT_SOURCE_DIR}/SYCLStream.cpp -sycl -no-serial-memop -O2 -emit-llvm -c) - add_executable(gpu-stream-sycl main.cpp SYCLStream.cpp) add_dependencies(gpu-stream-sycl SYCLStream.sycl) - target_compile_definitions(gpu-stream-sycl PUBLIC SYCL) target_compile_options(gpu-stream-sycl PUBLIC -include SYCLStream.sycl) target_link_libraries(gpu-stream-sycl SYCL OpenCL) else() message(STATUS "Using header-only SYCL implementation") - add_executable(gpu-stream-sycl main.cpp SYCLStream.cpp) - target_compile_definitions(gpu-stream-sycl PUBLIC SYCL) set_property(TARGET gpu-stream-sycl PROPERTY CXX_STANDARD 14) endif(COMPUTECPP) else () From 
f6fca3ac065b0416417829a4ed48db1235bd7b8b Mon Sep 17 00:00:00 2001 From: Tom Deakin Date: Thu, 12 May 2016 12:30:06 +0100 Subject: [PATCH 109/117] Add Kokkos building to CMake config --- CMakeLists.txt | 11 +++++++++-- KokkosMakefile | 10 ++++++++++ 2 files changed, 19 insertions(+), 2 deletions(-) create mode 100644 KokkosMakefile diff --git a/CMakeLists.txt b/CMakeLists.txt index 8d10c40..8501add 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -105,8 +105,15 @@ endif () #------------------------------------------------------------------------------- # Kokkos #------------------------------------------------------------------------------- - -# TODO +if (BUILD_KOKKOS) + if ("${CMAKE_SYSTEM_NAME}" MATCHES "Linux") + add_custom_target(gpu-stream-kokkos COMMAND make -f KokkosMakefile) + else() + message("Skipping Kokkos (requires Linux)") + endif() +else() + message("Skipping Kokkos... (use -DBUILD_KOKKOS=1 to opt in)") +endif() #------------------------------------------------------------------------------- # SYCL diff --git a/KokkosMakefile b/KokkosMakefile new file mode 100644 index 0000000..2d08bc9 --- /dev/null +++ b/KokkosMakefile @@ -0,0 +1,10 @@ + +default: gpu-stream-kokkos + +include /modules/modules/kokkos/Makefile.kokkos + +gpu-stream-kokkos: main.o KOKKOSStream.o + $(CXX) $(KOKKOS_LDFLAGS) $^ $(KOKKOS_LIBS) -o $@ -DKOKKOS + +%.o:%.cpp $(KOKKOS_CPP_DEPENDS) + $(NVCC_WRAPPER) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) -c $< -DKOKKOS From 88d194b75cb7634556b49728a68912be02bdfae7 Mon Sep 17 00:00:00 2001 From: Tom Deakin Date: Thu, 12 May 2016 12:30:35 +0100 Subject: [PATCH 110/117] Use a variable to get Kokkos Path --- KokkosMakefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/KokkosMakefile b/KokkosMakefile index 2d08bc9..6e71e67 100644 --- a/KokkosMakefile +++ b/KokkosMakefile @@ -1,7 +1,7 @@ default: gpu-stream-kokkos -include /modules/modules/kokkos/Makefile.kokkos +include $(KOKKOS_PATH)/Makefile.kokkos gpu-stream-kokkos: main.o 
KOKKOSStream.o $(CXX) $(KOKKOS_LDFLAGS) $^ $(KOKKOS_LIBS) -o $@ -DKOKKOS From 2381f059edcf855cf099231e82763dce938b6e52 Mon Sep 17 00:00:00 2001 From: Tom Deakin Date: Thu, 12 May 2016 12:31:16 +0100 Subject: [PATCH 111/117] Set KOKKOS_PATH to build Kokkos version --- CMakeLists.txt | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 8501add..1b46398 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -105,14 +105,14 @@ endif () #------------------------------------------------------------------------------- # Kokkos #------------------------------------------------------------------------------- -if (BUILD_KOKKOS) +if (KOKKOS_PATH) if ("${CMAKE_SYSTEM_NAME}" MATCHES "Linux") add_custom_target(gpu-stream-kokkos COMMAND make -f KokkosMakefile) else() message("Skipping Kokkos (requires Linux)") endif() else() - message("Skipping Kokkos... (use -DBUILD_KOKKOS=1 to opt in)") + message("Skipping Kokkos... (use -DKOKKOS_PATH=/path/to/kokkos to opt in)") endif() #------------------------------------------------------------------------------- From d75084b753521c11c69b264973c729f84cc9999c Mon Sep 17 00:00:00 2001 From: Tom Deakin Date: Thu, 12 May 2016 12:35:47 +0100 Subject: [PATCH 112/117] Fix Kokkos CMake so it works.. 
--- CMakeLists.txt | 2 +- KokkosMakefile | 5 +++-- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 1b46398..515b594 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -107,7 +107,7 @@ endif () #------------------------------------------------------------------------------- if (KOKKOS_PATH) if ("${CMAKE_SYSTEM_NAME}" MATCHES "Linux") - add_custom_target(gpu-stream-kokkos COMMAND make -f KokkosMakefile) + add_custom_target(gpu-stream-kokkos COMMAND make -f KokkosMakefile KOKKOS_PATH=${KOKKOS_PATH}) else() message("Skipping Kokkos (requires Linux)") endif() diff --git a/KokkosMakefile b/KokkosMakefile index 6e71e67..83e00b9 100644 --- a/KokkosMakefile +++ b/KokkosMakefile @@ -4,7 +4,8 @@ default: gpu-stream-kokkos include $(KOKKOS_PATH)/Makefile.kokkos gpu-stream-kokkos: main.o KOKKOSStream.o - $(CXX) $(KOKKOS_LDFLAGS) $^ $(KOKKOS_LIBS) -o $@ -DKOKKOS + $(CXX) $(KOKKOS_LDFLAGS) $^ $(KOKKOS_LIBS) -o $@ -DKOKKOS %.o:%.cpp $(KOKKOS_CPP_DEPENDS) - $(NVCC_WRAPPER) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) -c $< -DKOKKOS + $(NVCC_WRAPPER) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) -c $< -DKOKKOS + From 942188d836c5e5e8cb914689d2dd32ca52e00e8f Mon Sep 17 00:00:00 2001 From: Tom Deakin Date: Thu, 12 May 2016 12:53:26 +0100 Subject: [PATCH 113/117] Add copyright header to source with it missing --- ACCStream.cpp | 6 ++++++ ACCStream.h | 9 +++++++-- OMP3Stream.cpp | 6 ++++++ OMP3Stream.h | 7 ++++++- OMP45Stream.cpp | 6 ++++++ OMP45Stream.h | 9 +++++++-- 6 files changed, 38 insertions(+), 5 deletions(-) diff --git a/ACCStream.cpp b/ACCStream.cpp index 8e3bb32..48e042e 100644 --- a/ACCStream.cpp +++ b/ACCStream.cpp @@ -1,4 +1,10 @@ +// Copyright (c) 2015-16 Tom Deakin, Simon McIntosh-Smith, +// University of Bristol HPC +// +// For full license terms please see the LICENSE file distributed with this +// source code + #include "ACCStream.h" template diff --git a/ACCStream.h b/ACCStream.h index d6e5728..48fea55 100644 --- a/ACCStream.h 
+++ b/ACCStream.h @@ -1,4 +1,10 @@ +// Copyright (c) 2015-16 Tom Deakin, Simon McIntosh-Smith, +// University of Bristol HPC +// +// For full license terms please see the LICENSE file distributed with this +// source code + #pragma once #include @@ -34,6 +40,5 @@ class ACCStream : public Stream virtual void read_arrays(std::vector& a, std::vector& b, std::vector& c) override; - -}; +}; diff --git a/OMP3Stream.cpp b/OMP3Stream.cpp index 78b0124..fe8323a 100644 --- a/OMP3Stream.cpp +++ b/OMP3Stream.cpp @@ -1,4 +1,10 @@ +// Copyright (c) 2015-16 Tom Deakin, Simon McIntosh-Smith, +// University of Bristol HPC +// +// For full license terms please see the LICENSE file distributed with this +// source code + #include "OMP3Stream.h" template diff --git a/OMP3Stream.h b/OMP3Stream.h index 15172c3..0f14300 100644 --- a/OMP3Stream.h +++ b/OMP3Stream.h @@ -1,4 +1,10 @@ +// Copyright (c) 2015-16 Tom Deakin, Simon McIntosh-Smith, +// University of Bristol HPC +// +// For full license terms please see the LICENSE file distributed with this +// source code + #pragma once #include @@ -32,4 +38,3 @@ class OMP3Stream : public Stream virtual void read_arrays(std::vector& a, std::vector& b, std::vector& c) override; }; - diff --git a/OMP45Stream.cpp b/OMP45Stream.cpp index 7f6e66a..f849c39 100644 --- a/OMP45Stream.cpp +++ b/OMP45Stream.cpp @@ -1,4 +1,10 @@ +// Copyright (c) 2015-16 Tom Deakin, Simon McIntosh-Smith, +// University of Bristol HPC +// +// For full license terms please see the LICENSE file distributed with this +// source code + #include "OMP45Stream.h" template diff --git a/OMP45Stream.h b/OMP45Stream.h index 98c9705..bd812a1 100644 --- a/OMP45Stream.h +++ b/OMP45Stream.h @@ -1,4 +1,10 @@ +// Copyright (c) 2015-16 Tom Deakin, Simon McIntosh-Smith, +// University of Bristol HPC +// +// For full license terms please see the LICENSE file distributed with this +// source code + #pragma once #include @@ -35,6 +41,5 @@ class OMP45Stream : public Stream virtual void 
read_arrays(std::vector& a, std::vector& b, std::vector& c) override; - -}; +}; From 2001ab5fb1a253ad1cb0c2f483b393de0e45b2f6 Mon Sep 17 00:00:00 2001 From: Tom Deakin Date: Thu, 12 May 2016 15:40:22 +0100 Subject: [PATCH 114/117] Build against a RAJA installation in the CMake build system --- CMakeLists.txt | 16 ++++++++++++++-- 1 file changed, 14 insertions(+), 2 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 515b594..1b2d28d 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -99,8 +99,20 @@ endif () #------------------------------------------------------------------------------- # RAJA #------------------------------------------------------------------------------- - -# TODO +if (RAJA_PATH) + find_package(OpenMP) + find_package(CUDA 7.5) + list(APPEND CUDA_NVCC_FLAGS "-arch compute_35") + list(APPEND CUDA_NVCC_FLAGS --expt-extended-lambda) + list(APPEND CUDA_NVCC_FLAGS -Xcompiler ${OpenMP_CXX_FLAGS}) + list(APPEND CUDA_NVCC_FLAGS -DUSE_RAJA) + cuda_include_directories(${RAJA_PATH}/include) + set_source_files_properties(main.cpp RAJAStream.cpp PROPERTIES CUDA_SOURCE_PROPERTY_FORMAT OBJ) + cuda_add_executable(gpu-stream-raja main.cpp RAJAStream.cpp) + target_link_libraries(gpu-stream-raja "-L${RAJA_PATH}/lib -lRAJA") +else() + message("Skipping RAJA... (use -DRAJA_PATH=/path/to/raja to opt in)") +endif() #------------------------------------------------------------------------------- # Kokkos From 2033f3f0c6012fe1445935374ba67a67e2092c7c Mon Sep 17 00:00:00 2001 From: Tom Deakin Date: Thu, 12 May 2016 16:37:06 +0100 Subject: [PATCH 115/117] Break RAJA build but fix all the others.. 
--- CMakeLists.txt | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 1b2d28d..acaa2a1 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -99,6 +99,7 @@ endif () #------------------------------------------------------------------------------- # RAJA #------------------------------------------------------------------------------- + if (RAJA_PATH) find_package(OpenMP) find_package(CUDA 7.5) @@ -107,8 +108,9 @@ if (RAJA_PATH) list(APPEND CUDA_NVCC_FLAGS -Xcompiler ${OpenMP_CXX_FLAGS}) list(APPEND CUDA_NVCC_FLAGS -DUSE_RAJA) cuda_include_directories(${RAJA_PATH}/include) - set_source_files_properties(main.cpp RAJAStream.cpp PROPERTIES CUDA_SOURCE_PROPERTY_FORMAT OBJ) + set_source_files_properties(RAJAStream.cpp PROPERTIES CUDA_SOURCE_PROPERTY_FORMAT OBJ) cuda_add_executable(gpu-stream-raja main.cpp RAJAStream.cpp) + target_compile_definitions(gpu-stream-raja PUBLIC USE_RAJA) target_link_libraries(gpu-stream-raja "-L${RAJA_PATH}/lib -lRAJA") else() message("Skipping RAJA... 
(use -DRAJA_PATH=/path/to/raja to opt in)") From 8be4b26bb18cbbba770e17f3814f071e6a5f5e94 Mon Sep 17 00:00:00 2001 From: Tom Deakin Date: Thu, 30 Jun 2016 16:24:32 +0100 Subject: [PATCH 116/117] Add restrict to extra pointers in OpenACC --- ACCStream.cpp | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/ACCStream.cpp b/ACCStream.cpp index 48e042e..d3fbd6a 100644 --- a/ACCStream.cpp +++ b/ACCStream.cpp @@ -59,8 +59,8 @@ template void ACCStream::copy() { unsigned int array_size = this->array_size; - T *a = this->a; - T *c = this->c; + T * restrict a = this->a; + T * restrict c = this->c; #pragma acc kernels present(a[0:array_size], c[0:array_size]) wait for (int i = 0; i < array_size; i++) { @@ -74,8 +74,8 @@ void ACCStream::mul() const T scalar = 0.3; unsigned int array_size = this->array_size; - T *b = this->b; - T *c = this->c; + T * restrict b = this->b; + T * restrict c = this->c; #pragma acc kernels present(b[0:array_size], c[0:array_size]) wait for (int i = 0; i < array_size; i++) { @@ -87,9 +87,9 @@ template void ACCStream::add() { unsigned int array_size = this->array_size; - T *a = this->a; - T *b = this->b; - T *c = this->c; + T * restrict a = this->a; + T * restrict b = this->b; + T * restrict c = this->c; #pragma acc kernels present(a[0:array_size], b[0:array_size], c[0:array_size]) wait for (int i = 0; i < array_size; i++) { @@ -103,9 +103,9 @@ void ACCStream::triad() const T scalar = 0.3; unsigned int array_size = this->array_size; - T *a = this->a; - T *b = this->b; - T *c = this->c; + T * restrict a = this->a; + T * restrict b = this->b; + T * restrict c = this->c; #pragma acc kernels present(a[0:array_size], b[0:array_size], c[0:array_size]) wait for (int i = 0; i < array_size; i++) { From 04f321db8581a2b29a1d5766c70ed98b5011e164 Mon Sep 17 00:00:00 2001 From: Tom Deakin Date: Thu, 30 Jun 2016 16:32:52 +0100 Subject: [PATCH 117/117] Replace cudaDeviceSyncronise with Kokkos::fence() function --- KOKKOSStream.cpp 
| 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/KOKKOSStream.cpp b/KOKKOSStream.cpp index a9bfcd9..d73f7d5 100644 --- a/KOKKOSStream.cpp +++ b/KOKKOSStream.cpp @@ -74,7 +74,7 @@ void KOKKOSStream::copy() { c[index] = a[index]; }); - cudaDeviceSynchronize(); + Kokkos::fence(); } template @@ -89,7 +89,7 @@ void KOKKOSStream::mul() { b[index] = scalar*c[index]; }); - cudaDeviceSynchronize(); + Kokkos::fence(); } template @@ -103,8 +103,7 @@ void KOKKOSStream::add() { c[index] = a[index] + b[index]; }); - - cudaDeviceSynchronize(); + Kokkos::fence(); } template @@ -119,8 +118,7 @@ void KOKKOSStream::triad() { a[index] = b[index] + scalar*c[index]; }); - - cudaDeviceSynchronize(); + Kokkos::fence(); } void listDevices(void)