Merge branch 'refactor'

2016-06-30 16:51:40 +01:00 · 2016-06-30 16:51:40 +01:00 · eda9d56ed9
commit eda9d56ed9
parent b53fb1b3b2 04f321db85
34 changed files with 2469 additions and 2023 deletions
--- a/.gitignore
+++ b/.gitignore
@ -1,8 +1,20 @@

+common.h
+
 gpu-stream-cuda
 gpu-stream-ocl
+gpu-stream-acc
+gpu-stream-omp3
+gpu-stream-omp45
+gpu-stream-sycl
+
 *.o
-
 *.tar
-
 *.gz
+
+.DS_Store
+
+CMakeCache.txt
+CMakeFiles/
+cmake_install.cmake
+Makefile
--- a/ACCStream.cpp
+++ b/ACCStream.cpp
@ -0,0 +1,141 @@
+
+// Copyright (c) 2015-16 Tom Deakin, Simon McIntosh-Smith,
+// University of Bristol HPC
+//
+// For full license terms please see the LICENSE file distributed with this
+// source code
+
+#include "ACCStream.h"
+
+template <class T>
+ACCStream<T>::ACCStream(const unsigned int ARRAY_SIZE, T *a, T *b, T *c, int device)
+{
+
+  acc_set_device_num(device, acc_device_nvidia);
+
+  array_size = ARRAY_SIZE;
+
+  // Set up data region on device
+  this->a = a;
+  this->b = b;
+  this->c = c;
+  #pragma acc enter data create(a[0:array_size], b[0:array_size], c[0:array_size])
+  {}
+}
+
+template <class T>
+ACCStream<T>::~ACCStream()
+{
+  // End data region on device
+  unsigned int array_size = this->array_size;
+  T *a = this->a;
+  T *b = this->b;
+  T *c = this->c;
+  #pragma acc exit data delete(a[0:array_size], b[0:array_size], c[0:array_size])
+  {}
+}
+
+template <class T>
+void ACCStream<T>::write_arrays(const std::vector<T>& h_a, const std::vector<T>& h_b, const std::vector<T>& h_c)
+{
+  T *a = this->a;
+  T *b = this->b;
+  T *c = this->c;
+  #pragma acc update device(a[0:array_size], b[0:array_size], c[0:array_size])
+  {}
+}
+
+template <class T>
+void ACCStream<T>::read_arrays(std::vector<T>& h_a, std::vector<T>& h_b, std::vector<T>& h_c)
+{
+  T *a = this->a;
+  T *b = this->b;
+  T *c = this->c;
+  #pragma acc update host(a[0:array_size], b[0:array_size], c[0:array_size])
+  {}
+}
+
+template <class T>
+void ACCStream<T>::copy()
+{
+  unsigned int array_size = this->array_size;
+  T * restrict a = this->a;
+  T * restrict c = this->c;
+  #pragma acc kernels present(a[0:array_size], c[0:array_size]) wait
+  for (int i = 0; i < array_size; i++)
+  {
+    c[i] = a[i];
+  }
+}
+
+template <class T>
+void ACCStream<T>::mul()
+{
+  const T scalar = 0.3;
+
+  unsigned int array_size = this->array_size;
+  T * restrict b = this->b;
+  T * restrict c = this->c;
+  #pragma acc kernels present(b[0:array_size], c[0:array_size]) wait
+  for (int i = 0; i < array_size; i++)
+  {
+    b[i] = scalar * c[i];
+  }
+}
+
+template <class T>
+void ACCStream<T>::add()
+{
+  unsigned int array_size = this->array_size;
+  T * restrict a = this->a;
+  T * restrict b = this->b;
+  T * restrict c = this->c;
+  #pragma acc kernels present(a[0:array_size], b[0:array_size], c[0:array_size]) wait
+  for (int i = 0; i < array_size; i++)
+  {
+    c[i] = a[i] + b[i];
+  }
+}
+
+template <class T>
+void ACCStream<T>::triad()
+{
+  const T scalar = 0.3;
+
+  unsigned int array_size = this->array_size;
+  T * restrict a = this->a;
+  T * restrict b = this->b;
+  T * restrict c = this->c;
+  #pragma acc kernels present(a[0:array_size], b[0:array_size], c[0:array_size]) wait
+  for (int i = 0; i < array_size; i++)
+  {
+    a[i] = b[i] + scalar * c[i];
+  }
+}
+void listDevices(void)
+{
+  // Get number of devices
+  int count = acc_get_num_devices(acc_device_nvidia);
+
+  // Print device list
+  if (count == 0)
+  {
+    std::cerr << "No devices found." << std::endl;
+  }
+  else
+  {
+    std::cout << "There are " << count << " devices." << std::endl;
+  }
+}
+
+std::string getDeviceName(const int)
+{
+  return std::string("Device name unavailable");
+}
+
+std::string getDeviceDriver(const int)
+{
+  return std::string("Device driver unavailable");
+}
+template class ACCStream<float>;
+template class ACCStream<double>;
--- a/ACCStream.h
+++ b/ACCStream.h
@ -0,0 +1,44 @@
+
+// Copyright (c) 2015-16 Tom Deakin, Simon McIntosh-Smith,
+// University of Bristol HPC
+//
+// For full license terms please see the LICENSE file distributed with this
+// source code
+
+#pragma once
+
+#include <iostream>
+#include <stdexcept>
+
+#include "Stream.h"
+
+#include <openacc.h>
+
+#define IMPLEMENTATION_STRING "OpenACC"
+
+template <class T>
+class ACCStream : public Stream<T>
+{
+  protected:
+    // Size of arrays
+    unsigned int array_size;
+    // Device side pointers
+    T *a;
+    T *b;
+    T *c;
+
+  public:
+    ACCStream(const unsigned int, T*, T*, T*, int);
+    ~ACCStream();
+
+    virtual void copy() override;
+    virtual void add() override;
+    virtual void mul() override;
+    virtual void triad() override;
+
+    virtual void write_arrays(const std::vector<T>& a, const std::vector<T>& b, const std::vector<T>& c) override;
+    virtual void read_arrays(std::vector<T>& a, std::vector<T>& b, std::vector<T>& c) override;
+
+
+
+};
--- a/CL/cl2.hpp
+++ b/CL/cl2.hpp
@ -52,6 +52,18 @@
 *         #define CL_HPP_USE_DX_INTEROP
 *         cl_khr_sub_groups
 *         #define CL_HPP_USE_CL_SUB_GROUPS_KHR
+ *
+ *   Doxygen documentation for this header is available here:
+ *
+ *       http://khronosgroup.github.io/OpenCL-CLHPP/
+ *
+ *   The latest version of this header can be found on the GitHub releases page:
+ *
+ *       https://github.com/KhronosGroup/OpenCL-CLHPP/releases
+ *
+ *   Bugs and patches can be submitted to the GitHub repository:
+ *
+ *       https://github.com/KhronosGroup/OpenCL-CLHPP
 */

 /*! \mainpage
@ -134,41 +146,64 @@
 *
 * \section parameterization Parameters
 * This header may be parameterized by a set of preprocessor macros.
- * CL_HPP_TARGET_OPENCL_VERSION
- *  - Defines the target OpenCL runtime version to build the header against.
- *    Defaults to 200, representing OpenCL 2.0.
- * CL_HPP_NO_STD_STRING
- *  - Do not use the standard library string class.
- *    cl::string is not defined and may be defined by the user before 
- *    cl2.hpp is included.
- * CL_HPP_NO_STD_VECTOR
- *  - Do not use the standard library vector class.
- *    cl::vector is not defined and may be defined by the user before
- *    cl2.hpp is included. 
- * CL_HPP_NO_STD_ARRAY
- *  - Do not use the standard library array class.
- *    cl::array is not defined and may be defined by the user before
- *    cl2.hpp is included.
- * CL_HPP_NO_STD_UNIQUE_PTR
- *  - Do not use the standard library unique_ptr class.
- *    cl::pointer and the cl::allocate_pointer function are not defined 
- *    and may be defined by the user before cl2.hpp is included.
- * CL_HPP_ENABLE_DEVICE_FISSION 
- *  - Enables device fission for OpenCL 1.2 platforms
- * CL_HPP_ENABLE_EXCEPTIONS
- *  - Enable exceptions for use in the C++ bindings header.
- *    This is the preferred error handling mechanism but is not required.
- * CL_HPP_ENABLE_SIZE_T_COMPATIBILITY
- *  - Backward compatibility option to support cl.hpp-style size_t class.
- *    Replaces the updated std::array derived version and removal of size_t
- *    from the namespace. Note that in this case the new size_t class
- *    is placed in the cl::compatibility namespace and thus requires
- *    an additional using declaration for direct backward compatibility.
- * CL_HPP_ENABLE_PROGRAM_CONSTRUCTION_FROM_ARRAY_COMPATIBILITY 
- *  - Enable older vector of pairs interface for construction of programs.
- * CL_HPP_CL_1_2_DEFAULT_BUILD
- *  - Default to OpenCL C 1.2 compilation rather than OpenCL C 2.0
- *  - applies to use of cl::Program construction and other program build variants.
+ *
+ * - CL_HPP_TARGET_OPENCL_VERSION
+ *
+ *   Defines the target OpenCL runtime version to build the header
+ *   against. Defaults to 200, representing OpenCL 2.0.
+ *
+ * - CL_HPP_NO_STD_STRING
+ *
+ *   Do not use the standard library string class. cl::string is not
+ *   defined and may be defined by the user before cl2.hpp is
+ *   included.
+ *
+ * - CL_HPP_NO_STD_VECTOR
+ *
+ *   Do not use the standard library vector class. cl::vector is not
+ *   defined and may be defined by the user before cl2.hpp is
+ *   included.
+ *
+ * - CL_HPP_NO_STD_ARRAY
+ *
+ *   Do not use the standard library array class. cl::array is not
+ *   defined and may be defined by the user before cl2.hpp is
+ *   included.
+ *
+ * - CL_HPP_NO_STD_UNIQUE_PTR
+ *
+ *   Do not use the standard library unique_ptr class. cl::pointer and
+ *   the cl::allocate_pointer functions are not defined and may be
+ *   defined by the user before cl2.hpp is included.
+ *
+ * - CL_HPP_ENABLE_DEVICE_FISSION
+ *
+ *   Enables device fission for OpenCL 1.2 platforms.
+ *
+ * - CL_HPP_ENABLE_EXCEPTIONS
+ *
+ *   Enable exceptions for use in the C++ bindings header. This is the
+ *   preferred error handling mechanism but is not required.
+ *
+ * - CL_HPP_ENABLE_SIZE_T_COMPATIBILITY
+ *
+ *   Backward compatibility option to support cl.hpp-style size_t
+ *   class.  Replaces the updated std::array derived version and
+ *   removal of size_t from the namespace. Note that in this case the
+ *   new size_t class is placed in the cl::compatibility namespace and
+ *   thus requires an additional using declaration for direct backward
+ *   compatibility.
+ *
+ * - CL_HPP_ENABLE_PROGRAM_CONSTRUCTION_FROM_ARRAY_COMPATIBILITY
+ *
+ *   Enable older vector of pairs interface for construction of
+ *   programs.
+ *
+ * - CL_HPP_CL_1_2_DEFAULT_BUILD
+ *
+ *   Default to OpenCL C 1.2 compilation rather than OpenCL C 2.0
+ *   applies to use of cl::Program construction and other program
+ *   build variants.
 *
 *
 * \section example Example
@ -213,34 +248,44 @@
            return -1;
        }

-        std::string kernel1{
-            "global int globalA;"
-            "kernel void updateGlobal(){"
-            "  globalA = 75;"
-            "}"};
-        std::string kernel2{
-            "typedef struct { global int *bar; } Foo; kernel void vectorAdd(global const Foo* aNum, global const int *inputA, global const int *inputB, global int *output, int val, write_only pipe int outPipe, queue_t childQueue){"
-            "  output[get_global_id(0)] = inputA[get_global_id(0)] + inputB[get_global_id(0)] + val + *(aNum->bar);"
-            "  write_pipe(outPipe, &val);"
-            "  queue_t default_queue = get_default_queue(); "
-            "  ndrange_t ndrange = ndrange_1D(get_global_size(0)/2, get_global_size(0)/2); "
+        // Use C++11 raw string literals for kernel source code
+        std::string kernel1{R"CLC(
+            global int globalA;
+            kernel void updateGlobal()
+            {
+              globalA = 75;
+            }
+        )CLC"};
+        std::string kernel2{R"CLC(
+            typedef struct { global int *bar; } Foo;
+            kernel void vectorAdd(global const Foo* aNum, global const int *inputA, global const int *inputB,
+                                  global int *output, int val, write_only pipe int outPipe, queue_t childQueue)
+            {
+              output[get_global_id(0)] = inputA[get_global_id(0)] + inputB[get_global_id(0)] + val + *(aNum->bar);
+              write_pipe(outPipe, &val);
+              queue_t default_queue = get_default_queue();
+              ndrange_t ndrange = ndrange_1D(get_global_size(0)/2, get_global_size(0)/2);
+
              // Have a child kernel write into third quarter of output
-            "  enqueue_kernel(default_queue, CLK_ENQUEUE_FLAGS_WAIT_KERNEL, ndrange, "
-            "    ^{"
-            "      output[get_global_size(0)*2 + get_global_id(0)] = inputA[get_global_size(0)*2+get_global_id(0)] + inputB[get_global_size(0)*2+get_global_id(0)] + globalA;"
-            "    });"
+              enqueue_kernel(default_queue, CLK_ENQUEUE_FLAGS_WAIT_KERNEL, ndrange,
+                ^{
+                    output[get_global_size(0)*2 + get_global_id(0)] =
+                      inputA[get_global_size(0)*2 + get_global_id(0)] + inputB[get_global_size(0)*2 + get_global_id(0)] + globalA;
+                });
+
              // Have a child kernel write into last quarter of output
-            "  enqueue_kernel(childQueue, CLK_ENQUEUE_FLAGS_WAIT_KERNEL, ndrange, "
-            "    ^{"
-            "      output[get_global_size(0)*3 + get_global_id(0)] = inputA[get_global_size(0)*3 + get_global_id(0)] + inputB[get_global_size(0)*3 + get_global_id(0)] + globalA + 2;"
-            "    });"
-            "}" };
+              enqueue_kernel(childQueue, CLK_ENQUEUE_FLAGS_WAIT_KERNEL, ndrange,
+                ^{
+                    output[get_global_size(0)*3 + get_global_id(0)] =
+                      inputA[get_global_size(0)*3 + get_global_id(0)] + inputB[get_global_size(0)*3 + get_global_id(0)] + globalA + 2;
+                });
+            }
+        )CLC"};

        // New simpler string interface style
        std::vector<std::string> programStrings {kernel1, kernel2};

-        cl::Program vectorAddProgram(
-            programStrings);
+        cl::Program vectorAddProgram(programStrings);
        try {
            vectorAddProgram.build("-cl-std=CL2.0");
        }
@ -268,9 +313,9 @@
        //////////////////
        // SVM allocations

-        cl::pointer<int> anSVMInt = cl::allocate_svm<int, cl::SVMTraitCoarse<>>();
+        auto anSVMInt = cl::allocate_svm<int, cl::SVMTraitCoarse<>>();
        *anSVMInt = 5;
-        cl::SVMAllocator<int, cl::SVMTraitCoarse<cl::SVMTraitReadOnly<>>> svmAllocReadOnly;
+        cl::SVMAllocator<Foo, cl::SVMTraitCoarse<cl::SVMTraitReadOnly<>>> svmAllocReadOnly;
        auto fooPointer = cl::allocate_pointer<Foo>(svmAllocReadOnly);
        fooPointer->bar = anSVMInt.get();
        cl::SVMAllocator<int, cl::SVMTraitCoarse<>> svmAlloc;
@ -3538,7 +3583,7 @@ template <class T, class Alloc, class... Args>
 cl::pointer<T, detail::Deleter<Alloc>> allocate_pointer(const Alloc &alloc_, Args&&... args)
 {
    Alloc alloc(alloc_);
-    static const size_t copies = 1;
+    static const size_type copies = 1;

    // Ensure that creation of the management block and the
    // object are dealt with separately such that we only provide a deleter
@ -6520,7 +6565,7 @@ inline cl_int cl::Program::getInfo(cl_program_info name, vector<vector<unsigned

        // Resize the parameter array and constituent arrays
        param->resize(numBinaries);
-        for (int i = 0; i < numBinaries; ++i) {
+        for (size_type i = 0; i < numBinaries; ++i) {
            (*param)[i].resize(sizes[i]);
        }

@ -7107,7 +7152,7 @@ public:
        size_type buffer_slice_pitch,
        size_type host_row_pitch,
        size_type host_slice_pitch,
-        void *ptr,
+        const void *ptr,
        const vector<Event>* events = NULL,
        Event* event = NULL) const
    {
@ -7255,7 +7300,7 @@ public:
        const array<size_type, 3>& region,
        size_type row_pitch,
        size_type slice_pitch,
-        void* ptr,
+        const void* ptr,
        const vector<Event>* events = NULL,
        Event* event = NULL) const
    {
@ -7845,7 +7890,7 @@ public:
    CL_EXT_PREFIX__VERSION_1_2_DEPRECATED cl_int enqueueTask(
        const Kernel& kernel,
        const vector<Event>* events = NULL,
-        Event* event = NULL) CL_EXT_SUFFIX__VERSION_1_2_DEPRECATED const
+        Event* event = NULL) const CL_EXT_SUFFIX__VERSION_1_2_DEPRECATED
    {
        cl_event tmp;
        cl_int err = detail::errHandler(
@ -8873,7 +8918,7 @@ inline cl_int enqueueWriteBufferRect(
    size_type buffer_slice_pitch,
    size_type host_row_pitch,
    size_type host_slice_pitch,
-    void *ptr,
+    const void *ptr,
    const vector<Event>* events = NULL,
    Event* event = NULL)
 {
@ -8971,7 +9016,7 @@ inline cl_int enqueueWriteImage(
    const array<size_type, 3>& region,
    size_type row_pitch,
    size_type slice_pitch,
-    void* ptr,
+    const void* ptr,
    const vector<Event>* events = NULL,
    Event* event = NULL)
 {
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@ -0,0 +1,164 @@
+
+cmake_minimum_required(VERSION 3.2)
+
+set(CMAKE_CXX_STANDARD 11)
+set(CMAKE_CXX_STANDARD_REQUIRED ON)
+
+project(gpu-stream)
+
+include(CheckIncludeFileCXX)
+include(CheckCXXCompilerFlag)
+
+set(gpu-stream_VERSION_MAJOR 2)
+set(gpu-stream_VERSION_MINOR 0)
+
+configure_file(common.h.in common.h)
+
+# If using the Cray compiler, manually add the C++11 flag because setting the
+# standard through CMake as above doesn't set this flag with Cray
+if ("${CMAKE_CXX_COMPILER_ID}" MATCHES "Cray")
+  list(APPEND CMAKE_CXX_FLAGS -hstd=c++11)
+endif ()
+
+#-------------------------------------------------------------------------------
+# CUDA
+#-------------------------------------------------------------------------------
+find_package(CUDA 7.0 QUIET)
+set(FLAG True)
+if ("${CMAKE_SYSTEM_NAME}" MATCHES "Darwin")
+  execute_process(COMMAND xcodebuild -version COMMAND head -n 1 OUTPUT_VARIABLE XCODE_VERSION)
+  if ("${XCODE_VERSION}" MATCHES "Xcode 7.3.1")
+    message("Xcode version not supported by CUDA")
+    set(FLAG False)
+  endif ()
+endif ()
+if (${FLAG} AND ${CUDA_FOUND})
+  list(APPEND CUDA_NVCC_FLAGS --std=c++11)
+  cuda_add_executable(gpu-stream-cuda main.cpp CUDAStream.cu)
+  target_compile_definitions(gpu-stream-cuda PUBLIC CUDA)
+else ()
+  message("Skipping CUDA...")
+endif ()
+
+#-------------------------------------------------------------------------------
+# OpenCL
+#-------------------------------------------------------------------------------
+find_package(OpenCL QUIET)
+if (${OpenCL_FOUND})
+  add_executable(gpu-stream-ocl main.cpp OCLStream.cpp)
+  target_compile_definitions(gpu-stream-ocl PUBLIC OCL)
+  target_link_libraries(gpu-stream-ocl ${OpenCL_LIBRARY})
+else ()
+  message("Skipping OpenCL...")
+endif ()
+
+#-------------------------------------------------------------------------------
+# OpenACC
+#-------------------------------------------------------------------------------
+# Check compiler supports an OpenACC flag
+include(CheckCXXCompilerFlag)
+if ("${CMAKE_CXX_COMPILER_ID}" MATCHES "GNU")
+  CHECK_CXX_COMPILER_FLAG(-fopenacc OPENACC)
+  if (OPENACC)
+    list (APPEND CMAKE_EXE_LINKER_FLAGS -fopenacc)
+  endif ()
+elseif ("${CMAKE_CXX_COMPILER_ID}" MATCHES "PGI")
+  CHECK_CXX_COMPILER_FLAG(-acc OPENACC)
+elseif ("${CMAKE_CXX_COMPILER_ID}" MATCHES "Cray")
+  CHECK_CXX_COMPILER_FLAG(-hacc=openacc OPENACC)
+endif ()
+
+if (OPENACC)
+  add_executable(gpu-stream-acc main.cpp ACCStream.cpp)
+  target_compile_definitions(gpu-stream-acc PUBLIC ACC)
+else ()
+  message("Skipping OpenACC...")
+endif ()
+
+#-------------------------------------------------------------------------------
+# OpenMP 3.0
+#-------------------------------------------------------------------------------
+find_package(OpenMP QUIET)
+if (${OpenMP_FOUND})
+  add_executable(gpu-stream-omp3 main.cpp OMP3Stream.cpp)
+  target_compile_definitions(gpu-stream-omp3 PUBLIC OMP3)
+else ()
+  message("Skipping OpenMP 3...")
+endif ()
+
+#-------------------------------------------------------------------------------
+# OpenMP 4.5
+#-------------------------------------------------------------------------------
+if ("${CMAKE_CXX_COMPILER_ID}" MATCHES "Cray")
+  if (CMAKE_CXX_COMPILER_VERSION VERSION_GREATER 4.5)
+    add_executable(gpu-stream-omp45 main.cpp OMP45Stream.cpp)
+    target_compile_definitions(gpu-stream-omp45 PUBLIC OMP45)
+  endif ()
+endif ()
+
+#-------------------------------------------------------------------------------
+# RAJA
+#-------------------------------------------------------------------------------
+
+if (RAJA_PATH)
+  find_package(OpenMP)
+  find_package(CUDA 7.5)
+  list(APPEND CUDA_NVCC_FLAGS "-arch compute_35")
+  list(APPEND CUDA_NVCC_FLAGS --expt-extended-lambda)
+  list(APPEND CUDA_NVCC_FLAGS -Xcompiler ${OpenMP_CXX_FLAGS})
+  list(APPEND CUDA_NVCC_FLAGS -DUSE_RAJA)
+  cuda_include_directories(${RAJA_PATH}/include)
+  set_source_files_properties(RAJAStream.cpp PROPERTIES CUDA_SOURCE_PROPERTY_FORMAT OBJ)
+  cuda_add_executable(gpu-stream-raja main.cpp RAJAStream.cpp)
+  target_compile_definitions(gpu-stream-raja PUBLIC USE_RAJA)
+  target_link_libraries(gpu-stream-raja "-L${RAJA_PATH}/lib -lRAJA")
+else()
+  message("Skipping RAJA... (use -DRAJA_PATH=/path/to/raja to opt in)")
+endif()
+
+#-------------------------------------------------------------------------------
+# Kokkos
+#-------------------------------------------------------------------------------
+if (KOKKOS_PATH)
+  if ("${CMAKE_SYSTEM_NAME}" MATCHES "Linux")
+    add_custom_target(gpu-stream-kokkos COMMAND make -f KokkosMakefile KOKKOS_PATH=${KOKKOS_PATH})
+  else()
+    message("Skipping Kokkos (requires Linux)")
+  endif()
+else()
+  message("Skipping Kokkos... (use -DKOKKOS_PATH=/path/to/kokkos to opt in)")
+endif()
+
+#-------------------------------------------------------------------------------
+# SYCL
+#-------------------------------------------------------------------------------
+if ("${CMAKE_CXX_COMPILER_ID}" MATCHES "Clang" OR
+    "${CMAKE_CXX_COMPILER_ID}" MATCHES "GNU")
+  # Use C++14 if available, otherwise drop back to C++11
+  check_cxx_compiler_flag("-std=c++14" CXX14)
+  if (CXX14)
+    set(CMAKE_REQUIRED_FLAGS "-std=c++14")
+  else()
+    set(CMAKE_REQUIRED_FLAGS "-std=c++11")
+  endif()
+endif()
+
+check_include_file_cxx("CL/sycl.hpp" HAS_SYCL)
+if (HAS_SYCL)
+  add_executable(gpu-stream-sycl main.cpp SYCLStream.cpp)
+  target_compile_definitions(gpu-stream-sycl PUBLIC SYCL)
+
+  find_program(COMPUTECPP "compute++")
+  if (COMPUTECPP)
+    message(STATUS "Using ComputeCpp for SYCL compilation")
+    add_custom_target(SYCLStream.sycl COMMAND ${COMPUTECPP} ${CMAKE_CURRENT_SOURCE_DIR}/SYCLStream.cpp -sycl -no-serial-memop -O2 -emit-llvm -c)
+    add_dependencies(gpu-stream-sycl SYCLStream.sycl)
+    target_compile_options(gpu-stream-sycl PUBLIC -include SYCLStream.sycl)
+    target_link_libraries(gpu-stream-sycl SYCL OpenCL)
+  else()
+    message(STATUS "Using header-only SYCL implementation")
+    set_property(TARGET gpu-stream-sycl PROPERTY CXX_STANDARD 14)
+  endif(COMPUTECPP)
+else ()
+  message("Skipping SYCL...")
+endif (HAS_SYCL)
--- a/CUDAStream.cu
+++ b/CUDAStream.cu
@ -0,0 +1,214 @@
+
+// Copyright (c) 2015-16 Tom Deakin, Simon McIntosh-Smith,
+// University of Bristol HPC
+//
+// For full license terms please see the LICENSE file distributed with this
+// source code
+
+
+#include "CUDAStream.h"
+
+#define TBSIZE 1024
+
+void check_error(void)
+{
+  cudaError_t err = cudaGetLastError();
+  if (err != cudaSuccess)
+  {
+    std::cerr << "Error: " << cudaGetErrorString(err) << std::endl;
+    exit(err);
+  }
+}
+
+template <class T>
+CUDAStream<T>::CUDAStream(const unsigned int ARRAY_SIZE, const int device_index)
+{
+
+  // The array size must be divisible by TBSIZE for kernel launches
+  if (ARRAY_SIZE % TBSIZE != 0)
+  {
+    std::stringstream ss;
+    ss << "Array size must be a multiple of " << TBSIZE;
+    throw std::runtime_error(ss.str());
+  }
+
+  // Set device
+  int count;
+  cudaGetDeviceCount(&count);
+  check_error();
+  if (device_index >= count)
+    throw std::runtime_error("Invalid device index");
+  cudaSetDevice(device_index);
+  check_error();
+
+  // Print out device information
+  std::cout << "Using OpenCL device " << getDeviceName(device_index) << std::endl;
+  std::cout << "Driver: " << getDeviceDriver(device_index) << std::endl;
+
+  array_size = ARRAY_SIZE;
+
+  // Check buffers fit on the device
+  cudaDeviceProp props;
+  cudaGetDeviceProperties(&props, 0);
+  if (props.totalGlobalMem < 3*ARRAY_SIZE*sizeof(T))
+    throw std::runtime_error("Device does not have enough memory for all 3 buffers");
+
+  // Create device buffers
+  cudaMalloc(&d_a, ARRAY_SIZE*sizeof(T));
+  check_error();
+  cudaMalloc(&d_b, ARRAY_SIZE*sizeof(T));
+  check_error();
+  cudaMalloc(&d_c, ARRAY_SIZE*sizeof(T));
+  check_error();
+}
+
+
+template <class T>
+CUDAStream<T>::~CUDAStream()
+{
+  cudaFree(d_a);
+  check_error();
+  cudaFree(d_b);
+  check_error();
+  cudaFree(d_c);
+  check_error();
+}
+
+template <class T>
+void CUDAStream<T>::write_arrays(const std::vector<T>& a, const std::vector<T>& b, const std::vector<T>& c)
+{
+  // Copy host memory to device
+  cudaMemcpy(d_a, a.data(), a.size()*sizeof(T), cudaMemcpyHostToDevice);
+  check_error();
+  cudaMemcpy(d_b, b.data(), b.size()*sizeof(T), cudaMemcpyHostToDevice);
+  check_error();
+  cudaMemcpy(d_c, c.data(), c.size()*sizeof(T), cudaMemcpyHostToDevice);
+  check_error();
+}
+
+template <class T>
+void CUDAStream<T>::read_arrays(std::vector<T>& a, std::vector<T>& b, std::vector<T>& c)
+{
+  // Copy device memory to host
+  cudaMemcpy(a.data(), d_a, a.size()*sizeof(T), cudaMemcpyDeviceToHost);
+  check_error();
+  cudaMemcpy(b.data(), d_b, b.size()*sizeof(T), cudaMemcpyDeviceToHost);
+  check_error();
+  cudaMemcpy(c.data(), d_c, c.size()*sizeof(T), cudaMemcpyDeviceToHost);
+  check_error();
+}
+
+
+template <typename T>
+__global__ void copy_kernel(const T * a, T * c)
+{
+  const int i = blockDim.x * blockIdx.x + threadIdx.x;
+  c[i] = a[i];
+}
+
+template <class T>
+void CUDAStream<T>::copy()
+{
+  copy_kernel<<<array_size/TBSIZE, TBSIZE>>>(d_a, d_c);
+  check_error();
+  cudaDeviceSynchronize();
+  check_error();
+}
+
+template <typename T>
+__global__ void mul_kernel(T * b, const T * c)
+{
+  const T scalar = 0.3;
+  const int i = blockDim.x * blockIdx.x + threadIdx.x;
+  b[i] = scalar * c[i];
+}
+
+template <class T>
+void CUDAStream<T>::mul()
+{
+  mul_kernel<<<array_size/TBSIZE, TBSIZE>>>(d_b, d_c);
+  check_error();
+  cudaDeviceSynchronize();
+  check_error();
+}
+
+template <typename T>
+__global__ void add_kernel(const T * a, const T * b, T * c)
+{
+  const int i = blockDim.x * blockIdx.x + threadIdx.x;
+  c[i] = a[i] + b[i];
+}
+
+template <class T>
+void CUDAStream<T>::add()
+{
+  add_kernel<<<array_size/TBSIZE, TBSIZE>>>(d_a, d_b, d_c);
+  check_error();
+  cudaDeviceSynchronize();
+  check_error();
+}
+
+template <typename T>
+__global__ void triad_kernel(T * a, const T * b, const T * c)
+{
+  const T scalar = 0.3;
+  const int i = blockDim.x * blockIdx.x + threadIdx.x;
+  a[i] = b[i] + scalar * c[i];
+}
+
+template <class T>
+void CUDAStream<T>::triad()
+{
+  triad_kernel<<<array_size/TBSIZE, TBSIZE>>>(d_a, d_b, d_c);
+  check_error();
+  cudaDeviceSynchronize();
+  check_error();
+}
+
+
+void listDevices(void)
+{
+  // Get number of devices
+  int count;
+  cudaGetDeviceCount(&count);
+  check_error();
+
+  // Print device names
+  if (count == 0)
+  {
+    std::cerr << "No devices found." << std::endl;
+  }
+  else
+  {
+    std::cout << std::endl;
+    std::cout << "Devices:" << std::endl;
+    for (int i = 0; i < count; i++)
+    {
+      std::cout << i << ": " << getDeviceName(i) << std::endl;
+    }
+    std::cout << std::endl;
+  }
+}
+
+
+std::string getDeviceName(const int device)
+{
+  cudaDeviceProp props;
+  cudaGetDeviceProperties(&props, device);
+  check_error();
+  return std::string(props.name);
+}
+
+
+std::string getDeviceDriver(const int device)
+{
+  cudaSetDevice(device);
+  check_error();
+  int driver;
+  cudaDriverGetVersion(&driver);
+  check_error();
+  return std::to_string(driver);
+}
+
+template class CUDAStream<float>;
+template class CUDAStream<double>;
--- a/CUDAStream.h
+++ b/CUDAStream.h
@ -0,0 +1,43 @@
+
+// Copyright (c) 2015-16 Tom Deakin, Simon McIntosh-Smith,
+// University of Bristol HPC
+//
+// For full license terms please see the LICENSE file distributed with this
+// source code
+
+#pragma once
+
+#include <iostream>
+#include <stdexcept>
+#include <sstream>
+
+#include "Stream.h"
+
+#define IMPLEMENTATION_STRING "CUDA"
+
+template <class T>
+class CUDAStream : public Stream<T>
+{
+  protected:
+    // Size of arrays
+    unsigned int array_size;
+    // Device side pointers to arrays
+    T *d_a;
+    T *d_b;
+    T *d_c;
+
+
+  public:
+
+    CUDAStream(const unsigned int, const int);
+    ~CUDAStream();
+
+    virtual void copy() override;
+    virtual void add() override;
+    virtual void mul() override;
+    virtual void triad() override;
+
+    virtual void write_arrays(const std::vector<T>& a, const std::vector<T>& b, const std::vector<T>& c) override;
+    virtual void read_arrays(std::vector<T>& a, std::vector<T>& b, std::vector<T>& c) override;
+
+};
--- a/KOKKOSStream.cpp
+++ b/KOKKOSStream.cpp
@ -0,0 +1,142 @@
+// Copyright (c) 2015-16 Tom Deakin, Simon McIntosh-Smith,
+// University of Bristol HPC
+//
+// For full license terms please see the LICENSE file distributed with this
+// source code
+
+
+#include "KOKKOSStream.hpp"
+
+using namespace Kokkos;
+
+template <class T>
+KOKKOSStream<T>::KOKKOSStream(
+        const unsigned int ARRAY_SIZE, const int device_index)
+    : array_size(ARRAY_SIZE)
+{
+  Kokkos::initialize();
+
+  d_a = new View<double*, DEVICE>("d_a", ARRAY_SIZE);
+  d_b = new View<double*, DEVICE>("d_b", ARRAY_SIZE);
+  d_c = new View<double*, DEVICE>("d_c", ARRAY_SIZE);
+  hm_a = new View<double*, DEVICE>::HostMirror();
+  hm_b = new View<double*, DEVICE>::HostMirror();
+  hm_c = new View<double*, DEVICE>::HostMirror();
+  *hm_a = create_mirror_view(*d_a);
+  *hm_b = create_mirror_view(*d_b);
+  *hm_c = create_mirror_view(*d_c);
+}
+
+template <class T>
+KOKKOSStream<T>::~KOKKOSStream()
+{
+  finalize();
+}
+
+template <class T>
+void KOKKOSStream<T>::write_arrays(
+        const std::vector<T>& a, const std::vector<T>& b, const std::vector<T>& c)
+{
+  for(int ii = 0; ii < array_size; ++ii)
+  {
+    (*hm_a)(ii) = a[ii];
+    (*hm_b)(ii) = b[ii];
+    (*hm_c)(ii) = c[ii];
+  }
+  deep_copy(*d_a, *hm_a);
+  deep_copy(*d_b, *hm_b);
+  deep_copy(*d_c, *hm_c);
+}
+
+template <class T>
+void KOKKOSStream<T>::read_arrays(
+        std::vector<T>& a, std::vector<T>& b, std::vector<T>& c)
+{
+  deep_copy(*hm_a, *d_a);
+  deep_copy(*hm_b, *d_b);
+  deep_copy(*hm_c, *d_c);
+  for(int ii = 0; ii < array_size; ++ii)
+  {
+    a[ii] = (*hm_a)(ii);
+    b[ii] = (*hm_b)(ii);
+    c[ii] = (*hm_c)(ii);
+  }
+}
+
+template <class T>
+void KOKKOSStream<T>::copy()
+{
+  View<double*, DEVICE> a(*d_a);
+  View<double*, DEVICE> b(*d_b);
+  View<double*, DEVICE> c(*d_c);
+
+  parallel_for(array_size, KOKKOS_LAMBDA (const int index)
+  {
+    c[index] = a[index];
+  });
+  Kokkos::fence();
+}
+
+template <class T>
+void KOKKOSStream<T>::mul()
+{
+  View<double*, DEVICE> a(*d_a);
+  View<double*, DEVICE> b(*d_b);
+  View<double*, DEVICE> c(*d_c);
+
+  const T scalar = 0.3;
+  parallel_for(array_size, KOKKOS_LAMBDA (const int index)
+  {
+    b[index] = scalar*c[index];
+  });
+  Kokkos::fence();
+}
+
+template <class T>
+void KOKKOSStream<T>::add()
+{
+  View<double*, DEVICE> a(*d_a);
+  View<double*, DEVICE> b(*d_b);
+  View<double*, DEVICE> c(*d_c);
+
+  parallel_for(array_size, KOKKOS_LAMBDA (const int index)
+  {
+    c[index] = a[index] + b[index];
+  });
+  Kokkos::fence();
+}
+
+template <class T>
+void KOKKOSStream<T>::triad()
+{
+  View<double*, DEVICE> a(*d_a);
+  View<double*, DEVICE> b(*d_b);
+  View<double*, DEVICE> c(*d_c);
+
+  const T scalar = 0.3;
+  parallel_for(array_size, KOKKOS_LAMBDA (const int index)
+  {
+    a[index] = b[index] + scalar*c[index];
+  });
+  Kokkos::fence();
+}
+
+void listDevices(void)
+{
+  std::cout << "This is not the device you are looking for.";
+}
+
+
+std::string getDeviceName(const int device)
+{
+  return "Kokkos";
+}
+
+
+std::string getDeviceDriver(const int device)
+{
+  return "Kokkos";
+}
+
+//template class KOKKOSStream<float>;
+template class KOKKOSStream<double>;
--- a/KOKKOSStream.hpp
+++ b/KOKKOSStream.hpp
@ -0,0 +1,56 @@
+// Copyright (c) 2015-16 Tom Deakin, Simon McIntosh-Smith,
+// University of Bristol HPC
+//
+// For full license terms please see the LICENSE file distributed with this
+// source code
+
+#pragma once
+
+#include <iostream>
+#include <stdexcept>
+
+#include <Kokkos_Core.hpp>
+#include <Kokkos_Parallel.hpp>
+#include <Kokkos_View.hpp>
+
+#include "Stream.h"
+
+#define IMPLEMENTATION_STRING "KOKKOS"
+
+#ifdef KOKKOS_TARGET_CPU
+  #define DEVICE Kokkos::OpenMP
+#else
+  #define DEVICE Kokkos::Cuda
+#endif
+
+template <class T>
+class KOKKOSStream : public Stream<T>
+{
+  protected:
+    // Size of arrays
+    unsigned int array_size;
+
+    // Device side pointers to arrays
+    Kokkos::View<double*, DEVICE>* d_a;
+    Kokkos::View<double*, DEVICE>* d_b;
+    Kokkos::View<double*, DEVICE>* d_c;
+    Kokkos::View<double*>::HostMirror* hm_a;
+    Kokkos::View<double*>::HostMirror* hm_b;
+    Kokkos::View<double*>::HostMirror* hm_c;
+
+  public:
+
+    KOKKOSStream(const unsigned int, const int);
+    ~KOKKOSStream();
+
+    virtual void copy() override;
+    virtual void add() override;
+    virtual void mul() override;
+    virtual void triad() override;
+
+    virtual void write_arrays(
+            const std::vector<T>& a, const std::vector<T>& b, const std::vector<T>& c) override;
+    virtual void read_arrays(
+            std::vector<T>& a, std::vector<T>& b, std::vector<T>& c) override;
+};
+
--- a/11
+++ b/11
@ -0,0 +1,11 @@
+
+default: gpu-stream-kokkos
+
+include $(KOKKOS_PATH)/Makefile.kokkos
+
+gpu-stream-kokkos: main.o KOKKOSStream.o
+	$(CXX) $(KOKKOS_LDFLAGS) $^ $(KOKKOS_LIBS) -o $@ -DKOKKOS
+
+%.o:%.cpp $(KOKKOS_CPP_DEPENDS)
+	$(NVCC_WRAPPER) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) -c $< -DKOKKOS
+
--- a/2
+++ b/2
@ -1,6 +1,6 @@
 *==============================================================================
 *------------------------------------------------------------------------------
-* Copyright 2015: Tom Deakin, Simon McIntosh-Smith, University of Bristol HPC
+* Copyright 2015-16: Tom Deakin, Simon McIntosh-Smith, University of Bristol HPC
 * Based on John D. McCalpin’s original STREAM benchmark for CPUs
 *------------------------------------------------------------------------------
 * License:
--- a/41
+++ b/41
@ -1,41 +0,0 @@
-LDLIBS = -l OpenCL
-CXXFLAGS = -std=c++11 -O3
-
-PLATFORM = $(shell uname -s)
-ifeq ($(PLATFORM), Darwin)
-	LDLIBS = -framework OpenCL
-endif
-
-all: gpu-stream-ocl gpu-stream-cuda gpu-stream-hip
-
-
-gpu-stream-ocl: ocl-stream.cpp common.o Makefile
-	$(CXX) $(CXXFLAGS) -Wno-deprecated-declarations common.o $< -o $@ $(LDLIBS)
-
-common.o: common.cpp common.h Makefile
-
-gpu-stream-cuda: cuda-stream.cu common.o Makefile
-ifeq ($(shell which nvcc > /dev/null; echo $$?), 0)
-	nvcc $(CXXFLAGS) common.o $< -o $@
-else
-	$(error "Cannot find nvcc, please install CUDA toolkit")
-endif
-HIP_PATH?=../../..
-HIPCC=$(HIP_PATH)/bin/hipcc
-
-hip-stream.o : hip-stream.cpp
-	$(HIPCC) $(CXXFLAGS) -c $< -o $@
-
-gpu-stream-hip: hip-stream.o common.o Makefile
-ifeq ($(shell which $(HIPCC) > /dev/null; echo $$?), 0)
-	$(HIPCC) $(CXXFLAGS) common.o $< -lm -o $@
-else
-	$(error "Cannot find $(HIPCC), please install HIP toolkit")
-endif
-
-
-.PHONY: clean
-
-clean:
-	rm -f gpu-stream-ocl gpu-stream-cuda gpu-stream-hip *.o
-
--- a/OCLStream.cpp
+++ b/OCLStream.cpp
@ -0,0 +1,255 @@
+
+// Copyright (c) 2015-16 Tom Deakin, Simon McIntosh-Smith,
+// University of Bristol HPC
+//
+// For full license terms please see the LICENSE file distributed with this
+// source code
+
+#include "OCLStream.h"
+
+// Cache list of devices
+bool cached = false;
+std::vector<cl::Device> devices;
+void getDeviceList(void);
+
+std::string kernels{R"CLC(
+
+  constant TYPE scalar = 0.3;
+
+  kernel void copy(
+    global const TYPE * restrict a,
+    global TYPE * restrict c)
+  {
+    const size_t i = get_global_id(0);
+    c[i] = a[i];
+  }
+
+  kernel void mul(
+    global TYPE * restrict b,
+    global const TYPE * restrict c)
+  {
+    const size_t i = get_global_id(0);
+    b[i] = scalar * c[i];
+  }
+
+  kernel void add(
+    global const TYPE * restrict a,
+    global const TYPE * restrict b,
+    global TYPE * restrict c)
+  {
+    const size_t i = get_global_id(0);
+    c[i] = a[i] + b[i];
+  }
+
+  kernel void triad(
+    global TYPE * restrict a,
+    global const TYPE * restrict b,
+    global const TYPE * restrict c)
+  {
+    const size_t i = get_global_id(0);
+    a[i] = b[i] + scalar * c[i];
+  }
+
+)CLC"};
+
+
+template <class T>
+OCLStream<T>::OCLStream(const unsigned int ARRAY_SIZE, const int device_index)
+{
+  if (!cached)
+    getDeviceList();
+
+  // Setup default OpenCL GPU
+  if (device_index >= devices.size())
+    throw std::runtime_error("Invalid device index");
+  device = devices[device_index];
+
+  // Print out device information
+  std::cout << "Using OpenCL device " << getDeviceName(device_index) << std::endl;
+  std::cout << "Driver: " << getDeviceDriver(device_index) << std::endl;
+
+  context = cl::Context(device);
+  queue = cl::CommandQueue(context);
+
+  // Create program
+  cl::Program program(context, kernels);
+  if (sizeof(T) == sizeof(double))
+  {
+    // Check device can do double
+    if (!device.getInfo<CL_DEVICE_DOUBLE_FP_CONFIG>())
+      throw std::runtime_error("Device does not support double precision, please use --float");
+    program.build("-DTYPE=double");
+  }
+  else if (sizeof(T) == sizeof(float))
+    program.build("-DTYPE=float");
+
+  // Create kernels
+  copy_kernel = new cl::KernelFunctor<cl::Buffer, cl::Buffer>(program, "copy");
+  mul_kernel = new cl::KernelFunctor<cl::Buffer, cl::Buffer>(program, "mul");
+  add_kernel = new cl::KernelFunctor<cl::Buffer, cl::Buffer, cl::Buffer>(program, "add");
+  triad_kernel = new cl::KernelFunctor<cl::Buffer, cl::Buffer, cl::Buffer>(program, "triad");
+
+  array_size = ARRAY_SIZE;
+
+  // Check buffers fit on the device
+  cl_ulong totalmem = device.getInfo<CL_DEVICE_GLOBAL_MEM_SIZE>();
+  cl_ulong maxbuffer = device.getInfo<CL_DEVICE_MAX_MEM_ALLOC_SIZE>();
+  if (maxbuffer < sizeof(T)*ARRAY_SIZE)
+    throw std::runtime_error("Device cannot allocate a buffer big enough");
+  if (totalmem < 3*sizeof(T)*ARRAY_SIZE)
+    throw std::runtime_error("Device does not have enough memory for all 3 buffers");
+
+  // Create buffers
+  d_a = cl::Buffer(context, CL_MEM_READ_WRITE, sizeof(T) * ARRAY_SIZE);
+  d_b = cl::Buffer(context, CL_MEM_READ_WRITE, sizeof(T) * ARRAY_SIZE);
+  d_c = cl::Buffer(context, CL_MEM_READ_WRITE, sizeof(T) * ARRAY_SIZE);
+
+}
+
+template <class T>
+OCLStream<T>::~OCLStream()
+{
+  delete copy_kernel;
+  delete mul_kernel;
+  delete add_kernel;
+  delete triad_kernel;
+}
+
+template <class T>
+void OCLStream<T>::copy()
+{
+  (*copy_kernel)(
+    cl::EnqueueArgs(queue, cl::NDRange(array_size)),
+    d_a, d_c
+  );
+  queue.finish();
+}
+
+template <class T>
+void OCLStream<T>::mul()
+{
+  (*mul_kernel)(
+    cl::EnqueueArgs(queue, cl::NDRange(array_size)),
+    d_b, d_c
+  );
+  queue.finish();
+}
+
+template <class T>
+void OCLStream<T>::add()
+{
+  (*add_kernel)(
+    cl::EnqueueArgs(queue, cl::NDRange(array_size)),
+    d_a, d_b, d_c
+  );
+  queue.finish();
+}
+
+template <class T>
+void OCLStream<T>::triad()
+{
+  (*triad_kernel)(
+    cl::EnqueueArgs(queue, cl::NDRange(array_size)),
+    d_a, d_b, d_c
+  );
+  queue.finish();
+}
+
+template <class T>
+void OCLStream<T>::write_arrays(const std::vector<T>& a, const std::vector<T>& b, const std::vector<T>& c)
+{
+  cl::copy(queue, a.begin(), a.end(), d_a);
+  cl::copy(queue, b.begin(), b.end(), d_b);
+  cl::copy(queue, c.begin(), c.end(), d_c);
+}
+
+template <class T>
+void OCLStream<T>::read_arrays(std::vector<T>& a, std::vector<T>& b, std::vector<T>& c)
+{
+  cl::copy(queue, d_a, a.begin(), a.end());
+  cl::copy(queue, d_b, b.begin(), b.end());
+  cl::copy(queue, d_c, c.begin(), c.end());
+}
+
+void getDeviceList(void)
+{
+  // Get list of platforms
+  std::vector<cl::Platform> platforms;
+  cl::Platform::get(&platforms);
+
+  // Enumerate devices
+  for (unsigned i = 0; i < platforms.size(); i++)
+  {
+    std::vector<cl::Device> plat_devices;
+    platforms[i].getDevices(CL_DEVICE_TYPE_ALL, &plat_devices);
+    devices.insert(devices.end(), plat_devices.begin(), plat_devices.end());
+  }
+  cached = true;
+}
+
+void listDevices(void)
+{
+  getDeviceList();
+
+  // Print device names
+  if (devices.size() == 0)
+  {
+    std::cerr << "No devices found." << std::endl;
+  }
+  else
+  {
+    std::cout << std::endl;
+    std::cout << "Devices:" << std::endl;
+    for (int i = 0; i < devices.size(); i++)
+    {
+      std::cout << i << ": " << getDeviceName(i) << std::endl;
+    }
+    std::cout << std::endl;
+  }
+
+
+}
+
+std::string getDeviceName(const int device)
+{
+  if (!cached)
+    getDeviceList();
+
+  std::string name;
+  cl_device_info info = CL_DEVICE_NAME;
+
+  if (device < devices.size())
+  {
+    devices[device].getInfo(info, &name);
+  }
+  else
+  {
+    throw std::runtime_error("Error asking for name for non-existant device");
+  }
+
+  return name;
+
+}
+
+std::string getDeviceDriver(const int device)
+{
+  if (!cached)
+    getDeviceList();
+
+  std::string driver;
+
+  if (device < devices.size())
+  {
+    devices[device].getInfo(CL_DRIVER_VERSION, &driver);
+  }
+  else
+  {
+    throw std::runtime_error("Error asking for driver for non-existant device");
+  }
+
+  return driver;
+}
+
+
+template class OCLStream<float>;
+template class OCLStream<double>;
--- a/OCLStream.h
+++ b/OCLStream.h
@ -0,0 +1,61 @@
+
+// Copyright (c) 2015-16 Tom Deakin, Simon McIntosh-Smith,
+// University of Bristol HPC
+//
+// For full license terms please see the LICENSE file distributed with this
+// source code
+
+#pragma once
+
+#include <iostream>
+#include <stdexcept>
+
+#define CL_HPP_ENABLE_EXCEPTIONS
+#define CL_HPP_TARGET_OPENCL_VERSION 120
+#define CL_HPP_MINIMUM_OPENCL_VERSION 120
+
+#include "CL/cl2.hpp"
+
+#include "Stream.h"
+
+#define IMPLEMENTATION_STRING "OpenCL"
+
+template <class T>
+class OCLStream : public Stream<T>
+{
+  protected:
+    // Size of arrays
+    unsigned int array_size;
+
+    // Device side pointers to arrays
+    cl::Buffer d_a;
+    cl::Buffer d_b;
+    cl::Buffer d_c;
+
+    // OpenCL objects
+    cl::Device device;
+    cl::Context context;
+    cl::CommandQueue queue;
+
+    cl::KernelFunctor<cl::Buffer, cl::Buffer> *copy_kernel;
+    cl::KernelFunctor<cl::Buffer, cl::Buffer> * mul_kernel;
+    cl::KernelFunctor<cl::Buffer, cl::Buffer, cl::Buffer> *add_kernel;
+    cl::KernelFunctor<cl::Buffer, cl::Buffer, cl::Buffer> *triad_kernel;
+
+  public:
+
+    OCLStream(const unsigned int, const int);
+    ~OCLStream();
+
+    virtual void copy() override;
+    virtual void add() override;
+    virtual void mul() override;
+    virtual void triad() override;
+
+    virtual void write_arrays(const std::vector<T>& a, const std::vector<T>& b, const std::vector<T>& c) override;
+    virtual void read_arrays(std::vector<T>& a, std::vector<T>& b, std::vector<T>& c) override;
+
+};
+
+// Populate the devices list
+void getDeviceList(void);
--- a/OMP3Stream.cpp
+++ b/OMP3Stream.cpp
@ -0,0 +1,111 @@
+
+// Copyright (c) 2015-16 Tom Deakin, Simon McIntosh-Smith,
+// University of Bristol HPC
+//
+// For full license terms please see the LICENSE file distributed with this
+// source code
+
+#include "OMP3Stream.h"
+
+template <class T>
+OMP3Stream<T>::OMP3Stream(const unsigned int ARRAY_SIZE, T *a, T *b, T *c)
+{
+  array_size = ARRAY_SIZE;
+  this->a = (T*)malloc(sizeof(T)*array_size);
+  this->b = (T*)malloc(sizeof(T)*array_size);
+  this->c = (T*)malloc(sizeof(T)*array_size);
+}
+
+template <class T>
+OMP3Stream<T>::~OMP3Stream()
+{
+  free(a);
+  free(b);
+  free(c);
+}
+
+
+template <class T>
+void OMP3Stream<T>::write_arrays(const std::vector<T>& h_a, const std::vector<T>& h_b, const std::vector<T>& h_c)
+{
+  #pragma omp parallel for
+  for (int i = 0; i < array_size; i++)
+  {
+    a[i] = h_a[i];
+    b[i] = h_b[i];
+    c[i] = h_c[i];
+  }
+}
+
+template <class T>
+void OMP3Stream<T>::read_arrays(std::vector<T>& h_a, std::vector<T>& h_b, std::vector<T>& h_c)
+{
+  #pragma omp parallel for
+  for (int i = 0; i < array_size; i++)
+  {
+    h_a[i] = a[i];
+    h_b[i] = b[i];
+    h_c[i] = c[i];
+  }
+}
+
+template <class T>
+void OMP3Stream<T>::copy()
+{
+  #pragma omp parallel for
+  for (int i = 0; i < array_size; i++)
+  {
+    c[i] = a[i];
+  }
+}
+
+template <class T>
+void OMP3Stream<T>::mul()
+{
+  const T scalar = 0.3;
+  #pragma omp parallel for
+  for (int i = 0; i < array_size; i++)
+  {
+    b[i] = scalar * c[i];
+  }
+}
+
+template <class T>
+void OMP3Stream<T>::add()
+{
+  #pragma omp parallel for
+  for (int i = 0; i < array_size; i++)
+  {
+    c[i] = a[i] + b[i];
+  }
+}
+
+template <class T>
+void OMP3Stream<T>::triad()
+{
+  const T scalar = 0.3;
+  #pragma omp parallel for
+  for (int i = 0; i < array_size; i++)
+  {
+    a[i] = b[i] + scalar * c[i];
+  }
+}
+
+void listDevices(void)
+{
+  std::cout << "0: CPU" << std::endl;
+}
+
+std::string getDeviceName(const int)
+{
+  return std::string("Device name unavailable");
+}
+
+std::string getDeviceDriver(const int)
+{
+  return std::string("Device driver unavailable");
+}
+
+
+template class OMP3Stream<float>;
+template class OMP3Stream<double>;
--- a/OMP3Stream.h
+++ b/OMP3Stream.h
@ -0,0 +1,40 @@
+
+// Copyright (c) 2015-16 Tom Deakin, Simon McIntosh-Smith,
+// University of Bristol HPC
+//
+// For full license terms please see the LICENSE file distributed with this
+// source code
+
+#pragma once
+
+#include <iostream>
+#include <stdexcept>
+
+#include "Stream.h"
+
+#define IMPLEMENTATION_STRING "Reference OpenMP"
+
+template <class T>
+class OMP3Stream : public Stream<T>
+{
+  protected:
+    // Size of arrays
+    unsigned int array_size;
+    // Device side pointers
+    T *a;
+    T *b;
+    T *c;
+
+  public:
+    OMP3Stream(const unsigned int, T*, T*, T*);
+    ~OMP3Stream();
+
+    virtual void copy() override;
+    virtual void add() override;
+    virtual void mul() override;
+    virtual void triad() override;
+
+    virtual void write_arrays(const std::vector<T>& a, const std::vector<T>& b, const std::vector<T>& c) override;
+    virtual void read_arrays(std::vector<T>& a, std::vector<T>& b, std::vector<T>& c) override;
+
+};
--- a/OMP45Stream.cpp
+++ b/OMP45Stream.cpp
@ -0,0 +1,140 @@
+
+// Copyright (c) 2015-16 Tom Deakin, Simon McIntosh-Smith,
+// University of Bristol HPC
+//
+// For full license terms please see the LICENSE file distributed with this
+// source code
+
+#include "OMP45Stream.h"
+
+template <class T>
+OMP45Stream<T>::OMP45Stream(const unsigned int ARRAY_SIZE, T *a, T *b, T *c, int device)
+{
+  omp_set_default_device(device);
+
+  array_size = ARRAY_SIZE;
+
+  // Set up data region on device
+  this->a = a;
+  this->b = b;
+  this->c = c;
+  #pragma omp target enter data map(to: a[0:array_size], b[0:array_size], c[0:array_size])
+  {}
+}
+
+template <class T>
+OMP45Stream<T>::~OMP45Stream()
+{
+  // End data region on device
+  unsigned int array_size = this->array_size;
+  T *a = this->a;
+  T *b = this->b;
+  T *c = this->c;
+  #pragma omp target exit data map(release: a[0:array_size], b[0:array_size], c[0:array_size])
+  {}
+}
+
+template <class T>
+void OMP45Stream<T>::write_arrays(const std::vector<T>& h_a, const std::vector<T>& h_b, const std::vector<T>& h_c)
+{
+  T *a = this->a;
+  T *b = this->b;
+  T *c = this->c;
+  #pragma omp target update to(a[0:array_size], b[0:array_size], c[0:array_size])
+  {}
+}
+
+template <class T>
+void OMP45Stream<T>::read_arrays(std::vector<T>& h_a, std::vector<T>& h_b, std::vector<T>& h_c)
+{
+  T *a = this->a;
+  T *b = this->b;
+  T *c = this->c;
+  #pragma omp target update from(a[0:array_size], b[0:array_size], c[0:array_size])
+  {}
+}
+
+template <class T>
+void OMP45Stream<T>::copy()
+{
+  unsigned int array_size = this->array_size;
+  T *a = this->a;
+  T *c = this->c;
+  #pragma omp target teams distribute parallel for simd map(to: a[0:array_size], c[0:array_size])
+  for (int i = 0; i < array_size; i++)
+  {
+    c[i] = a[i];
+  }
+}
+
+template <class T>
+void OMP45Stream<T>::mul()
+{
+  const T scalar = 0.3;
+
+  unsigned int array_size = this->array_size;
+  T *b = this->b;
+  T *c = this->c;
+  #pragma omp target teams distribute parallel for simd map(to: b[0:array_size], c[0:array_size])
+  for (int i = 0; i < array_size; i++)
+  {
+    b[i] = scalar * c[i];
+  }
+}
+
+template <class T>
+void OMP45Stream<T>::add()
+{
+  unsigned int array_size = this->array_size;
+  T *a = this->a;
+  T *b = this->b;
+  T *c = this->c;
+  #pragma omp target teams distribute parallel for simd map(to: a[0:array_size], b[0:array_size], c[0:array_size])
+  for (int i = 0; i < array_size; i++)
+  {
+    c[i] = a[i] + b[i];
+  }
+}
+
+template <class T>
+void OMP45Stream<T>::triad()
+{
+  const T scalar = 0.3;
+
+  unsigned int array_size = this->array_size;
+  T *a = this->a;
+  T *b = this->b;
+  T *c = this->c;
+  #pragma omp target teams distribute parallel for simd map(to: a[0:array_size], b[0:array_size], c[0:array_size])
+  for (int i = 0; i < array_size; i++)
+  {
+    a[i] = b[i] + scalar * c[i];
+  }
+}
+void listDevices(void)
+{
+  // Get number of devices
+  int count = omp_get_num_devices();
+
+  // Print device list
+  if (count == 0)
+  {
+    std::cerr << "No devices found." << std::endl;
+  }
+  else
+  {
+    std::cout << "There are " << count << " devices." << std::endl;
+  }
+}
+
+std::string getDeviceName(const int)
+{
+  return std::string("Device name unavailable");
+}
+
+std::string getDeviceDriver(const int)
+{
+  return std::string("Device driver unavailable");
+}
+template class OMP45Stream<float>;
+template class OMP45Stream<double>;
--- a/OMP45Stream.h
+++ b/OMP45Stream.h
@ -0,0 +1,45 @@
+
+// Copyright (c) 2015-16 Tom Deakin, Simon McIntosh-Smith,
+// University of Bristol HPC
+//
+// For full license terms please see the LICENSE file distributed with this
+// source code
+
+#pragma once
+
+#include <iostream>
+#include <stdexcept>
+
+#include "Stream.h"
+
+#include <omp.h>
+
+#define IMPLEMENTATION_STRING "OpenMP 4.5"
+
+template <class T>
+class OMP45Stream : public Stream<T>
+{
+  protected:
+    // Size of arrays
+    unsigned int array_size;
+
+    // Device side pointers
+    T *a;
+    T *b;
+    T *c;
+
+  public:
+    OMP45Stream(const unsigned int, T*, T*, T*, int);
+    ~OMP45Stream();
+
+    virtual void copy() override;
+    virtual void add() override;
+    virtual void mul() override;
+    virtual void triad() override;
+
+    virtual void write_arrays(const std::vector<T>& a, const std::vector<T>& b, const std::vector<T>& c) override;
+    virtual void read_arrays(std::vector<T>& a, std::vector<T>& b, std::vector<T>& c) override;
+
+
+
+};
--- a/RAJAStream.cpp
+++ b/RAJAStream.cpp
@ -0,0 +1,130 @@
+
+// Copyright (c) 2015-16 Tom Deakin, Simon McIntosh-Smith,
+// University of Bristol HPC
+//
+// For full license terms please see the LICENSE file distributed with this
+// source code
+
+#include "RAJAStream.hpp"
+
+using RAJA::forall;
+using RAJA::RangeSegment;
+
+template <class T>
+RAJAStream<T>::RAJAStream(const unsigned int ARRAY_SIZE, const int device_index)
+    : array_size(ARRAY_SIZE)
+{
+  RangeSegment seg(0, ARRAY_SIZE);
+  index_set.push_back(seg);
+
+#ifdef RAJA_TARGET_CPU
+  d_a = new T[ARRAY_SIZE];
+  d_b = new T[ARRAY_SIZE];
+  d_c = new T[ARRAY_SIZE];
+#else
+  cudaMallocManaged((void**)&d_a, sizeof(T)*ARRAY_SIZE, cudaMemAttachGlobal);
+  cudaMallocManaged((void**)&d_b, sizeof(T)*ARRAY_SIZE, cudaMemAttachGlobal);
+  cudaMallocManaged((void**)&d_c, sizeof(T)*ARRAY_SIZE, cudaMemAttachGlobal);
+  cudaDeviceSynchronize();
+#endif
+}
+
+template <class T>
+RAJAStream<T>::~RAJAStream()
+{
+#ifdef RAJA_TARGET_CPU
+  delete[] d_a;
+  delete[] d_b;
+  delete[] d_c;
+#else
+  cudaFree(d_a);
+  cudaFree(d_b);
+  cudaFree(d_c);
+#endif
+}
+
+template <class T>
+void RAJAStream<T>::write_arrays(
+        const std::vector<T>& a, const std::vector<T>& b, const std::vector<T>& c)
+{
+  std::copy(a.begin(), a.end(), d_a);
+  std::copy(b.begin(), b.end(), d_b);
+  std::copy(c.begin(), c.end(), d_c);
+}
+
+template <class T>
+void RAJAStream<T>::read_arrays(
+        std::vector<T>& a, std::vector<T>& b, std::vector<T>& c)
+{
+  std::copy(d_a, d_a + array_size, a.data());
+  std::copy(d_b, d_b + array_size, b.data());
+  std::copy(d_c, d_c + array_size, c.data());
+}
+
+template <class T>
+void RAJAStream<T>::copy()
+{
+  T* a = d_a;
+  T* c = d_c;
+  forall<policy>(index_set, [=] RAJA_DEVICE (int index)
+  {
+    c[index] = a[index];
+  });
+}
+
+template <class T>
+void RAJAStream<T>::mul()
+{
+  T* b = d_b;
+  T* c = d_c;
+  const T scalar = 0.3;
+  forall<policy>(index_set, [=] RAJA_DEVICE (int index)
+  {
+    b[index] = scalar*c[index];
+  });
+}
+
+template <class T>
+void RAJAStream<T>::add()
+{
+  T* a = d_a;
+  T* b = d_b;
+  T* c = d_c;
+  forall<policy>(index_set, [=] RAJA_DEVICE (int index)
+  {
+    c[index] = a[index] + b[index];
+  });
+}
+
+template <class T>
+void RAJAStream<T>::triad()
+{
+  T* a = d_a;
+  T* b = d_b;
+  T* c = d_c;
+  const T scalar = 0.3;
+  forall<policy>(index_set, [=] RAJA_DEVICE (int index)
+  {
+    a[index] = b[index] + scalar*c[index];
+  });
+}
+
+void listDevices(void)
+{
+  std::cout << "This is not the device you are looking for.";
+}
+
+
+std::string getDeviceName(const int device)
+{
+  return "RAJA";
+}
+
+
+std::string getDeviceDriver(const int device)
+{
+  return "RAJA";
+}
+
+template class RAJAStream<float>;
+template class RAJAStream<double>;
--- a/RAJAStream.hpp
+++ b/RAJAStream.hpp
@ -0,0 +1,58 @@
+// Copyright (c) 2015-16 Tom Deakin, Simon McIntosh-Smith,
+// University of Bristol HPC
+//
+// For full license terms please see the LICENSE file distributed with this
+// source code
+
+#pragma once
+
+#include <iostream>
+#include <stdexcept>
+#include "RAJA/RAJA.hxx"
+
+#include "Stream.h"
+
+#define IMPLEMENTATION_STRING "RAJA"
+
+#ifdef RAJA_TARGET_CPU
+typedef RAJA::IndexSet::ExecPolicy<
+        RAJA::seq_segit,
+        RAJA::omp_parallel_for_exec> policy;
+#else
+const size_t block_size = 128;
+typedef RAJA::IndexSet::ExecPolicy<
+        RAJA::seq_segit,
+        RAJA::cuda_exec<block_size>> policy;
+#endif
+
+template <class T>
+class RAJAStream : public Stream<T>
+{
+  protected:
+    // Size of arrays
+    unsigned int array_size;
+
+    // Contains iteration space
+    RAJA::IndexSet index_set;
+
+    // Device side pointers to arrays
+    T* d_a;
+    T* d_b;
+    T* d_c;
+
+  public:
+
+    RAJAStream(const unsigned int, const int);
+    ~RAJAStream();
+
+    virtual void copy() override;
+    virtual void add() override;
+    virtual void mul() override;
+    virtual void triad() override;
+
+    virtual void write_arrays(
+            const std::vector<T>& a, const std::vector<T>& b, const std::vector<T>& c) override;
+    virtual void read_arrays(
+            std::vector<T>& a, std::vector<T>& b, std::vector<T>& c) override;
+};
+
--- a/README.md
+++ b/README.md
@ -6,24 +6,26 @@ This benchmark is similar in spirit, and based on, the STREAM benchmark [1] for

 Unlike other GPU memory bandwidth benchmarks this does *not* include the PCIe transfer time.

+There are multiple implementations of this benchmark in a variety of programming models.
+Currently implemented are:
+  - OpenCL
+  - CUDA
+  - OpenACC
+  - OpenMP 3 and 4.5
+  - Kokkos
+  - RAJA
+  - SYCL
+
 Usage
 -----

-Build the OpenCL and CUDA binaries with `make` (CUDA version requires CUDA >= v6.5)
+CMake 3.2 or above is required.
+Drivers, compiler and software applicable to whichever implementation you would like to build against. Our build system is designed to only build implementations in programming models that your system supports.

-Run the OpenCL version with `./gpu-stream-ocl` and the CUDA version with `./gpu-stream-cuda`
+Generate the Makefile with `cmake .`

-For HIP version, follow the instructions on the following blog to properly install ROCK and ROCR drivers:
-http://gpuopen.com/getting-started-with-boltzmann-components-platforms-installation/
-Install the HCC compiler:
-https://bitbucket.org/multicoreware/hcc/wiki/Home
-Install HIP:
-https://github.com/GPUOpen-ProfessionalCompute-Tools/HIP
-
-Build the HIP binaries with make gpu-stream-hip, run it with './gpu-stream-hip'
-
-Android
-------
+Android (outdated instructions)
+------------------

 Assuming you have a recent Android NDK available, you can use the
 toolchain that it provides to build GPU-STREAM. You should first
@ -57,6 +59,7 @@ Run GPU-STREAM from an adb shell:
    # Use float if device doesn't support double, and reduce array size
    ./gpu-stream-ocl --float -n 6 -s 10000000

+
 Results
 -------

--- a/SYCLStream.cpp
+++ b/SYCLStream.cpp
@ -0,0 +1,231 @@
+
+// Copyright (c) 2015-16 Tom Deakin, Simon McIntosh-Smith,
+// University of Bristol HPC
+//
+// For full license terms please see the LICENSE file distributed with this
+// source code
+
+#include "SYCLStream.h"
+
+#include <iostream>
+
+using namespace cl::sycl;
+
+#define WGSIZE 64
+
+// Cache list of devices
+bool cached = false;
+std::vector<device> devices;
+void getDeviceList(void);
+
+template <class T>
+SYCLStream<T>::SYCLStream(const unsigned int ARRAY_SIZE, const int device_index)
+{
+  if (!cached)
+    getDeviceList();
+
+  // The array size must be divisible by WGSIZE
+  if (ARRAY_SIZE % WGSIZE != 0)
+  {
+    std::stringstream ss;
+    ss << "Array size must be a multiple of " << WGSIZE;
+    throw std::runtime_error(ss.str());
+  }
+
+  array_size = ARRAY_SIZE;
+
+  if (device_index >= devices.size())
+    throw std::runtime_error("Invalid device index");
+  device dev = devices[device_index];
+
+  // Print out device information
+  std::cout << "Using SYCL device " << getDeviceName(device_index) << std::endl;
+  std::cout << "Driver: " << getDeviceDriver(device_index) << std::endl;
+
+  queue = new cl::sycl::queue(dev);
+
+  // Create buffers
+  d_a = new buffer<T>(array_size);
+  d_b = new buffer<T>(array_size);
+  d_c = new buffer<T>(array_size);
+}
+
+template <class T>
+SYCLStream<T>::~SYCLStream()
+{
+  delete d_a;
+  delete d_b;
+  delete d_c;
+
+  delete queue;
+}
+
+template <class T>
+void SYCLStream<T>::copy()
+{
+  queue->submit([&](handler &cgh)
+  {
+    auto ka = d_a->template get_access<access::mode::read>(cgh);
+    auto kc = d_c->template get_access<access::mode::write>(cgh);
+    cgh.parallel_for<class copy>(nd_range<1>{array_size, WGSIZE}, [=](nd_item<1> item)
+    {
+      kc[item.get_global()] = ka[item.get_global()];
+    });
+  });
+  queue->wait();
+}
+
+template <class T>
+void SYCLStream<T>::mul()
+{
+  const T scalar = 0.3;
+  queue->submit([&](handler &cgh)
+  {
+    auto kb = d_b->template get_access<access::mode::write>(cgh);
+    auto kc = d_c->template get_access<access::mode::read>(cgh);
+    cgh.parallel_for<class mul>(nd_range<1>{array_size, WGSIZE}, [=](nd_item<1> item)
+    {
+      kb[item.get_global()] = scalar * kc[item.get_global()];
+    });
+  });
+  queue->wait();
+}
+
+template <class T>
+void SYCLStream<T>::add()
+{
+  queue->submit([&](handler &cgh)
+  {
+    auto ka = d_a->template get_access<access::mode::read>(cgh);
+    auto kb = d_b->template get_access<access::mode::read>(cgh);
+    auto kc = d_c->template get_access<access::mode::write>(cgh);
+    cgh.parallel_for<class add>(nd_range<1>{array_size, WGSIZE}, [=](nd_item<1> item)
+    {
+      kc[item.get_global()] = ka[item.get_global()] + kb[item.get_global()];
+    });
+  });
+  queue->wait();
+}
+
+template <class T>
+void SYCLStream<T>::triad()
+{
+  const T scalar = 0.3;
+  queue->submit([&](handler &cgh)
+  {
+    auto ka = d_a->template get_access<access::mode::write>(cgh);
+    auto kb = d_b->template get_access<access::mode::read>(cgh);
+    auto kc = d_c->template get_access<access::mode::read>(cgh);
+    cgh.parallel_for<class triad>(nd_range<1>{array_size, WGSIZE}, [=](nd_item<1> item)
+    {
+      ka[item.get_global()] = kb[item.get_global()] + scalar * kc[item.get_global()];
+    });
+  });
+  queue->wait();
+}
+
+template <class T>
+void SYCLStream<T>::write_arrays(const std::vector<T>& a, const std::vector<T>& b, const std::vector<T>& c)
+{
+  auto _a = d_a->template get_access<access::mode::write, access::target::host_buffer>();
+  auto _b = d_b->template get_access<access::mode::write, access::target::host_buffer>();
+  auto _c = d_c->template get_access<access::mode::write, access::target::host_buffer>();
+  for (int i = 0; i < array_size; i++)
+  {
+    _a[i] = a[i];
+    _b[i] = b[i];
+    _c[i] = c[i];
+  }
+}
+
+template <class T>
+void SYCLStream<T>::read_arrays(std::vector<T>& a, std::vector<T>& b, std::vector<T>& c)
+{
+  auto _a = d_a->template get_access<access::mode::read, access::target::host_buffer>();
+  auto _b = d_b->template get_access<access::mode::read, access::target::host_buffer>();
+  auto _c = d_c->template get_access<access::mode::read, access::target::host_buffer>();
+  for (int i = 0; i < array_size; i++)
+  {
+    a[i] = _a[i];
+    b[i] = _b[i];
+    c[i] = _c[i];
+  }
+}
+
+void getDeviceList(void)
+{
+  // Get list of platforms
+  std::vector<platform> platforms = platform::get_platforms();
+
+  // Enumerate devices
+  for (unsigned i = 0; i < platforms.size(); i++)
+  {
+    std::vector<device> plat_devices = platforms[i].get_devices();
+    devices.insert(devices.end(), plat_devices.begin(), plat_devices.end());
+  }
+  cached = true;
+}
+
+void listDevices(void)
+{
+  getDeviceList();
+
+  // Print device names
+  if (devices.size() == 0)
+  {
+    std::cerr << "No devices found." << std::endl;
+  }
+  else
+  {
+    std::cout << std::endl;
+    std::cout << "Devices:" << std::endl;
+    for (int i = 0; i < devices.size(); i++)
+    {
+      std::cout << i << ": " << getDeviceName(i) << std::endl;
+    }
+    std::cout << std::endl;
+  }
+}
+
+std::string getDeviceName(const int device)
+{
+  if (!cached)
+    getDeviceList();
+
+  std::string name;
+
+  if (device < devices.size())
+  {
+    name = devices[device].get_info<info::device::name>();
+  }
+  else
+  {
+    throw std::runtime_error("Error asking for name for non-existant device");
+  }
+
+  return name;
+}
+
+std::string getDeviceDriver(const int device)
+{
+  if (!cached)
+    getDeviceList();
+
+  std::string driver;
+
+  if (device < devices.size())
+  {
+    driver = devices[device].get_info<info::device::driver_version>();
+  }
+  else
+  {
+    throw std::runtime_error("Error asking for driver for non-existant device");
+  }
+
+  return driver;
+}
+
+
+// TODO: Fix kernel names to allow multiple template specializations
+//template class SYCLStream<float>;
+template class SYCLStream<double>;
--- a/SYCLStream.h
+++ b/SYCLStream.h
@ -0,0 +1,47 @@
+
+// Copyright (c) 2015-16 Tom Deakin, Simon McIntosh-Smith,
+// University of Bristol HPC
+//
+// For full license terms please see the LICENSE file distributed with this
+// source code
+
+#pragma once
+
+#include <sstream>
+
+#include "Stream.h"
+
+#include "CL/sycl.hpp"
+
+#define IMPLEMENTATION_STRING "SYCL"
+
+template <class T>
+class SYCLStream : public Stream<T>
+{
+  protected:
+    // Size of arrays
+    unsigned int array_size;
+
+    // SYCL objects
+    cl::sycl::queue *queue;
+    cl::sycl::buffer<T> *d_a;
+    cl::sycl::buffer<T> *d_b;
+    cl::sycl::buffer<T> *d_c;
+
+  public:
+
+    SYCLStream(const unsigned int, const int);
+    ~SYCLStream();
+
+    virtual void copy() override;
+    virtual void add() override;
+    virtual void mul() override;
+    virtual void triad() override;
+
+    virtual void write_arrays(const std::vector<T>& a, const std::vector<T>& b, const std::vector<T>& c) override;
+    virtual void read_arrays(std::vector<T>& a, std::vector<T>& b, std::vector<T>& c) override;
+
+};
+
+// Populate the devices list
+void getDeviceList(void);
--- a/Stream.h
+++ b/Stream.h
@ -0,0 +1,37 @@
+
+// Copyright (c) 2015-16 Tom Deakin, Simon McIntosh-Smith,
+// University of Bristol HPC
+//
+// For full license terms please see the LICENSE file distributed with this
+// source code
+
+#pragma once
+
+#include <vector>
+#include <string>
+
+template <class T>
+class Stream
+{
+  public:
+
+    virtual ~Stream(){}
+
+    // Kernels
+    // These must be blocking calls
+    virtual void copy() = 0;
+    virtual void mul() = 0;
+    virtual void add() = 0;
+    virtual void triad() = 0;
+
+    // Copy memory between host and device
+    virtual void write_arrays(const std::vector<T>& a, const std::vector<T>& b, const std::vector<T>& c) = 0;
+    virtual void read_arrays(std::vector<T>& a, std::vector<T>& b, std::vector<T>& c) = 0;
+
+};
+
+
+// Implementation specific device functions
+void listDevices(void);
+std::string getDeviceName(const int);
+std::string getDeviceDriver(const int);
--- a/common.cpp
+++ b/common.cpp
@ -1,165 +0,0 @@
-/*=============================================================================
-*------------------------------------------------------------------------------
-* Copyright 2015: Tom Deakin, Simon McIntosh-Smith, University of Bristol HPC
-* Based on John D. McCalpin’s original STREAM benchmark for CPUs
-*------------------------------------------------------------------------------
-* License:
-*  1. You are free to use this program and/or to redistribute
-*     this program.
-*  2. You are free to modify this program for your own use,
-*     including commercial use, subject to the publication
-*     restrictions in item 3.
-*  3. You are free to publish results obtained from running this
-*     program, or from works that you derive from this program,
-*     with the following limitations:
-*     3a. In order to be referred to as "GPU-STREAM benchmark results",
-*         published results must be in conformance to the GPU-STREAM
-*         Run Rules published at
-*         http://github.com/UoB-HPC/GPU-STREAM/wiki/Run-Rules
-*         and incorporated herein by reference.
-*         The copyright holders retain the
-*         right to determine conformity with the Run Rules.
-*     3b. Results based on modified source code or on runs not in
-*         accordance with the GPU-STREAM Run Rules must be clearly
-*         labelled whenever they are published.  Examples of
-*         proper labelling include:
-*         "tuned GPU-STREAM benchmark results"
-*         "based on a variant of the GPU-STREAM benchmark code"
-*         Other comparable, clear and reasonable labelling is
-*         acceptable.
-*     3c. Submission of results to the GPU-STREAM benchmark web site
-*         is encouraged, but not required.
-*  4. Use of this program or creation of derived works based on this
-*     program constitutes acceptance of these licensing restrictions.
-*  5. Absolutely no warranty is expressed or implied.
-*———————————————————————————————————-----------------------------------------*/
-
-#include "common.h"
-
-// Default array size 50 * 2^20 (50*8 Mebibytes double precision)
-// Use binary powers of two so divides 1024
-unsigned int ARRAY_SIZE = 52428800;
-size_t   ARRAY_PAD_BYTES  = 0;
-
-unsigned int NTIMES = 10;
-
-bool useFloat = false;
-unsigned int  groups   = 0;
-unsigned int  groupSize   = 1024;
-
-unsigned int deviceIndex = 0;
-
-int parseUInt(const char *str, unsigned int *output)
-{
-    char *next;
-    *output = strtoul(str, &next, 10);
-    return !strlen(next);
-}
-
-int parseSize(const char *str, size_t *output)
-{
-    char *next;
-    *output = strtoull(str, &next, 0);
-	int l = strlen(str);
-	if (l) {
-		char c = str[l-1]; // last char.
-		if ((c == 'k') || (c == 'K')) {
-			*output *= 1024;
-		}
-		if ((c == 'm') || (c == 'M')) {
-			*output *= (1024*1024);
-		}
-
-	}
-    return !strlen(next);
-}
-
-
-void parseArguments(int argc, char *argv[])
-{
-    for (int i = 1; i < argc; i++)
-    {
-        if (!strcmp(argv[i], "--list"))
-        {
-            listDevices();
-            exit(0);
-        }
-        else if (!strcmp(argv[i], "--device"))
-        {
-            if (++i >= argc || !parseUInt(argv[i], &deviceIndex))
-            {
-                std::cout << "Invalid device index" << std::endl;
-                exit(1);
-            }
-        }
-        else if (!strcmp(argv[i], "--arraysize") || !strcmp(argv[i], "-s"))
-        {
-            if (++i >= argc || !parseUInt(argv[i], &ARRAY_SIZE))
-            {
-                std::cout << "Invalid array size" << std::endl;
-                exit(1);
-            }
-        }
-        else if (!strcmp(argv[i], "--numtimes") || !strcmp(argv[i], "-n"))
-        {
-            if (++i >= argc || !parseUInt(argv[i], &NTIMES))
-            {
-                std::cout << "Invalid number of times" << std::endl;
-                exit(1);
-            }
-        }
-        else if (!strcmp(argv[i], "--groups"))
-        {
-            if (++i >= argc || !parseUInt(argv[i], &groups))
-            {
-                std::cout << "Invalid group number" << std::endl;
-                exit(1);
-            }
-        }
-        else if (!strcmp(argv[i], "--groupSize"))
-        {
-            if (++i >= argc || !parseUInt(argv[i], &groupSize))
-            {
-                std::cout << "Invalid group size" << std::endl;
-                exit(1);
-            }
-        }
-        else if (!strcmp(argv[i], "--pad"))
-        {
-            if (++i >= argc || !parseSize(argv[i], &ARRAY_PAD_BYTES))
-            {
-                std::cout << "Invalid size" << std::endl;
-                exit(1);
-            }
-
-        }
-        else if (!strcmp(argv[i], "--float"))
-        {
-            useFloat = true;
-            std::cout << "Warning: If number of iterations set >= 8, expect rounding errors with single precision, not apply to AMD device" << std::endl;
-        }
-        else if (!strcmp(argv[i], "--help") || !strcmp(argv[i], "-h"))
-        {
-            std::cout << std::endl;
-            std::cout << "Usage: ./gpu-stream-cuda [OPTIONS]" << std::endl << std::endl;
-            std::cout << "Options:" << std::endl;
-            std::cout << "  -h  --help               Print the message" << std::endl;
-            std::cout << "      --list               List available devices" << std::endl;
-            std::cout << "      --device     INDEX   Select device at INDEX" << std::endl;
-            std::cout << "  -s  --arraysize  SIZE    Use SIZE elements in the array" << std::endl;
-            std::cout << "  -n  --numtimes   NUM     Run the test NUM times (NUM >= 2)" << std::endl;
-            std::cout << "      --groups             Set number of groups to launch -  each work-item proceses multiple array items" << std::endl;
-            std::cout << "      --groupSize          Set size of each group (default 1024)" << std::endl;
-            std::cout << "      --pad                Add additional array padding. Can use trailing K (KB) or M (MB)" << std::endl;
-            std::cout << "      --float              Use floats (rather than doubles)" << std::endl;
-            std::cout << std::endl;
-            exit(0);
-        }
-        else
-        {
-            std::cout << "Unrecognized argument '" << argv[i] << "' (try '--help')"
-                << std::endl;
-            exit(1);
-        }
-    }
-}
--- a/common.h
+++ b/common.h
@ -1,115 +0,0 @@
-/*=============================================================================
-*------------------------------------------------------------------------------
-* Copyright 2015: Tom Deakin, Simon McIntosh-Smith, University of Bristol HPC
-* Based on John D. McCalpin’s original STREAM benchmark for CPUs
-*------------------------------------------------------------------------------
-* License:
-*  1. You are free to use this program and/or to redistribute
-*     this program.
-*  2. You are free to modify this program for your own use,
-*     including commercial use, subject to the publication
-*     restrictions in item 3.
-*  3. You are free to publish results obtained from running this
-*     program, or from works that you derive from this program,
-*     with the following limitations:
-*     3a. In order to be referred to as "GPU-STREAM benchmark results",
-*         published results must be in conformance to the GPU-STREAM
-*         Run Rules published at
-*         http://github.com/UoB-HPC/GPU-STREAM/wiki/Run-Rules
-*         and incorporated herein by reference.
-*         The copyright holders retain the
-*         right to determine conformity with the Run Rules.
-*     3b. Results based on modified source code or on runs not in
-*         accordance with the GPU-STREAM Run Rules must be clearly
-*         labelled whenever they are published.  Examples of
-*         proper labelling include:
-*         "tuned GPU-STREAM benchmark results"
-*         "based on a variant of the GPU-STREAM benchmark code"
-*         Other comparable, clear and reasonable labelling is
-*         acceptable.
-*     3c. Submission of results to the GPU-STREAM benchmark web site
-*         is encouraged, but not required.
-*  4. Use of this program or creation of derived works based on this
-*     program constitutes acceptance of these licensing restrictions.
-*  5. Absolutely no warranty is expressed or implied.
-*———————————————————————————————————-----------------------------------------*/
-
-#include <iomanip>
-#include <iostream>
-#include <cstdlib>
-#include <cstring>
-#include <limits>
-#include <stdexcept>
-
-#define VERSION_STRING "1.0"
-
-extern void parseArguments(int argc, char *argv[]);
-
-extern void listDevices(void);
-
-extern unsigned int ARRAY_SIZE;
-extern size_t       ARRAY_PAD_BYTES;
-extern unsigned int NTIMES;
-
-extern unsigned int  groups;
-extern unsigned int  groupSize;
-extern bool useFloat;
-
-extern unsigned int deviceIndex;
-
-
-template < typename T >
-void check_solution(void* a_in, void* b_in, void* c_in)
-{
-    // Generate correct solution
-    T golda = 1.0;
-    T goldb = 2.0;
-    T goldc = 0.0;
-
-    T * a = static_cast<T*>(a_in);
-    T * b = static_cast<T*>(b_in);
-    T * c = static_cast<T*>(c_in);
-
-    const T scalar = 3.0;
-
-    for (unsigned int i = 0; i < NTIMES; i++)
-    {
-        // Double
-        goldc = golda;
-        goldb = scalar * goldc;
-        goldc = golda + goldb;
-        golda = goldb + scalar * goldc;
-    }
-
-    // Calculate average error
-    double erra = 0.0;
-    double errb = 0.0;
-    double errc = 0.0;
-
-    for (unsigned int i = 0; i < ARRAY_SIZE; i++)
-    {
-        erra += fabs(a[i] - golda);
-        errb += fabs(b[i] - goldb);
-        errc += fabs(c[i] - goldc);
-    }
-
-    erra /= ARRAY_SIZE;
-    errb /= ARRAY_SIZE;
-    errc /= ARRAY_SIZE;
-
-    double epsi = std::numeric_limits<T>::epsilon() * 100;
-
-    if (erra > epsi)
-        std::cout
-            << "Validation failed on a[]. Average error " << erra
-            << std::endl;
-    if (errb > epsi)
-        std::cout
-            << "Validation failed on b[]. Average error " << errb
-            << std::endl;
-    if (errc > epsi)
-        std::cout
-            << "Validation failed on c[]. Average error " << errc
-            << std::endl;
-}
-
--- a/common.h.in
+++ b/common.h.in
@ -0,0 +1,9 @@
+
+// Copyright (c) 2015-16 Tom Deakin, Simon McIntosh-Smith,
+// University of Bristol HPC
+//
+// For full license terms please see the LICENSE file distributed with this
+// source code
+
+#define VERSION_STRING "@gpu-stream_VERSION_MAJOR@.@gpu-stream_VERSION_MINOR@"
+
--- a/cuda-stream.cu
+++ b/cuda-stream.cu
@ -1,507 +0,0 @@
-/*=============================================================================
-*------------------------------------------------------------------------------
-* Copyright 2015: Tom Deakin, Simon McIntosh-Smith, University of Bristol HPC
-* Based on John D. McCalpin’s original STREAM benchmark for CPUs
-*------------------------------------------------------------------------------
-* License:
-*  1. You are free to use this program and/or to redistribute
-*     this program.
-*  2. You are free to modify this program for your own use,
-*     including commercial use, subject to the publication
-*     restrictions in item 3.
-*  3. You are free to publish results obtained from running this
-*     program, or from works that you derive from this program,
-*     with the following limitations:
-*     3a. In order to be referred to as "GPU-STREAM benchmark results",
-*         published results must be in conformance to the GPU-STREAM
-*         Run Rules published at
-*         http://github.com/UoB-HPC/GPU-STREAM/wiki/Run-Rules
-*         and incorporated herein by reference.
-*         The copyright holders retain the
-*         right to determine conformity with the Run Rules.
-*     3b. Results based on modified source code or on runs not in
-*         accordance with the GPU-STREAM Run Rules must be clearly
-*         labelled whenever they are published.  Examples of
-*         proper labelling include:
-*         "tuned GPU-STREAM benchmark results"
-*         "based on a variant of the GPU-STREAM benchmark code"
-*         Other comparable, clear and reasonable labelling is
-*         acceptable.
-*     3c. Submission of results to the GPU-STREAM benchmark web site
-*         is encouraged, but not required.
-*  4. Use of this program or creation of derived works based on this
-*     program constitutes acceptance of these licensing restrictions.
-*  5. Absolutely no warranty is expressed or implied.
-*———————————————————————————————————-----------------------------------------*/
-
-
-#include <iostream>
-#include <fstream>
-#include <vector>
-#include <chrono>
-#include <cfloat>
-#include <cmath>
-
-#include <cuda.h>
-#include "common.h"
-
-std::string getDeviceName(int device);
-int getDriver(void);
-
-// Code to check CUDA errors
-void check_cuda_error(void)
-{
-    cudaError_t err = cudaGetLastError();
-    if (err != cudaSuccess)
-    {
-        std::cerr
-            << "Error: "
-            << cudaGetErrorString(err)
-            << std::endl;
-            exit(err);
-    }
-}
-
-// looper function place more work inside each work item.
-// Goal is reduce the dispatch overhead for each group, and also give more controlover the order of memory operations
-template <typename T>
-__global__ void
-copy_looper(const T * a, T * c, int ARRAY_SIZE)
-{
-    int offset = (blockDim.x * blockIdx.x + threadIdx.x);
-    int stride = blockDim.x * gridDim.x;
-
-    for (int i=offset; i<ARRAY_SIZE; i+=stride) {
-        c[i] = a[i];
-    }
-}
-
-template <typename T>
-__global__ void
-mul_looper(T * b, const T * c, int ARRAY_SIZE)
-{
-    int offset = blockDim.x * blockIdx.x + threadIdx.x;
-    int stride = blockDim.x * gridDim.x;
-    const T scalar = 3.0;
-
-    for (int i=offset; i<ARRAY_SIZE; i+=stride) {
-        b[i] = scalar * c[i];
-    }
-}
-
-template <typename T>
-__global__ void
-add_looper(const T * a, const T * b, T * c, int ARRAY_SIZE)
-{
-    int offset = blockDim.x * blockIdx.x + threadIdx.x;
-    int stride = blockDim.x * gridDim.x;
-
-    for (int i=offset; i<ARRAY_SIZE; i+=stride) {
-        c[i] = a[i] + b[i];
-    }
-}
-
-template <typename T>
-__global__ void
-triad_looper( T * a, const T * b, const T * c, int ARRAY_SIZE)
-{
-    int offset = blockDim.x * blockIdx.x + threadIdx.x;
-    int stride = blockDim.x * gridDim.x;
-    const T scalar = 3.0;
-
-    for (int i=offset; i<ARRAY_SIZE; i+=stride) {
-        a[i] = b[i] + scalar * c[i];
-    }
-}
-
-
-template <typename T>
-__global__ void copy(const T * a, T * c)
-{
-    const int i = blockDim.x * blockIdx.x + threadIdx.x;
-    c[i] = a[i];
-}
-
-template <typename T>
-__global__ void mul(T * b, const T * c)
-{
-    const T scalar = 3.0;
-    const int i = blockDim.x * blockIdx.x + threadIdx.x;
-    b[i] = scalar * c[i];
-}
-
-template <typename T>
-__global__ void add(const T * a, const T * b, T * c)
-{
-    const int i = blockDim.x * blockIdx.x + threadIdx.x;
-    c[i] = a[i] + b[i];
-}
-
-template <typename T>
-__global__ void triad(T * a, const T * b, const T * c)
-{
-    const T scalar = 3.0;
-    const int i = blockDim.x * blockIdx.x + threadIdx.x;
-    a[i] = b[i] + scalar * c[i];
-}
-
-int main(int argc, char *argv[])
-{
-
-    // Print out run information
-    std::cout
-        << "GPU-STREAM" << std::endl
-        << "Version: " << VERSION_STRING << std::endl
-        << "Implementation: CUDA" << std::endl;
-
-    parseArguments(argc, argv);
-
-    if (NTIMES < 2)
-        throw std::runtime_error("Chosen number of times is invalid, must be >= 2");
-
-    // Config grid size and group size for kernel launching
-    int gridSize;
-    if (groups) {
-        gridSize = groups * groupSize;
-    } else  {
-        gridSize = ARRAY_SIZE;
-    }
-
-    float operationsPerWorkitem = (float)ARRAY_SIZE / (float)gridSize;
-    std::cout << "GridSize: " << gridSize << " work-items" << std::endl;
-    std::cout << "GroupSize: " << groupSize << " work-items" << std::endl;
-    std::cout << "Operations/Work-item: " << operationsPerWorkitem << std::endl;
-    if (groups) std::cout << "Using looper kernels:" << std::endl;
-
-    std::cout << "Precision: ";
-    if (useFloat) std::cout << "float";
-    else std::cout << "double";
-    std::cout << std::endl << std::endl;
-
-    std::cout << "Running kernels " << NTIMES << " times" << std::endl;
-
-    if (ARRAY_SIZE % 1024 != 0)
-    {
-        unsigned int OLD_ARRAY_SIZE = ARRAY_SIZE;
-        ARRAY_SIZE -= ARRAY_SIZE % 1024;
-        std::cout
-            << "Warning: array size must divide 1024" << std::endl
-            << "Resizing array from " << OLD_ARRAY_SIZE
-            << " to " << ARRAY_SIZE << std::endl;
-        if (ARRAY_SIZE == 0)
-            throw std::runtime_error("Array size must be >= 1024");
-    }
-
-    // Get precision (used to reset later)
-    std::streamsize ss = std::cout.precision();
-
-    size_t DATATYPE_SIZE;
-
-    if (useFloat)
-    {
-        DATATYPE_SIZE = sizeof(float);
-    }
-    else
-    {
-        DATATYPE_SIZE = sizeof(double);
-    }
-
-    // Display number of bytes in array
-    std::cout << std::setprecision(1) << std::fixed
-        << "Array size: " << ARRAY_SIZE*DATATYPE_SIZE/1024.0/1024.0 << " MB"
-        << " (=" << ARRAY_SIZE*DATATYPE_SIZE/1024.0/1024.0/1024.0 << " GB)"
-        << std::endl;
-    std::cout << "Total size: " << 3.0*ARRAY_SIZE*DATATYPE_SIZE/1024.0/1024.0 << " MB"
-        << " (=" << 3.0*ARRAY_SIZE*DATATYPE_SIZE/1024.0/1024.0/1024.0 << " GB)"
-        << std::endl;
-
-    // Reset precision
-    std::cout.precision(ss);
-
-    // Check device index is in range
-    int count;
-    cudaGetDeviceCount(&count);
-    check_cuda_error();
-    if (deviceIndex >= count)
-        throw std::runtime_error("Chosen device index is invalid");
-    cudaSetDevice(deviceIndex);
-    check_cuda_error();
-
-    // Print out device name
-    std::cout << "Using CUDA device " << getDeviceName(deviceIndex) << std::endl;
-
-    // Print out device CUDA driver version
-    std::cout << "Driver: " << getDriver() << std::endl;
-
-    // Check buffers fit on the device
-    cudaDeviceProp props;
-    cudaGetDeviceProperties(&props, deviceIndex);
-    if (props.totalGlobalMem < 3*DATATYPE_SIZE*ARRAY_SIZE)
-        throw std::runtime_error("Device does not have enough memory for all 3 buffers");
-
-    // Create host vectors
-    void * h_a = malloc(ARRAY_SIZE*DATATYPE_SIZE);
-    void * h_b = malloc(ARRAY_SIZE*DATATYPE_SIZE);
-    void * h_c = malloc(ARRAY_SIZE*DATATYPE_SIZE);
-
-    // Initilise arrays
-    for (unsigned int i = 0; i < ARRAY_SIZE; i++)
-    {
-        if (useFloat)
-        {
-            ((float*)h_a)[i] = 1.0f;
-            ((float*)h_b)[i] = 2.0f;
-            ((float*)h_c)[i] = 0.0f;
-        }
-        else
-        {
-            ((double*)h_a)[i] = 1.0;
-            ((double*)h_b)[i] = 2.0;
-            ((double*)h_c)[i] = 0.0;
-        }
-    }
-
-    // Create device buffers
-    void * d_a, * d_b, *d_c;
-    cudaMalloc(&d_a, ARRAY_SIZE*DATATYPE_SIZE);
-    check_cuda_error();
-    cudaMalloc(&d_b, ARRAY_SIZE*DATATYPE_SIZE);
-    check_cuda_error();
-    cudaMalloc(&d_c, ARRAY_SIZE*DATATYPE_SIZE);
-    check_cuda_error();
-
-    // Copy host memory to device
-    cudaMemcpy(d_a, h_a, ARRAY_SIZE*DATATYPE_SIZE, cudaMemcpyHostToDevice);
-    check_cuda_error();
-    cudaMemcpy(d_b, h_b, ARRAY_SIZE*DATATYPE_SIZE, cudaMemcpyHostToDevice);
-    check_cuda_error();
-    cudaMemcpy(d_c, h_c, ARRAY_SIZE*DATATYPE_SIZE, cudaMemcpyHostToDevice);
-    check_cuda_error();
-
-    std::cout << "d_a=" << (void*)d_a << std::endl;
-	std::cout << "d_b=" << (void*)d_b << std::endl;
-	std::cout << "d_c=" << (void*)d_c << std::endl;
-    
-    // Make sure the copies are finished
-    cudaDeviceSynchronize();
-    check_cuda_error();
-
-    // List of times
-    std::vector< std::vector<double> > timings;
-
-    // Declare timers
-    std::chrono::high_resolution_clock::time_point t1, t2;
-
-    // Main loop
-    for (unsigned int k = 0; k < NTIMES; k++)
-    {
-        std::vector<double> times;
-        t1 = std::chrono::high_resolution_clock::now();
-        if (groups) {
-            if (useFloat)
-                copy_looper<float><<<gridSize,groupSize>>>((float*)d_a, (float*)d_c, ARRAY_SIZE);
-            else
-                copy_looper<double><<<gridSize,groupSize>>>((double*)d_a, (double*)d_c, ARRAY_SIZE);
-        } else {
-            if (useFloat)
-                copy<<<ARRAY_SIZE/1024, 1024>>>((float*)d_a, (float*)d_c);
-            else
-                copy<<<ARRAY_SIZE/1024, 1024>>>((double*)d_a, (double*)d_c);
-        }
-        
-        check_cuda_error();
-        cudaDeviceSynchronize();
-        check_cuda_error();
-        t2 = std::chrono::high_resolution_clock::now();
-        times.push_back(std::chrono::duration_cast<std::chrono::duration<double> >(t2 - t1).count());
-
-
-        t1 = std::chrono::high_resolution_clock::now();
-        if (groups) {
-            if (useFloat)
-                mul_looper<float><<<gridSize,groupSize>>>((float*)d_b, (float*)d_c, ARRAY_SIZE);
-            else
-                mul_looper<double><<<gridSize,groupSize>>>((double*)d_b, (double*)d_c, ARRAY_SIZE);
-        } else {
-            if (useFloat)
-                mul<<<ARRAY_SIZE/1024, 1024>>>((float*)d_b, (float*)d_c);
-            else
-                mul<<<ARRAY_SIZE/1024, 1024>>>((double*)d_b, (double*)d_c);
-        }
-        check_cuda_error();
-        cudaDeviceSynchronize();
-        check_cuda_error();
-        t2 = std::chrono::high_resolution_clock::now();
-        times.push_back(std::chrono::duration_cast<std::chrono::duration<double> >(t2 - t1).count());
-
-
-        t1 = std::chrono::high_resolution_clock::now();
-        if (groups) {
-            if (useFloat)
-                add_looper<float><<<gridSize,groupSize>>>((float*)d_a, (float*)d_b, (float*)d_c, ARRAY_SIZE);
-            else
-                add_looper<double><<<gridSize,groupSize>>>((double*)d_a, (double*)d_b, (double*)d_c, ARRAY_SIZE);
-        } else {
-            if (useFloat)
-                add<<<ARRAY_SIZE/1024, 1024>>>((float*)d_a, (float*)d_b, (float*)d_c);
-            else
-                add<<<ARRAY_SIZE/1024, 1024>>>((double*)d_a, (double*)d_b, (double*)d_c);
-        }
-        check_cuda_error();
-        cudaDeviceSynchronize();
-        check_cuda_error();
-        t2 = std::chrono::high_resolution_clock::now();
-        times.push_back(std::chrono::duration_cast<std::chrono::duration<double> >(t2 - t1).count());
-
-
-        t1 = std::chrono::high_resolution_clock::now();
-        if (groups) {
-            if (useFloat)
-                triad_looper<float><<<gridSize,groupSize>>>((float*)d_a, (float*)d_b, (float*)d_c, ARRAY_SIZE);
-            else
-                triad_looper<double><<<gridSize,groupSize>>>((double*)d_a, (double*)d_b, (double*)d_c, ARRAY_SIZE);
-        } else {
-            if (useFloat)
-                triad<<<ARRAY_SIZE/1024, 1024>>>((float*)d_a, (float*)d_b, (float*)d_c);
-            else
-                triad<<<ARRAY_SIZE/1024, 1024>>>((double*)d_a, (double*)d_b, (double*)d_c);
-        }
-        check_cuda_error();
-        cudaDeviceSynchronize();
-        check_cuda_error();
-        t2 = std::chrono::high_resolution_clock::now();
-        times.push_back(std::chrono::duration_cast<std::chrono::duration<double> >(t2 - t1).count());
-
-        timings.push_back(times);
-
-    }
-
-    // Check solutions
-    cudaMemcpy(h_a, d_a, ARRAY_SIZE*DATATYPE_SIZE, cudaMemcpyDeviceToHost);
-    check_cuda_error();
-    cudaMemcpy(h_b, d_b, ARRAY_SIZE*DATATYPE_SIZE, cudaMemcpyDeviceToHost);
-    check_cuda_error();
-    cudaMemcpy(h_c, d_c, ARRAY_SIZE*DATATYPE_SIZE, cudaMemcpyDeviceToHost);
-    check_cuda_error();
-
-    if (useFloat)
-    {
-        check_solution<float>(h_a, h_b, h_c);
-    }
-    else
-    {
-        check_solution<double>(h_a, h_b, h_c);
-    }
-
-    // Crunch results
-    size_t sizes[4] = {
-        2 * DATATYPE_SIZE * ARRAY_SIZE,
-        2 * DATATYPE_SIZE * ARRAY_SIZE,
-        3 * DATATYPE_SIZE * ARRAY_SIZE,
-        3 * DATATYPE_SIZE * ARRAY_SIZE
-    };
-    double min[4] = {DBL_MAX, DBL_MAX, DBL_MAX, DBL_MAX};
-    double max[4] = {0.0, 0.0, 0.0, 0.0};
-    double avg[4] = {0.0, 0.0, 0.0, 0.0};
-
-    // Ignore first result
-    for (unsigned int i = 1; i < NTIMES; i++)
-    {
-        for (int j = 0; j < 4; j++)
-        {
-            avg[j] += timings[i][j];
-            min[j] = std::min(min[j], timings[i][j]);
-            max[j] = std::max(max[j], timings[i][j]);
-        }
-    }
-
-    for (int j = 0; j < 4; j++)
-        avg[j] /= (double)(NTIMES-1);
-
-    double geomean = 1.0;
-    for (int j = 0; j < 4; j++) {
-        geomean *= (sizes[j]/min[j]);
-    }
-    geomean = pow(geomean, 0.25);
-    
-    // Display results
-    std::string labels[] = {"Copy", "Mul", "Add", "Triad"};
-    std::cout
-        << std::left << std::setw(12) << "Function"
-        << std::left << std::setw(12) << "MBytes/sec"
-        << std::left << std::setw(12) << "Min (sec)"
-        << std::left << std::setw(12) << "Max"
-        << std::left << std::setw(12) << "Average"
-        << std::endl;
-
-    for (int j = 0; j < 4; j++)
-    {
-        std::cout
-            << std::left << std::setw(12) << labels[j]
-            << std::left << std::setw(12) << std::setprecision(3) << 1.0E-06 * sizes[j]/min[j]
-            << std::left << std::setw(12) << std::setprecision(5) << min[j]
-            << std::left << std::setw(12) << std::setprecision(5) << max[j]
-            << std::left << std::setw(12) << std::setprecision(5) << avg[j]
-            << std::endl;
-    }
-    std::cout
-        << std::left << std::setw(12) << "GEOMEAN"
-        << std::left << std::setw(12) << std::setprecision(3) << 1.0E-06 * geomean
-        << std::endl;
-
-    // Free host vectors
-    free(h_a);
-    free(h_b);
-    free(h_c);
-
-    // Free cuda buffers
-    cudaFree(d_a);
-    check_cuda_error();
-    cudaFree(d_b);
-    check_cuda_error();
-    cudaFree(d_c);
-    check_cuda_error();
-
-}
-
-std::string getDeviceName(int device)
-{
-    struct cudaDeviceProp prop;
-    cudaGetDeviceProperties(&prop, device);
-    check_cuda_error();
-    return std::string(prop.name);
-}
-
-int getDriver(void)
-{
-    int driver;
-    cudaDriverGetVersion(&driver);
-    check_cuda_error();
-    return driver;
-}
-
-void listDevices(void)
-{
-    // Get number of devices
-    int count;
-    cudaGetDeviceCount(&count);
-    check_cuda_error();
-
-    // Print device names
-    if (count == 0)
-    {
-        std::cout << "No devices found." << std::endl;
-    }
-    else
-    {
-        std::cout << std::endl;
-        std::cout << "Devices:" << std::endl;
-        for (int i = 0; i < count; i++)
-        {
-            std::cout << i << ": " << getDeviceName(i) << std::endl;
-            check_cuda_error();
-        }
-        std::cout << std::endl;
-    }
-}
-
--- a/hip-stream.cpp
+++ b/hip-stream.cpp
@ -1,531 +0,0 @@
-#include "hip_runtime.h"
-/*=============================================================================
-*------------------------------------------------------------------------------
-* Copyright 2015: Tom Deakin, Simon McIntosh-Smith, University of Bristol HPC
-* Based on John D. McCalpin’s original STREAM benchmark for CPUs
-*------------------------------------------------------------------------------
-* License:
-*  1. You are free to use this program and/or to redistribute
-*     this program.
-*  2. You are free to modify this program for your own use,
-*     including commercial use, subject to the publication
-*     restrictions in item 3.
-*  3. You are free to publish results obtained from running this
-*     program, or from works that you derive from this program,
-*     with the following limitations:
-*     3a. In order to be referred to as "GPU-STREAM benchmark results",
-*         published results must be in conformance to the GPU-STREAM
-*         Run Rules published at
-*         http://github.com/UoB-HPC/GPU-STREAM/wiki/Run-Rules
-*         and incorporated herein by reference.
-*         The copyright holders retain the
-*         right to determine conformity with the Run Rules.
-*     3b. Results based on modified source code or on runs not in
-*         accordance with the GPU-STREAM Run Rules must be clearly
-*         labelled whenever they are published.  Examples of
-*         proper labelling include:
-*         "tuned GPU-STREAM benchmark results"
-*         "based on a variant of the GPU-STREAM benchmark code"
-*         Other comparable, clear and reasonable labelling is
-*         acceptable.
-*     3c. Submission of results to the GPU-STREAM benchmark web site
-*         is encouraged, but not required.
-*  4. Use of this program or creation of derived works based on this
-*     program constitutes acceptance of these licensing restrictions.
-*  5. Absolutely no warranty is expressed or implied.
-*———————————————————————————————————-----------------------------------------*/
-
-
-#include <iostream>
-#include <fstream>
-#include <vector>
-#include <chrono>
-#include <cfloat>
-#include <cmath>
-
-//#include <cuda.h>
-#include "common.h"
-
-std::string getDeviceName(int device);
-int getDriver(void);
-
-// Code to check CUDA errors
-void check_cuda_error(void)
-{
-    hipError_t err = hipGetLastError();
-    if (err != hipSuccess)
-    {
-        std::cerr
-            << "Error: "
-            << hipGetErrorString(err)
-            << std::endl;
-            exit(err);
-    }
-}
-
-
-
-// looper function place more work inside each work item.
-// Goal is reduce the dispatch overhead for each group, and also give more controlover the order of memory operations
-template <typename T>
-__global__ void
-copy_looper(hipLaunchParm lp,  const T * a, T * c, int ARRAY_SIZE)
-{
-    int offset = (hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x);
-    int stride = hipBlockDim_x * hipGridDim_x;
-
-    for (int i=offset; i<ARRAY_SIZE; i+=stride) {
-        c[i] = a[i];
-    }
-}
-
-template <typename T>
-__global__ void
-mul_looper(hipLaunchParm lp,  T * b, const T * c, int ARRAY_SIZE)
-{
-    int offset = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;
-    int stride = hipBlockDim_x * hipGridDim_x;
-    const T scalar = 3.0;
-
-    for (int i=offset; i<ARRAY_SIZE; i+=stride) {
-        b[i] = scalar * c[i];
-    }
-}
-
-template <typename T>
-__global__ void
-add_looper(hipLaunchParm lp,  const T * a, const T * b, T * c, int ARRAY_SIZE)
-{
-    int offset = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;
-    int stride = hipBlockDim_x * hipGridDim_x;
-
-    for (int i=offset; i<ARRAY_SIZE; i+=stride) {
-        c[i] = a[i] + b[i];
-    }
-}
-
-template <typename T>
-__global__ void
-triad_looper(hipLaunchParm lp,  T * a, const T * b, const T * c, int ARRAY_SIZE)
-{
-    int offset = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;
-    int stride = hipBlockDim_x * hipGridDim_x;
-    const T scalar = 3.0;
-
-    for (int i=offset; i<ARRAY_SIZE; i+=stride) {
-        a[i] = b[i] + scalar * c[i];
-    }
-}
-
-
-
-
-template <typename T>
-__global__ void
-copy(hipLaunchParm lp,  const T * a, T * c)
-{
-    const int i = hipBlockDim_x * hipBlockIdx_x + hipThreadIdx_x;
-    c[i] = a[i];
-}
-
-
-template <typename T>
-__global__ void
-mul(hipLaunchParm lp,  T * b, const T * c)
-{
-    const T scalar = 3.0;
-    const int i = hipBlockDim_x * hipBlockIdx_x + hipThreadIdx_x;
-    b[i] = scalar * c[i];
-}
-
-template <typename T>
-__global__ void
-add(hipLaunchParm lp,  const T * a, const T * b, T * c)
-{
-    const int i = hipBlockDim_x * hipBlockIdx_x + hipThreadIdx_x;
-    c[i] = a[i] + b[i];
-}
-
-template <typename T>
-__global__ void
-triad(hipLaunchParm lp,  T * a, const T * b, const T * c)
-{
-    const T scalar = 3.0;
-    const int i = hipBlockDim_x * hipBlockIdx_x + hipThreadIdx_x;
-    a[i] = b[i] + scalar * c[i];
-}
-
-int main(int argc, char *argv[])
-{
-
-    // Print out run information
-    std::cout
-        << "GPU-STREAM" << std::endl
-        << "Version: " << VERSION_STRING << std::endl
-        << "Implementation: HIP" << std::endl;
-
-    parseArguments(argc, argv);
-
-    if (NTIMES < 2)
-        throw std::runtime_error("Chosen number of times is invalid, must be >= 2");
-
-    // Config grid size and group size for kernel launching
-    int gridSize;
-    if (groups) {
-        gridSize = groups * groupSize;
-    } else  {
-        gridSize = ARRAY_SIZE;
-    }
-
-    float operationsPerWorkitem = (float)ARRAY_SIZE / (float)gridSize;
-    std::cout << "GridSize: " << gridSize << " work-items" << std::endl;
-    std::cout << "GroupSize: " << groupSize << " work-items" << std::endl;
-    std::cout << "Operations/Work-item: " << operationsPerWorkitem << std::endl;
-    if (groups) std::cout << "Using looper kernels:" << std::endl;
-
-    std::cout << "Precision: ";
-    if (useFloat) std::cout << "float";
-    else std::cout << "double";
-    std::cout << std::endl << std::endl;
-
-    std::cout << "Running kernels " << NTIMES << " times" << std::endl;
-
-    if (ARRAY_SIZE % 1024 != 0)
-    {
-        unsigned int OLD_ARRAY_SIZE = ARRAY_SIZE;
-        ARRAY_SIZE -= ARRAY_SIZE % 1024;
-        std::cout
-            << "Warning: array size must divide 1024" << std::endl
-            << "Resizing array from " << OLD_ARRAY_SIZE
-            << " to " << ARRAY_SIZE << std::endl;
-        if (ARRAY_SIZE == 0)
-            throw std::runtime_error("Array size must be >= 1024");
-    }
-
-    // Get precision (used to reset later)
-    std::streamsize ss = std::cout.precision();
-
-    size_t DATATYPE_SIZE;
-
-    if (useFloat)
-    {
-        DATATYPE_SIZE = sizeof(float);
-    }
-    else
-    {
-        DATATYPE_SIZE = sizeof(double);
-    }
-
-    // Display number of bytes in array
-    std::cout << std::setprecision(1) << std::fixed
-        << "Array size: " << ARRAY_SIZE*DATATYPE_SIZE/1024.0/1024.0 << " MB"
-        << " (=" << ARRAY_SIZE*DATATYPE_SIZE/1024.0/1024.0/1024.0 << " GB)"
-		<< " " << ARRAY_PAD_BYTES << " bytes padding"
-        << std::endl;
-    std::cout << "Total size: " << 3.0*(ARRAY_SIZE*DATATYPE_SIZE + ARRAY_PAD_BYTES) /1024.0/1024.0 << " MB"
-        << " (=" << 3.0*(ARRAY_SIZE*DATATYPE_SIZE + ARRAY_PAD_BYTES) /1024.0/1024.0/1024.0 << " GB)"
-        << std::endl;
-
-    // Reset precision
-    std::cout.precision(ss);
-
-    // Check device index is in range
-    int count;
-    hipGetDeviceCount(&count);
-    check_cuda_error();
-    if (deviceIndex >= count)
-        throw std::runtime_error("Chosen device index is invalid");
-    hipSetDevice(deviceIndex);
-    check_cuda_error();
-
-
-    hipDeviceProp_t props;
-    hipGetDeviceProperties(&props, deviceIndex);
-
-    // Print out device name
-    std::cout << "Using HIP device " << getDeviceName(deviceIndex) <<  " (compute_units=" << props.multiProcessorCount << ")" << std::endl;
-
-    // Print out device HIP driver version
-    std::cout << "Driver: " << getDriver() << std::endl;
-
-
-
-
-    // Check buffers fit on the device
-    if (props.totalGlobalMem < 3*DATATYPE_SIZE*ARRAY_SIZE)
-        throw std::runtime_error("Device does not have enough memory for all 3 buffers");
-
-    //int cus = props.multiProcessorCount;
-
-    // Create host vectors
-    void * h_a = malloc(ARRAY_SIZE*DATATYPE_SIZE );
-    void * h_b = malloc(ARRAY_SIZE*DATATYPE_SIZE );
-    void * h_c = malloc(ARRAY_SIZE*DATATYPE_SIZE );
-
-    // Initialise arrays
-    for (unsigned int i = 0; i < ARRAY_SIZE; i++)
-    {
-        if (useFloat)
-        {
-            ((float*)h_a)[i] = 1.0f;
-            ((float*)h_b)[i] = 2.0f;
-            ((float*)h_c)[i] = 0.0f;
-        }
-        else
-        {
-            ((double*)h_a)[i] = 1.0;
-            ((double*)h_b)[i] = 2.0;
-            ((double*)h_c)[i] = 0.0;
-        }
-    }
-
-    // Create device buffers
-    char * d_a, * d_b, *d_c;
-    hipMalloc(&d_a, ARRAY_SIZE*DATATYPE_SIZE + ARRAY_PAD_BYTES);
-    check_cuda_error();
-    hipMalloc(&d_b, ARRAY_SIZE*DATATYPE_SIZE + ARRAY_PAD_BYTES);
-    d_b += ARRAY_PAD_BYTES;
-    check_cuda_error();
-    hipMalloc(&d_c, ARRAY_SIZE*DATATYPE_SIZE + ARRAY_PAD_BYTES);
-    d_c += ARRAY_PAD_BYTES;
-    check_cuda_error();
-
-    // Copy host memory to device
-    hipMemcpy(d_a, h_a, ARRAY_SIZE*DATATYPE_SIZE, hipMemcpyHostToDevice);
-    check_cuda_error();
-    hipMemcpy(d_b, h_b, ARRAY_SIZE*DATATYPE_SIZE, hipMemcpyHostToDevice);
-    check_cuda_error();
-    hipMemcpy(d_c, h_c, ARRAY_SIZE*DATATYPE_SIZE, hipMemcpyHostToDevice);
-    check_cuda_error();
-
-
-    std::cout << "d_a=" << (void*)d_a << std::endl;
-	std::cout << "d_b=" << (void*)d_b << std::endl;
-	std::cout << "d_c=" << (void*)d_c << std::endl;
-
-    // Make sure the copies are finished
-    hipDeviceSynchronize();
-    check_cuda_error();
-
-
-
-    // List of times
-    std::vector< std::vector<double> > timings;
-
-    // Declare timers
-    std::chrono::high_resolution_clock::time_point t1, t2;
-
-    // Main loop
-    for (unsigned int k = 0; k < NTIMES; k++)
-    {
-        std::vector<double> times;
-        t1 = std::chrono::high_resolution_clock::now();
-        if (groups) {
-            if (useFloat)
-                hipLaunchKernel(HIP_KERNEL_NAME(copy_looper<float>), dim3(gridSize), dim3(groupSize), 0, 0, (float*)d_a, (float*)d_c, ARRAY_SIZE);
-            else
-                hipLaunchKernel(HIP_KERNEL_NAME(copy_looper<double>), dim3(gridSize), dim3(groupSize), 0, 0, (double*)d_a, (double*)d_c, ARRAY_SIZE);
-        } else {
-            if (useFloat)
-                hipLaunchKernel(HIP_KERNEL_NAME(copy), dim3(ARRAY_SIZE/groupSize), dim3(groupSize), 0, 0, (float*)d_a, (float*)d_c);
-            else
-                hipLaunchKernel(HIP_KERNEL_NAME(copy), dim3(ARRAY_SIZE/groupSize), dim3(groupSize), 0, 0, (double*)d_a, (double*)d_c);
-        }
-        check_cuda_error();
-        hipDeviceSynchronize();
-        check_cuda_error();
-        t2 = std::chrono::high_resolution_clock::now();
-        times.push_back(std::chrono::duration_cast<std::chrono::duration<double> >(t2 - t1).count());
-
-
-        t1 = std::chrono::high_resolution_clock::now();
-        if (groups) {
-            if (useFloat)
-                hipLaunchKernel(HIP_KERNEL_NAME(mul_looper), dim3(gridSize), dim3(groupSize), 0, 0, (float*)d_b, (float*)d_c, ARRAY_SIZE);
-            else
-                hipLaunchKernel(HIP_KERNEL_NAME(mul_looper), dim3(gridSize), dim3(groupSize), 0, 0, (double*)d_b, (double*)d_c, ARRAY_SIZE);
-        } else {
-            if (useFloat)
-                hipLaunchKernel(HIP_KERNEL_NAME(mul), dim3(ARRAY_SIZE/groupSize), dim3(groupSize), 0, 0, (float*)d_b, (float*)d_c);
-            else
-                hipLaunchKernel(HIP_KERNEL_NAME(mul), dim3(ARRAY_SIZE/groupSize), dim3(groupSize), 0, 0, (double*)d_b, (double*)d_c);
-        }
-        check_cuda_error();
-        hipDeviceSynchronize();
-        check_cuda_error();
-        t2 = std::chrono::high_resolution_clock::now();
-        times.push_back(std::chrono::duration_cast<std::chrono::duration<double> >(t2 - t1).count());
-
-
-        t1 = std::chrono::high_resolution_clock::now();
-        if (groups) {
-            if (useFloat)
-                hipLaunchKernel(HIP_KERNEL_NAME(add_looper), dim3(gridSize), dim3(groupSize), 0, 0, (float*)d_a, (float*)d_b, (float*)d_c, ARRAY_SIZE);
-            else
-                hipLaunchKernel(HIP_KERNEL_NAME(add_looper), dim3(gridSize), dim3(groupSize), 0, 0, (double*)d_a, (double*)d_b, (double*)d_c, ARRAY_SIZE);
-        } else {
-            if (useFloat)
-                hipLaunchKernel(HIP_KERNEL_NAME(add), dim3(ARRAY_SIZE/groupSize), dim3(groupSize), 0, 0, (float*)d_a, (float*)d_b, (float*)d_c);
-            else
-                hipLaunchKernel(HIP_KERNEL_NAME(add), dim3(ARRAY_SIZE/groupSize), dim3(groupSize), 0, 0, (double*)d_a, (double*)d_b, (double*)d_c);
-        }
-        check_cuda_error();
-        hipDeviceSynchronize();
-        check_cuda_error();
-        t2 = std::chrono::high_resolution_clock::now();
-        times.push_back(std::chrono::duration_cast<std::chrono::duration<double> >(t2 - t1).count());
-
-
-        t1 = std::chrono::high_resolution_clock::now();
-        if (groups) {
-            if (useFloat)
-                hipLaunchKernel(HIP_KERNEL_NAME(triad_looper), dim3(gridSize), dim3(groupSize), 0, 0, (float*)d_a, (float*)d_b, (float*)d_c, ARRAY_SIZE);
-            else
-                hipLaunchKernel(HIP_KERNEL_NAME(triad_looper), dim3(gridSize), dim3(groupSize), 0, 0, (double*)d_a, (double*)d_b, (double*)d_c, ARRAY_SIZE);
-        } else {
-            if (useFloat)
-                hipLaunchKernel(HIP_KERNEL_NAME(triad), dim3(ARRAY_SIZE/groupSize), dim3(groupSize), 0, 0, (float*)d_a, (float*)d_b, (float*)d_c);
-            else
-                hipLaunchKernel(HIP_KERNEL_NAME(triad), dim3(ARRAY_SIZE/groupSize), dim3(groupSize), 0, 0, (double*)d_a, (double*)d_b, (double*)d_c);
-        }
-
-        check_cuda_error();
-        hipDeviceSynchronize();
-        check_cuda_error();
-        t2 = std::chrono::high_resolution_clock::now();
-        times.push_back(std::chrono::duration_cast<std::chrono::duration<double> >(t2 - t1).count());
-
-        timings.push_back(times);
-
-    }
-
-    // Check solutions
-    hipMemcpy(h_a, d_a, ARRAY_SIZE*DATATYPE_SIZE, hipMemcpyDeviceToHost);
-    check_cuda_error();
-    hipMemcpy(h_b, d_b, ARRAY_SIZE*DATATYPE_SIZE, hipMemcpyDeviceToHost);
-    check_cuda_error();
-    hipMemcpy(h_c, d_c, ARRAY_SIZE*DATATYPE_SIZE, hipMemcpyDeviceToHost);
-    check_cuda_error();
-
-    if (useFloat)
-    {
-        check_solution<float>(h_a, h_b, h_c);
-    }
-    else
-    {
-        check_solution<double>(h_a, h_b, h_c);
-    }
-
-    // Crunch results
-    size_t sizes[4] = {
-        2 * DATATYPE_SIZE * ARRAY_SIZE,
-        2 * DATATYPE_SIZE * ARRAY_SIZE,
-        3 * DATATYPE_SIZE * ARRAY_SIZE,
-        3 * DATATYPE_SIZE * ARRAY_SIZE
-    };
-    double min[4] = {DBL_MAX, DBL_MAX, DBL_MAX, DBL_MAX};
-    double max[4] = {0.0, 0.0, 0.0, 0.0};
-    double avg[4] = {0.0, 0.0, 0.0, 0.0};
-
-    // Ignore first result
-    for (unsigned int i = 1; i < NTIMES; i++)
-    {
-        for (int j = 0; j < 4; j++)
-        {
-            avg[j] += timings[i][j];
-            min[j] = std::min(min[j], timings[i][j]);
-            max[j] = std::max(max[j], timings[i][j]);
-        }
-    }
-
-    for (int j = 0; j < 4; j++) {
-        avg[j] /= (double)(NTIMES-1);
-    }
-
-    double geomean = 1.0;
-    for (int j = 0; j < 4; j++) {
-        geomean *= (sizes[j]/min[j]);
-    }
-    geomean = pow(geomean, 0.25);
-
-    // Display results
-    std::string labels[] = {"Copy", "Mul", "Add", "Triad"};
-    std::cout
-        << std::left << std::setw(12) << "Function"
-        << std::left << std::setw(12) << "MBytes/sec"
-        << std::left << std::setw(12) << "Min (sec)"
-        << std::left << std::setw(12) << "Max"
-        << std::left << std::setw(12) << "Average"
-        << std::endl;
-
-    for (int j = 0; j < 4; j++)
-    {
-        std::cout
-            << std::left << std::setw(12) << labels[j]
-            << std::left << std::setw(12) << std::setprecision(3) << 1.0E-06 * sizes[j]/min[j]
-            << std::left << std::setw(12) << std::setprecision(5) << min[j]
-            << std::left << std::setw(12) << std::setprecision(5) << max[j]
-            << std::left << std::setw(12) << std::setprecision(5) << avg[j]
-            << std::endl;
-    }
-    std::cout
-        << std::left << std::setw(12) << "GEOMEAN"
-        << std::left << std::setw(12) << std::setprecision(3) << 1.0E-06 * geomean
-        << std::endl;
-
-    // Free host vectors
-    free(h_a);
-    free(h_b);
-    free(h_c);
-
-    // Free cuda buffers
-    hipFree(d_a);
-    check_cuda_error();
-    hipFree(d_b);
-    check_cuda_error();
-    hipFree(d_c);
-    check_cuda_error();
-
-}
-
-std::string getDeviceName(int device)
-{
-    struct hipDeviceProp_t prop;
-    hipGetDeviceProperties(&prop, device);
-    check_cuda_error();
-    return std::string(prop.name);
-}
-
-int getDriver(void)
-{
-    int driver;
-    hipDriverGetVersion(&driver);
-    check_cuda_error();
-    return driver;
-}
-
-void listDevices(void)
-{
-    // Get number of devices
-    int count;
-    hipGetDeviceCount(&count);
-    check_cuda_error();
-
-    // Print device names
-    if (count == 0)
-    {
-        std::cout << "No devices found." << std::endl;
-    }
-    else
-    {
-        std::cout << std::endl;
-        std::cout << "Devices:" << std::endl;
-        for (int i = 0; i < count; i++)
-        {
-            std::cout << i << ": " << getDeviceName(i) << std::endl;
-            check_cuda_error();
-        }
-        std::cout << std::endl;
-    }
-}
-
--- a/main.cpp
+++ b/main.cpp
@ -0,0 +1,332 @@
+
+// Copyright (c) 2015-16 Tom Deakin, Simon McIntosh-Smith,
+// University of Bristol HPC
+//
+// For full license terms please see the LICENSE file distributed with this
+// source code
+
+#include <iostream>
+#include <vector>
+#include <numeric>
+#include <cmath>
+#include <limits>
+#include <chrono>
+#include <algorithm>
+#include <iomanip>
+#include <cstring>
+
+#include "common.h"
+#include "Stream.h"
+
+#if defined(CUDA)
+#include "CUDAStream.h"
+#elif defined(OCL)
+#include "OCLStream.h"
+#elif defined(USE_RAJA)
+#include "RAJAStream.hpp"
+#elif defined(KOKKOS)
+#include "KOKKOSStream.hpp"
+#elif defined(ACC)
+#include "ACCStream.h"
+#elif defined(SYCL)
+#include "SYCLStream.h"
+#elif defined(OMP3)
+#include "OMP3Stream.h"
+#elif defined(OMP45)
+#include "OMP45Stream.h"
+#endif
+
+// Default size of 2^25
+unsigned int ARRAY_SIZE = 33554432;
+unsigned int num_times = 100;
+unsigned int deviceIndex = 0;
+bool use_float = false;
+
+
+template <typename T>
+void check_solution(const unsigned int ntimes, std::vector<T>& a, std::vector<T>& b, std::vector<T>& c);
+
+template <typename T>
+void run();
+
+void parseArguments(int argc, char *argv[]);
+
+int main(int argc, char *argv[])
+{
+  std::cout
+    << "GPU-STREAM" << std::endl
+    << "Version: " << VERSION_STRING << std::endl
+    << "Implementation: " << IMPLEMENTATION_STRING << std::endl;
+
+  parseArguments(argc, argv);
+
+  // TODO: Fix SYCL to allow multiple template specializations
+#ifndef SYCL
+#ifndef KOKKOS
+  if (use_float)
+    run<float>();
+  else
+#endif
+#endif
+    run<double>();
+
+}
+
+template <typename T>
+void run()
+{
+  std::cout << "Running kernels " << num_times << " times" << std::endl;
+
+  if (sizeof(T) == sizeof(float))
+    std::cout << "Precision: float" << std::endl;
+  else
+    std::cout << "Precision: double" << std::endl;
+
+  // Create host vectors
+  std::vector<T> a(ARRAY_SIZE, 0.1);
+  std::vector<T> b(ARRAY_SIZE, 0.2);
+  std::vector<T> c(ARRAY_SIZE, 0.0);
+  std::streamsize ss = std::cout.precision();
+  std::cout << std::setprecision(1) << std::fixed
+    << "Array size: " << ARRAY_SIZE*sizeof(T)*1.0E-6 << " MB"
+    << " (=" << ARRAY_SIZE*sizeof(T)*1.0E-9 << " GB)" << std::endl;
+  std::cout << "Total size: " << 3.0*ARRAY_SIZE*sizeof(T)*1.0E-6 << " MB"
+    << " (=" << 3.0*ARRAY_SIZE*sizeof(T)*1.0E-9 << " GB)" << std::endl;
+  std::cout.precision(ss);
+
+  Stream<T> *stream;
+
+#if defined(CUDA)
+  // Use the CUDA implementation
+  stream = new CUDAStream<T>(ARRAY_SIZE, deviceIndex);
+
+#elif defined(OCL)
+  // Use the OpenCL implementation
+  stream = new OCLStream<T>(ARRAY_SIZE, deviceIndex);
+
+#elif defined(USE_RAJA)
+  // Use the RAJA implementation
+  stream = new RAJAStream<T>(ARRAY_SIZE, deviceIndex);
+
+#elif defined(KOKKOS)
+  // Use the Kokkos implementation
+  stream = new KOKKOSStream<T>(ARRAY_SIZE, deviceIndex);
+
+#elif defined(ACC)
+  // Use the OpenACC implementation
+  stream = new ACCStream<T>(ARRAY_SIZE, a.data(), b.data(), c.data(), deviceIndex);
+
+#elif defined(SYCL)
+  // Use the SYCL implementation
+  stream = new SYCLStream<T>(ARRAY_SIZE, deviceIndex);
+
+#elif defined(OMP3)
+  // Use the "reference" OpenMP 3 implementation
+  stream = new OMP3Stream<T>(ARRAY_SIZE, a.data(), b.data(), c.data());
+
+#elif defined(OMP45)
+  // Use the "reference" OpenMP 3 implementation
+  stream = new OMP45Stream<T>(ARRAY_SIZE, a.data(), b.data(), c.data(), deviceIndex);
+
+#endif
+
+  stream->write_arrays(a, b, c);
+
+  // List of times
+  std::vector<std::vector<double>> timings(4);
+
+  // Declare timers
+  std::chrono::high_resolution_clock::time_point t1, t2;
+
+  // Main loop
+  for (unsigned int k = 0; k < num_times; k++)
+  {
+    // Execute Copy
+    t1 = std::chrono::high_resolution_clock::now();
+    stream->copy();
+    t2 = std::chrono::high_resolution_clock::now();
+    timings[0].push_back(std::chrono::duration_cast<std::chrono::duration<double> >(t2 - t1).count());
+
+    // Execute Mul
+    t1 = std::chrono::high_resolution_clock::now();
+    stream->mul();
+    t2 = std::chrono::high_resolution_clock::now();
+    timings[1].push_back(std::chrono::duration_cast<std::chrono::duration<double> >(t2 - t1).count());
+
+    // Execute Add
+    t1 = std::chrono::high_resolution_clock::now();
+    stream->add();
+    t2 = std::chrono::high_resolution_clock::now();
+    timings[2].push_back(std::chrono::duration_cast<std::chrono::duration<double> >(t2 - t1).count());
+
+    // Execute Triad
+    t1 = std::chrono::high_resolution_clock::now();
+    stream->triad();
+    t2 = std::chrono::high_resolution_clock::now();
+    timings[3].push_back(std::chrono::duration_cast<std::chrono::duration<double> >(t2 - t1).count());
+
+  }
+
+  // Check solutions
+  stream->read_arrays(a, b, c);
+  check_solution<T>(num_times, a, b, c);
+
+  // Display timing results
+  std::cout
+    << std::left << std::setw(12) << "Function"
+    << std::left << std::setw(12) << "MBytes/sec"
+    << std::left << std::setw(12) << "Min (sec)"
+    << std::left << std::setw(12) << "Max"
+    << std::left << std::setw(12) << "Average" << std::endl;
+
+  std::cout << std::fixed;
+
+  std::string labels[4] = {"Copy", "Mul", "Add", "Triad"};
+  size_t sizes[4] = {
+    2 * sizeof(T) * ARRAY_SIZE,
+    2 * sizeof(T) * ARRAY_SIZE,
+    3 * sizeof(T) * ARRAY_SIZE,
+    3 * sizeof(T) * ARRAY_SIZE
+  };
+
+  for (int i = 0; i < 4; i++)
+  {
+    // Get min/max; ignore the first result
+    auto minmax = std::minmax_element(timings[i].begin()+1, timings[i].end());
+
+    // Calculate average; ignore the first result
+    double average = std::accumulate(timings[i].begin()+1, timings[i].end(), 0.0) / (double)(num_times - 1);
+
+    // Display results
+    std::cout
+      << std::left << std::setw(12) << labels[i]
+      << std::left << std::setw(12) << std::setprecision(3) << 1.0E-6 * sizes[i] / (*minmax.first)
+      << std::left << std::setw(12) << std::setprecision(5) << *minmax.first
+      << std::left << std::setw(12) << std::setprecision(5) << *minmax.second
+      << std::left << std::setw(12) << std::setprecision(5) << average
+      << std::endl;
+
+  }
+
+  delete stream;
+
+}
+
+template <typename T>
+void check_solution(const unsigned int ntimes, std::vector<T>& a, std::vector<T>& b, std::vector<T>& c)
+{
+  // Generate correct solution
+  T goldA = 0.1;
+  T goldB = 0.2;
+  T goldC = 0.0;
+
+  const T scalar = 0.3;
+
+  for (unsigned int i = 0; i < ntimes; i++)
+  {
+    // Do STREAM!
+    goldC = goldA;
+    goldB = scalar * goldC;
+    goldC = goldA + goldB;
+    goldA = goldB + scalar * goldC;
+  }
+
+  // Calculate the average error
+  double errA = std::accumulate(a.begin(), a.end(), 0.0, [&](double sum, const T val){ return sum + fabs(val - goldA); });
+  errA /= a.size();
+  double errB = std::accumulate(b.begin(), b.end(), 0.0, [&](double sum, const T val){ return sum + fabs(val - goldB); });
+  errB /= b.size();
+  double errC = std::accumulate(c.begin(), c.end(), 0.0, [&](double sum, const T val){ return sum + fabs(val - goldC); });
+  errC /= c.size();
+
+  double epsi = std::numeric_limits<T>::epsilon() * 100.0;
+
+  if (errA > epsi)
+    std::cerr
+      << "Validation failed on a[]. Average error " << errA
+      << std::endl;
+  if (errB > epsi)
+    std::cerr
+      << "Validation failed on b[]. Average error " << errB
+      << std::endl;
+  if (errC > epsi)
+    std::cerr
+      << "Validation failed on c[]. Average error " << errC
+      << std::endl;
+
+}
+
+int parseUInt(const char *str, unsigned int *output)
+{
+  char *next;
+  *output = strtoul(str, &next, 10);
+  return !strlen(next);
+}
+
+void parseArguments(int argc, char *argv[])
+{
+  for (int i = 1; i < argc; i++)
+  {
+    if (!std::string("--list").compare(argv[i]))
+    {
+      listDevices();
+      exit(EXIT_SUCCESS);
+    }
+    else if (!std::string("--device").compare(argv[i]))
+    {
+      if (++i >= argc || !parseUInt(argv[i], &deviceIndex))
+      {
+        std::cerr << "Invalid device index." << std::endl;
+        exit(EXIT_FAILURE);
+      }
+    }
+    else if (!std::string("--arraysize").compare(argv[i]) ||
+             !std::string("-s").compare(argv[i]))
+    {
+      if (++i >= argc || !parseUInt(argv[i], &ARRAY_SIZE))
+      {
+        std::cerr << "Invalid array size." << std::endl;
+        exit(EXIT_FAILURE);
+      }
+    }
+    else if (!std::string("--numtimes").compare(argv[i]) ||
+             !std::string("-n").compare(argv[i]))
+    {
+      if (++i >= argc || !parseUInt(argv[i], &num_times))
+      {
+        std::cerr << "Invalid number of times." << std::endl;
+        exit(EXIT_FAILURE);
+      }
+      if (num_times < 2)
+      {
+        std::cerr << "Number of times must be 2 or more" << std::endl;
+        exit(EXIT_FAILURE);
+      }
+    }
+    else if (!std::string("--float").compare(argv[i]))
+    {
+      use_float = true;
+    }
+    else if (!std::string("--help").compare(argv[i]) ||
+             !std::string("-h").compare(argv[i]))
+    {
+      std::cout << std::endl;
+      std::cout << "Usage: " << argv[0] << " [OPTIONS]" << std::endl << std::endl;
+      std::cout << "Options:" << std::endl;
+      std::cout << "  -h  --help               Print the message" << std::endl;
+      std::cout << "      --list               List available devices" << std::endl;
+      std::cout << "      --device     INDEX   Select device at INDEX" << std::endl;
+      std::cout << "  -s  --arraysize  SIZE    Use SIZE elements in the array" << std::endl;
+      std::cout << "  -n  --numtimes   NUM     Run the test NUM times (NUM >= 2)" << std::endl;
+      std::cout << "      --float              Use floats (rather than doubles)" << std::endl;
+      std::cout << std::endl;
+      exit(EXIT_SUCCESS);
+    }
+    else
+    {
+      std::cerr << "Unrecognized argument '" << argv[i] << "' (try '--help')"
+                << std::endl;
+      exit(EXIT_FAILURE);
+    }
+  }
+}
--- a/ocl-stream-kernels.cl
+++ b/ocl-stream-kernels.cl
@ -1,70 +0,0 @@
-/*=============================================================================
-*------------------------------------------------------------------------------
-* Copyright 2015: Tom Deakin, Simon McIntosh-Smith, University of Bristol HPC
-* Based on John D. McCalpin’s original STREAM benchmark for CPUs
-*------------------------------------------------------------------------------
-* License:
-*  1. You are free to use this program and/or to redistribute
-*     this program.
-*  2. You are free to modify this program for your own use,
-*     including commercial use, subject to the publication
-*     restrictions in item 3.
-*  3. You are free to publish results obtained from running this
-*     program, or from works that you derive from this program,
-*     with the following limitations:
-*     3a. In order to be referred to as "GPU-STREAM benchmark results",
-*         published results must be in conformance to the GPU-STREAM
-*         Run Rules published at
-*         http://github.com/UoB-HPC/GPU-STREAM/wiki/Run-Rules
-*         and incorporated herein by reference.
-*         The copyright holders retain the
-*         right to determine conformity with the Run Rules.
-*     3b. Results based on modified source code or on runs not in
-*         accordance with the GPU-STREAM Run Rules must be clearly
-*         labelled whenever they are published.  Examples of
-*         proper labelling include:
-*         "tuned GPU-STREAM benchmark results"
-*         "based on a variant of the GPU-STREAM benchmark code"
-*         Other comparable, clear and reasonable labelling is
-*         acceptable.
-*     3c. Submission of results to the GPU-STREAM benchmark web site
-*         is encouraged, but not required.
-*  4. Use of this program or creation of derived works based on this
-*     program constitutes acceptance of these licensing restrictions.
-*  5. Absolutely no warranty is expressed or implied.
-*———————————————————————————————————-----------------------------------------*/
-
-
-#ifdef FLOAT
-	#define DATATYPE float
-	constant DATATYPE scalar = 3.0f;
-#else
-	#pragma OPENCL EXTENSION cl_khr_fp64 : enable
-	#define DATATYPE double
-	constant DATATYPE scalar = 3.0;
-#endif
-
-
-kernel void copy(global const DATATYPE * restrict a, global DATATYPE * restrict c)
-{
-	const size_t i = get_global_id(0);
-	c[i] = a[i];
-}
-
-kernel void mul(global DATATYPE * restrict b, global const DATATYPE * restrict c)
-{
-	const size_t i = get_global_id(0);
-	b[i] = scalar * c[i];
-}
-
-kernel void add(global const DATATYPE * restrict a, global const DATATYPE * restrict b, global DATATYPE * restrict c)
-{
-	const size_t i = get_global_id(0);
-	c[i] = a[i] + b[i];
-}
-
-kernel void triad(global DATATYPE * restrict a, global const DATATYPE * restrict b, global const DATATYPE * restrict c)
-{
-	const size_t i = get_global_id(0);
-	a[i] = b[i] + scalar * c[i];
-}
--- a/ocl-stream.cpp
+++ b/ocl-stream.cpp
@ -1,488 +0,0 @@
-/*=============================================================================
-*------------------------------------------------------------------------------
-* Copyright 2015: Tom Deakin, Simon McIntosh-Smith, University of Bristol HPC
-* Based on John D. McCalpin’s original STREAM benchmark for CPUs
-*------------------------------------------------------------------------------
-* License:
-*  1. You are free to use this program and/or to redistribute
-*     this program.
-*  2. You are free to modify this program for your own use,
-*     including commercial use, subject to the publication
-*     restrictions in item 3.
-*  3. You are free to publish results obtained from running this
-*     program, or from works that you derive from this program,
-*     with the following limitations:
-*     3a. In order to be referred to as "GPU-STREAM benchmark results",
-*         published results must be in conformance to the GPU-STREAM
-*         Run Rules published at
-*         http://github.com/UoB-HPC/GPU-STREAM/wiki/Run-Rules
-*         and incorporated herein by reference.
-*         The copyright holders retain the
-*         right to determine conformity with the Run Rules.
-*     3b. Results based on modified source code or on runs not in
-*         accordance with the GPU-STREAM Run Rules must be clearly
-*         labelled whenever they are published.  Examples of
-*         proper labelling include:
-*         "tuned GPU-STREAM benchmark results"
-*         "based on a variant of the GPU-STREAM benchmark code"
-*         Other comparable, clear and reasonable labelling is
-*         acceptable.
-*     3c. Submission of results to the GPU-STREAM benchmark web site
-*         is encouraged, but not required.
-*  4. Use of this program or creation of derived works based on this
-*     program constitutes acceptance of these licensing restrictions.
-*  5. Absolutely no warranty is expressed or implied.
-*———————————————————————————————————-----------------------------------------*/
-
-
-#include <iostream>
-#include <fstream>
-#include <vector>
-#include <chrono>
-#include <cfloat>
-#include <cmath>
-
-#define CL_HPP_ENABLE_EXCEPTIONS
-#define CL_HPP_MINIMUM_OPENCL_VERSION 110
-#define CL_HPP_TARGET_OPENCL_VERSION 110
-#include "CL/cl2.hpp"
-#include "common.h"
-
-std::string getDeviceName(const cl::Device& device);
-std::string getDeviceDriver(const cl::Device& device);
-unsigned getDeviceList(std::vector<cl::Device>& devices);
-
-
-// Print error and exit
-void die(std::string msg, cl::Error& e)
-{
-    std::cerr
-            << "Error: "
-            << msg
-            << ": " << e.what()
-            << "(" << e.err() << ")"
-            << std::endl;
-    exit(e.err());
-}
-
-
-int main(int argc, char *argv[])
-{
-
-    // Print out run information
-    std::cout
-        << "GPU-STREAM" << std::endl
-        << "Version: " << VERSION_STRING << std::endl
-        << "Implementation: OpenCL" << std::endl;
-
-    std::string status;
-
-    try
-    {
-        parseArguments(argc, argv);
-        if (NTIMES < 2)
-            throw std::runtime_error("Chosen number of times is invalid, must be >= 2");
-
-
-        std::cout << "Precision: ";
-        if (useFloat) std::cout << "float";
-        else std::cout << "double";
-        std::cout << std::endl << std::endl;
-
-        std::cout << "Running kernels " << NTIMES << " times" << std::endl;
-
-        if (ARRAY_SIZE % 1024 != 0)
-        {
-            unsigned int OLD_ARRAY_SIZE = ARRAY_SIZE;
-            ARRAY_SIZE -= ARRAY_SIZE % 1024;
-            std::cout
-                << "Warning: array size must divide 1024" << std::endl
-                << "Resizing array from " << OLD_ARRAY_SIZE
-                << " to " << ARRAY_SIZE << std::endl;
-            if (ARRAY_SIZE == 0)
-                throw std::runtime_error("Array size must be >= 1024");
-        }
-
-        // Get precision (used to reset later)
-        std::streamsize ss = std::cout.precision();
-
-        size_t DATATYPE_SIZE;
-
-        if (useFloat)
-        {
-            DATATYPE_SIZE = sizeof(float);
-        }
-        else
-        {
-            DATATYPE_SIZE = sizeof(double);
-        }
-
-        // Display number of bytes in array
-        std::cout << std::setprecision(1) << std::fixed
-            << "Array size: " << ARRAY_SIZE*DATATYPE_SIZE/1024.0/1024.0 << " MB"
-            << " (=" << ARRAY_SIZE*DATATYPE_SIZE/1024.0/1024.0/1024.0 << " GB)"
-            << std::endl;
-        std::cout << "Total size: " << 3.0*ARRAY_SIZE*DATATYPE_SIZE/1024.0/1024.0 << " MB"
-            << " (=" << 3.0*ARRAY_SIZE*DATATYPE_SIZE/1024.0/1024.0/1024.0 << " GB)"
-            << std::endl;
-
-        // Reset precision
-        std::cout.precision(ss);
-
-        // Open the Kernel source
-        std::string kernels;
-        {
-            std::ifstream in("ocl-stream-kernels.cl");
-            if (!in.is_open())
-                throw std::runtime_error("Cannot open kernel file");
-            kernels = std::string (std::istreambuf_iterator<char>(in), (std::istreambuf_iterator<char>()));
-        }
-
-
-        // Setup OpenCL
-
-        // Get list of devices
-        std::vector<cl::Device> devices;
-        getDeviceList(devices);
-
-        // Check device index is in range
-        if (deviceIndex >= devices.size())
-            throw std::runtime_error("Chosen device index is invalid");
-
-        cl::Device device = devices[deviceIndex];
-
-        status = "Creating context";
-        cl::Context context(device);
-
-        status = "Creating queue";
-        cl::CommandQueue queue(context);
-
-        status = "Creating program";
-        cl::Program program(context, kernels);
-
-        // Print out device name
-        std::string name = getDeviceName(device);
-        std::cout << "Using OpenCL device " << name << std::endl;
-
-        // Print out OpenCL driver version for this device
-        std::string driver = getDeviceDriver(device);
-        std::cout << "Driver: " << driver << std::endl;
-
-        // Check device can do double precision if requested
-        if (!useFloat && !device.getInfo<CL_DEVICE_DOUBLE_FP_CONFIG>())
-            throw std::runtime_error("Device does not support double precision, please use --float");
-
-        // Check buffers fit on the device
-        status = "Getting device memory sizes";
-        cl_ulong totalmem = device.getInfo<CL_DEVICE_GLOBAL_MEM_SIZE>();
-        cl_ulong maxbuffer = device.getInfo<CL_DEVICE_MAX_MEM_ALLOC_SIZE>();
-        if (maxbuffer < DATATYPE_SIZE*ARRAY_SIZE)
-            throw std::runtime_error("Device cannot allocate a buffer big enough");
-        if (totalmem < 3*DATATYPE_SIZE*ARRAY_SIZE)
-            throw std::runtime_error("Device does not have enough memory for all 3 buffers");
-
-        try
-        {
-            std::string options = "";
-            if (useFloat)
-                options = "-DFLOAT";
-            program.build(options.c_str());
-        }
-        catch (cl::Error& e)
-        {
-            std::vector<cl::Device> devices = context.getInfo<CL_CONTEXT_DEVICES>();
-            std::string buildlog = program.getBuildInfo<CL_PROGRAM_BUILD_LOG>(devices[0]);
-            std::cerr
-                << "Build error:"
-                << buildlog
-                << std::endl;
-            throw e;
-        }
-
-        status = "Making kernel copy";
-        auto copy = cl::KernelFunctor<cl::Buffer&, cl::Buffer&>(program, "copy");
-        status = "Making kernel mul";
-        auto mul = cl::KernelFunctor<cl::Buffer&, cl::Buffer&>(program, "mul");
-        status = "Making kernel add";
-        auto add = cl::KernelFunctor<cl::Buffer&, cl::Buffer&, cl::Buffer&>(program, "add");
-        status = "Making kernel triad";
-        auto triad = cl::KernelFunctor<cl::Buffer&, cl::Buffer&, cl::Buffer&>(program, "triad");
-
-        // Create host vectors
-        void *h_a = malloc(ARRAY_SIZE * DATATYPE_SIZE);
-        void *h_b = malloc(ARRAY_SIZE * DATATYPE_SIZE);
-        void *h_c = malloc(ARRAY_SIZE * DATATYPE_SIZE);
-
-        // Initilise arrays
-        for (unsigned int i = 0; i < ARRAY_SIZE; i++)
-        {
-            if (useFloat)
-            {
-                ((float*)h_a)[i] = 1.0f;
-                ((float*)h_b)[i] = 2.0f;
-                ((float*)h_c)[i] = 0.0f;
-            }
-            else
-            {
-                ((double*)h_a)[i] = 1.0;
-                ((double*)h_b)[i] = 2.0;
-                ((double*)h_c)[i] = 0.0;
-            }
-        }
-
-        // Create device buffers
-        status = "Creating buffers";
-        cl::Buffer d_a(context, CL_MEM_READ_WRITE, DATATYPE_SIZE * ARRAY_SIZE);
-        cl::Buffer d_b(context, CL_MEM_READ_WRITE, DATATYPE_SIZE * ARRAY_SIZE);
-        cl::Buffer d_c(context, CL_MEM_READ_WRITE, DATATYPE_SIZE * ARRAY_SIZE);
-
-
-        // Copy host memory to device
-        status = "Copying buffers";
-        queue.enqueueWriteBuffer(d_a, CL_FALSE, 0, ARRAY_SIZE*DATATYPE_SIZE, h_a);
-        queue.enqueueWriteBuffer(d_b, CL_FALSE, 0, ARRAY_SIZE*DATATYPE_SIZE, h_b);
-        queue.enqueueWriteBuffer(d_c, CL_FALSE, 0, ARRAY_SIZE*DATATYPE_SIZE, h_c);
-
-        // Make sure the copies are finished
-        queue.finish();
-
-
-        // List of times
-        std::vector< std::vector<double> > timings;
-
-        // Declare timers
-        std::chrono::high_resolution_clock::time_point t1, t2;
-
-        // Main loop
-        for (unsigned int k = 0; k < NTIMES; k++)
-        {
-            status = "Executing copy";
-            std::vector<double> times;
-            t1 = std::chrono::high_resolution_clock::now();
-            copy(
-                cl::EnqueueArgs(
-                queue,
-                cl::NDRange(ARRAY_SIZE)),
-                d_a, d_c);
-            queue.finish();
-            t2 = std::chrono::high_resolution_clock::now();
-            times.push_back(std::chrono::duration_cast<std::chrono::duration<double> >(t2 - t1).count());
-
-
-            status = "Executing mul";
-            t1 = std::chrono::high_resolution_clock::now();
-            mul(
-                cl::EnqueueArgs(
-                queue,
-                cl::NDRange(ARRAY_SIZE)),
-                d_b, d_c);
-            queue.finish();
-            t2 = std::chrono::high_resolution_clock::now();
-            times.push_back(std::chrono::duration_cast<std::chrono::duration<double> >(t2 - t1).count());
-
-
-            status = "Executing add";
-            t1 = std::chrono::high_resolution_clock::now();
-            add(
-                cl::EnqueueArgs(
-                queue,
-                cl::NDRange(ARRAY_SIZE)),
-                d_a, d_b, d_c);
-            queue.finish();
-            t2 = std::chrono::high_resolution_clock::now();
-            times.push_back(std::chrono::duration_cast<std::chrono::duration<double> >(t2 - t1).count());
-
-
-            status = "Executing triad";
-            t1 = std::chrono::high_resolution_clock::now();
-            triad(
-                cl::EnqueueArgs(
-                queue,
-                cl::NDRange(ARRAY_SIZE)),
-                d_a, d_b, d_c);
-            queue.finish();
-            t2 = std::chrono::high_resolution_clock::now();
-            times.push_back(std::chrono::duration_cast<std::chrono::duration<double> >(t2 - t1).count());
-
-            timings.push_back(times);
-
-        }
-
-        // Check solutions
-        status = "Copying back buffers";
-        queue.enqueueReadBuffer(d_a, CL_FALSE, 0, ARRAY_SIZE*DATATYPE_SIZE, h_a);
-        queue.enqueueReadBuffer(d_b, CL_FALSE, 0, ARRAY_SIZE*DATATYPE_SIZE, h_b);
-        queue.enqueueReadBuffer(d_c, CL_FALSE, 0, ARRAY_SIZE*DATATYPE_SIZE, h_c);
-        queue.finish();
-
-
-        if (useFloat)
-        {
-            check_solution<float>(h_a, h_b, h_c);
-        }
-        else
-        {
-            check_solution<double>(h_a, h_b, h_c);
-        }
-
-        // Crunch results
-        size_t sizes[4] = {
-            2 * DATATYPE_SIZE * ARRAY_SIZE,
-            2 * DATATYPE_SIZE * ARRAY_SIZE,
-            3 * DATATYPE_SIZE * ARRAY_SIZE,
-            3 * DATATYPE_SIZE * ARRAY_SIZE
-        };
-        double min[4] = {DBL_MAX, DBL_MAX, DBL_MAX, DBL_MAX};
-        double max[4] = {0.0, 0.0, 0.0, 0.0};
-        double avg[4] = {0.0, 0.0, 0.0, 0.0};
-        // Ignore first result
-        for (unsigned int i = 1; i < NTIMES; i++)
-        {
-            for (int j = 0; j < 4; j++)
-            {
-                avg[j] += timings[i][j];
-                min[j] = std::min(min[j], timings[i][j]);
-                max[j] = std::max(max[j], timings[i][j]);
-            }
-        }
-        for (int j = 0; j < 4; j++)
-            avg[j] /= (double)(NTIMES-1);
-
-        // Display results
-        std::string labels[] = {"Copy", "Mul", "Add", "Triad"};
-        std::cout
-            << std::left << std::setw(12) << "Function"
-            << std::left << std::setw(12) << "MBytes/sec"
-            << std::left << std::setw(12) << "Min (sec)"
-            << std::left << std::setw(12) << "Max"
-            << std::left << std::setw(12) << "Average"
-            << std::endl;
-        for (int j = 0; j < 4; j++)
-        {
-            std::cout
-                << std::left << std::setw(12) << labels[j]
-                << std::left << std::setw(12) << std::setprecision(3) << 1.0E-06 * sizes[j]/min[j]
-                << std::left << std::setw(12) << std::setprecision(5) << min[j]
-                << std::left << std::setw(12) << std::setprecision(5) << max[j]
-                << std::left << std::setw(12) << std::setprecision(5) << avg[j]
-                << std::endl;
-        }
-
-        // Free host vectors
-        free(h_a);
-        free(h_b);
-        free(h_c);
-
-    }
-    catch (cl::Error &e)
-    {
-        die(status, e);
-    }
-    catch (std::exception& e)
-    {
-        std::cerr
-            << "Error: "
-            << e.what()
-            << std::endl;
-        exit(EXIT_FAILURE);
-    }
-
-}
-
-
-unsigned getDeviceList(std::vector<cl::Device>& devices)
-{
-    // Get list of platforms
-    std::vector<cl::Platform> platforms;
-    try
-    {
-        cl::Platform::get(&platforms);
-    }
-    catch (cl::Error &e)
-    {
-        die("Getting platforms", e);
-    }
-
-    // Enumerate devices
-    for (unsigned int i = 0; i < platforms.size(); i++)
-    {
-        std::vector<cl::Device> plat_devices;
-        try
-        {
-            platforms[i].getDevices(CL_DEVICE_TYPE_ALL, &plat_devices);
-        }
-        catch (cl::Error &e)
-        {
-            die("Getting devices", e);
-        }
-        devices.insert(devices.end(), plat_devices.begin(), plat_devices.end());
-    }
-
-    return devices.size();
-}
-
-
-std::string getDeviceName(const cl::Device& device)
-{
-    std::string name;
-    cl_device_info info = CL_DEVICE_NAME;
-
-    try
-    {
-
-        // Special case for AMD
-#ifdef CL_DEVICE_BOARD_NAME_AMD
-        device.getInfo(CL_DEVICE_VENDOR, &name);
-        if (strstr(name.c_str(), "Advanced Micro Devices"))
-            info = CL_DEVICE_BOARD_NAME_AMD;
-#endif
-
-        device.getInfo(info, &name);
-    }
-    catch (cl::Error &e)
-    {
-        die("Getting device name", e);
-    }
-
-    return name;
-}
-
-std::string getDeviceDriver(const cl::Device& device)
-{
-    std::string driver;
-    try
-    {
-        device.getInfo(CL_DRIVER_VERSION, &driver);
-    }
-    catch (cl::Error &e)
-    {
-        die("Getting device driver", e);
-    }
-
-    return driver;
-}
-
-
-void listDevices(void)
-{
-    // Get list of devices
-    std::vector<cl::Device> devices;
-    getDeviceList(devices);
-
-    // Print device names
-    if (devices.size() == 0)
-    {
-        std::cout << "No devices found." << std::endl;
-    }
-    else
-    {
-        std::cout << std::endl;
-        std::cout << "Devices:" << std::endl;
-        for (unsigned i = 0; i < devices.size(); i++)
-        {
-            std::cout << i << ": " << getDeviceName(devices[i]) << std::endl;
-        }
-        std::cout << std::endl;
-    }
-}
-
--- a/runcuda.sh
+++ b/runcuda.sh
@ -1,4 +0,0 @@
-./gpu-stream-cuda
-./gpu-stream-cuda --groups 64 --groupSize 256
-./gpu-stream-cuda --float
-./gpu-stream-cuda --float --groups 64 --groupSize 256
--- a/runhip.sh
+++ b/runhip.sh
@ -1,4 +0,0 @@
-./gpu-stream-hip
-./gpu-stream-hip --groups 64 --groupSize 256
-./gpu-stream-hip --float
-./gpu-stream-hip --float --groups 64 --groupSize 256