Merge branch 'refactor'

commit eda9d56ed9

.gitignore (vendored, 16 lines changed)
@@ -1,8 +1,20 @@
+common.h
 gpu-stream-cuda
 gpu-stream-ocl
+gpu-stream-acc
+gpu-stream-omp3
+gpu-stream-omp45
+gpu-stream-sycl
 
 *.o
 
 *.tar
 
 *.gz
+
+.DS_Store
+
+CMakeCache.txt
+CMakeFiles/
+cmake_install.cmake
+Makefile
ACCStream.cpp (new file, 141 lines)

// Copyright (c) 2015-16 Tom Deakin, Simon McIntosh-Smith,
// University of Bristol HPC
//
// For full license terms please see the LICENSE file distributed with this
// source code

#include "ACCStream.h"

template <class T>
ACCStream<T>::ACCStream(const unsigned int ARRAY_SIZE, T *a, T *b, T *c, int device)
{
  acc_set_device_num(device, acc_device_nvidia);

  array_size = ARRAY_SIZE;

  // Set up data region on device
  this->a = a;
  this->b = b;
  this->c = c;
  #pragma acc enter data create(a[0:array_size], b[0:array_size], c[0:array_size])
  {}
}

template <class T>
ACCStream<T>::~ACCStream()
{
  // End data region on device
  unsigned int array_size = this->array_size;
  T *a = this->a;
  T *b = this->b;
  T *c = this->c;
  #pragma acc exit data delete(a[0:array_size], b[0:array_size], c[0:array_size])
  {}
}

template <class T>
void ACCStream<T>::write_arrays(const std::vector<T>& h_a, const std::vector<T>& h_b, const std::vector<T>& h_c)
{
  T *a = this->a;
  T *b = this->b;
  T *c = this->c;
  #pragma acc update device(a[0:array_size], b[0:array_size], c[0:array_size])
  {}
}

template <class T>
void ACCStream<T>::read_arrays(std::vector<T>& h_a, std::vector<T>& h_b, std::vector<T>& h_c)
{
  T *a = this->a;
  T *b = this->b;
  T *c = this->c;
  #pragma acc update host(a[0:array_size], b[0:array_size], c[0:array_size])
  {}
}

template <class T>
void ACCStream<T>::copy()
{
  unsigned int array_size = this->array_size;
  T * restrict a = this->a;
  T * restrict c = this->c;
  #pragma acc kernels present(a[0:array_size], c[0:array_size]) wait
  for (int i = 0; i < array_size; i++)
  {
    c[i] = a[i];
  }
}

template <class T>
void ACCStream<T>::mul()
{
  const T scalar = 0.3;

  unsigned int array_size = this->array_size;
  T * restrict b = this->b;
  T * restrict c = this->c;
  #pragma acc kernels present(b[0:array_size], c[0:array_size]) wait
  for (int i = 0; i < array_size; i++)
  {
    b[i] = scalar * c[i];
  }
}

template <class T>
void ACCStream<T>::add()
{
  unsigned int array_size = this->array_size;
  T * restrict a = this->a;
  T * restrict b = this->b;
  T * restrict c = this->c;
  #pragma acc kernels present(a[0:array_size], b[0:array_size], c[0:array_size]) wait
  for (int i = 0; i < array_size; i++)
  {
    c[i] = a[i] + b[i];
  }
}

template <class T>
void ACCStream<T>::triad()
{
  const T scalar = 0.3;

  unsigned int array_size = this->array_size;
  T * restrict a = this->a;
  T * restrict b = this->b;
  T * restrict c = this->c;
  #pragma acc kernels present(a[0:array_size], b[0:array_size], c[0:array_size]) wait
  for (int i = 0; i < array_size; i++)
  {
    a[i] = b[i] + scalar * c[i];
  }
}

void listDevices(void)
{
  // Get number of devices
  int count = acc_get_num_devices(acc_device_nvidia);

  // Print device list
  if (count == 0)
  {
    std::cerr << "No devices found." << std::endl;
  }
  else
  {
    std::cout << "There are " << count << " devices." << std::endl;
  }
}

std::string getDeviceName(const int)
{
  return std::string("Device name unavailable");
}

std::string getDeviceDriver(const int)
{
  return std::string("Device driver unavailable");
}

template class ACCStream<float>;
template class ACCStream<double>;
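Note: main.cpp is not part of this diff, so the following is only a sketch of how a caller would be expected to drive the Stream<T> implementations added in this merge (the run() wrapper and the initial values are hypothetical):

#include <vector>
#include "ACCStream.h"

void run(unsigned int n)
{
  std::vector<double> a(n, 1.0), b(n, 2.0), c(n, 0.0);

  // Host arrays are handed over once; the device copies live for the
  // lifetime of the stream object (enter data ... exit data).
  ACCStream<double> stream(n, a.data(), b.data(), c.data(), 0);
  stream.write_arrays(a, b, c);

  // One benchmark iteration: the four STREAM kernels.
  stream.copy();
  stream.mul();
  stream.add();
  stream.triad();

  stream.read_arrays(a, b, c);  // pull results back for verification
}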
ACCStream.h (new file, 44 lines)

// Copyright (c) 2015-16 Tom Deakin, Simon McIntosh-Smith,
// University of Bristol HPC
//
// For full license terms please see the LICENSE file distributed with this
// source code

#pragma once

#include <iostream>
#include <stdexcept>

#include "Stream.h"

#include <openacc.h>

#define IMPLEMENTATION_STRING "OpenACC"

template <class T>
class ACCStream : public Stream<T>
{
  protected:
    // Size of arrays
    unsigned int array_size;
    // Device side pointers
    T *a;
    T *b;
    T *c;

  public:
    ACCStream(const unsigned int, T*, T*, T*, int);
    ~ACCStream();

    virtual void copy() override;
    virtual void add() override;
    virtual void mul() override;
    virtual void triad() override;

    virtual void write_arrays(const std::vector<T>& a, const std::vector<T>& b, const std::vector<T>& c) override;
    virtual void read_arrays(std::vector<T>& a, std::vector<T>& b, std::vector<T>& c) override;

};
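Note: the member functions of ACCStream (and of every other *Stream class in this commit) are defined in a .cpp file rather than in this header, so each .cpp ends with explicit instantiations such as template class ACCStream<float>;. A minimal sketch of that pattern (Widget is a made-up stand-in):

// In the header: template declared, members defined elsewhere.
template <class T>
struct Widget { T value; T twice(); };

// In the .cpp: definition plus explicit instantiation. Without the
// two instantiation lines, a user including only the header would hit
// undefined-symbol errors at link time for Widget<float> etc.
template <class T>
T Widget<T>::twice() { return value + value; }

template struct Widget<float>;
template struct Widget<double>;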
CL/cl2.hpp (181 lines changed)

@@ -52,6 +52,18 @@
 * #define CL_HPP_USE_DX_INTEROP
 * cl_khr_sub_groups
 * #define CL_HPP_USE_CL_SUB_GROUPS_KHR
+*
+* Doxygen documentation for this header is available here:
+*
+*     http://khronosgroup.github.io/OpenCL-CLHPP/
+*
+* The latest version of this header can be found on the GitHub releases page:
+*
+*     https://github.com/KhronosGroup/OpenCL-CLHPP/releases
+*
+* Bugs and patches can be submitted to the GitHub repository:
+*
+*     https://github.com/KhronosGroup/OpenCL-CLHPP
 */

 /*! \mainpage

@@ -134,41 +146,64 @@
 *
 * \section parameterization Parameters
 * This header may be parameterized by a set of preprocessor macros.
-* CL_HPP_TARGET_OPENCL_VERSION
-*  - Defines the target OpenCL runtime version to build the header against.
-*    Defaults to 200, representing OpenCL 2.0.
-* CL_HPP_NO_STD_STRING
-*  - Do not use the standard library string class.
-*    cl::string is not defined and may be defined by the user before
-*    cl2.hpp is included.
-* CL_HPP_NO_STD_VECTOR
-*  - Do not use the standard library vector class.
-*    cl::vector is not defined and may be defined by the user before
-*    cl2.hpp is included.
-* CL_HPP_NO_STD_ARRAY
-*  - Do not use the standard library array class.
-*    cl::array is not defined and may be defined by the user before
-*    cl2.hpp is included.
-* CL_HPP_NO_STD_UNIQUE_PTR
-*  - Do not use the standard library unique_ptr class.
-*    cl::pointer and the cl::allocate_pointer function are not defined
-*    and may be defined by the user before cl2.hpp is included.
-* CL_HPP_ENABLE_DEVICE_FISSION
-*  - Enables device fission for OpenCL 1.2 platforms
-* CL_HPP_ENABLE_EXCEPTIONS
-*  - Enable exceptions for use in the C++ bindings header.
-*    This is the preferred error handling mechanism but is not required.
-* CL_HPP_ENABLE_SIZE_T_COMPATIBILITY
-*  - Backward compatibility option to support cl.hpp-style size_t class.
-*    Replaces the updated std::array derived version and removal of size_t
-*    from the namespace. Note that in this case the new size_t class
-*    is placed in the cl::compatibility namespace and thus requires
-*    an additional using declaration for direct backward compatibility.
-* CL_HPP_ENABLE_PROGRAM_CONSTRUCTION_FROM_ARRAY_COMPATIBILITY
-*  - Enable older vector of pairs interface for construction of programs.
-* CL_HPP_CL_1_2_DEFAULT_BUILD
-*  - Default to OpenCL C 1.2 compilation rather than OpenCL C 2.0
-*  - applies to use of cl::Program construction and other program build variants.
+*
+* - CL_HPP_TARGET_OPENCL_VERSION
+*
+*   Defines the target OpenCL runtime version to build the header
+*   against. Defaults to 200, representing OpenCL 2.0.
+*
+* - CL_HPP_NO_STD_STRING
+*
+*   Do not use the standard library string class. cl::string is not
+*   defined and may be defined by the user before cl2.hpp is
+*   included.
+*
+* - CL_HPP_NO_STD_VECTOR
+*
+*   Do not use the standard library vector class. cl::vector is not
+*   defined and may be defined by the user before cl2.hpp is
+*   included.
+*
+* - CL_HPP_NO_STD_ARRAY
+*
+*   Do not use the standard library array class. cl::array is not
+*   defined and may be defined by the user before cl2.hpp is
+*   included.
+*
+* - CL_HPP_NO_STD_UNIQUE_PTR
+*
+*   Do not use the standard library unique_ptr class. cl::pointer and
+*   the cl::allocate_pointer functions are not defined and may be
+*   defined by the user before cl2.hpp is included.
+*
+* - CL_HPP_ENABLE_DEVICE_FISSION
+*
+*   Enables device fission for OpenCL 1.2 platforms.
+*
+* - CL_HPP_ENABLE_EXCEPTIONS
+*
+*   Enable exceptions for use in the C++ bindings header. This is the
+*   preferred error handling mechanism but is not required.
+*
+* - CL_HPP_ENABLE_SIZE_T_COMPATIBILITY
+*
+*   Backward compatibility option to support cl.hpp-style size_t
+*   class. Replaces the updated std::array derived version and
+*   removal of size_t from the namespace. Note that in this case the
+*   new size_t class is placed in the cl::compatibility namespace and
+*   thus requires an additional using declaration for direct backward
+*   compatibility.
+*
+* - CL_HPP_ENABLE_PROGRAM_CONSTRUCTION_FROM_ARRAY_COMPATIBILITY
+*
+*   Enable older vector of pairs interface for construction of
+*   programs.
+*
+* - CL_HPP_CL_1_2_DEFAULT_BUILD
+*
+*   Default to OpenCL C 1.2 compilation rather than OpenCL C 2.0
+*   applies to use of cl::Program construction and other program
+*   build variants.
 *
 *
 * \section example Example

@@ -213,34 +248,44 @@
         return -1;
     }
 
-    std::string kernel1{
-        "global int globalA;"
-        "kernel void updateGlobal(){"
-        "  globalA = 75;"
-        "}"};
-    std::string kernel2{
-        "typedef struct { global int *bar; } Foo; kernel void vectorAdd(global const Foo* aNum, global const int *inputA, global const int *inputB, global int *output, int val, write_only pipe int outPipe, queue_t childQueue){"
-        "  output[get_global_id(0)] = inputA[get_global_id(0)] + inputB[get_global_id(0)] + val + *(aNum->bar);"
-        "  write_pipe(outPipe, &val);"
-        "  queue_t default_queue = get_default_queue(); "
-        "  ndrange_t ndrange = ndrange_1D(get_global_size(0)/2, get_global_size(0)/2); "
-        // Have a child kernel write into third quarter of output
-        "  enqueue_kernel(default_queue, CLK_ENQUEUE_FLAGS_WAIT_KERNEL, ndrange, "
-        "  ^{"
-        "      output[get_global_size(0)*2 + get_global_id(0)] = inputA[get_global_size(0)*2+get_global_id(0)] + inputB[get_global_size(0)*2+get_global_id(0)] + globalA;"
-        "  });"
-        // Have a child kernel write into last quarter of output
-        "  enqueue_kernel(childQueue, CLK_ENQUEUE_FLAGS_WAIT_KERNEL, ndrange, "
-        "  ^{"
-        "      output[get_global_size(0)*3 + get_global_id(0)] = inputA[get_global_size(0)*3 + get_global_id(0)] + inputB[get_global_size(0)*3 + get_global_id(0)] + globalA + 2;"
-        "  });"
-        "}" };
+    // Use C++11 raw string literals for kernel source code
+    std::string kernel1{R"CLC(
+        global int globalA;
+        kernel void updateGlobal()
+        {
+          globalA = 75;
+        }
+    )CLC"};
+    std::string kernel2{R"CLC(
+        typedef struct { global int *bar; } Foo;
+        kernel void vectorAdd(global const Foo* aNum, global const int *inputA, global const int *inputB,
+                              global int *output, int val, write_only pipe int outPipe, queue_t childQueue)
+        {
+          output[get_global_id(0)] = inputA[get_global_id(0)] + inputB[get_global_id(0)] + val + *(aNum->bar);
+          write_pipe(outPipe, &val);
+          queue_t default_queue = get_default_queue();
+          ndrange_t ndrange = ndrange_1D(get_global_size(0)/2, get_global_size(0)/2);
+
+          // Have a child kernel write into third quarter of output
+          enqueue_kernel(default_queue, CLK_ENQUEUE_FLAGS_WAIT_KERNEL, ndrange,
+          ^{
+              output[get_global_size(0)*2 + get_global_id(0)] =
+                inputA[get_global_size(0)*2 + get_global_id(0)] + inputB[get_global_size(0)*2 + get_global_id(0)] + globalA;
+          });
+
+          // Have a child kernel write into last quarter of output
+          enqueue_kernel(childQueue, CLK_ENQUEUE_FLAGS_WAIT_KERNEL, ndrange,
+          ^{
+              output[get_global_size(0)*3 + get_global_id(0)] =
+                inputA[get_global_size(0)*3 + get_global_id(0)] + inputB[get_global_size(0)*3 + get_global_id(0)] + globalA + 2;
+          });
+        }
+    )CLC"};
 
     // New simpler string interface style
     std::vector<std::string> programStrings {kernel1, kernel2};
 
-    cl::Program vectorAddProgram(
-        programStrings);
+    cl::Program vectorAddProgram(programStrings);
     try {
         vectorAddProgram.build("-cl-std=CL2.0");
     }

@@ -268,9 +313,9 @@
     //////////////////
     // SVM allocations
 
-    cl::pointer<int> anSVMInt = cl::allocate_svm<int, cl::SVMTraitCoarse<>>();
+    auto anSVMInt = cl::allocate_svm<int, cl::SVMTraitCoarse<>>();
     *anSVMInt = 5;
-    cl::SVMAllocator<int, cl::SVMTraitCoarse<cl::SVMTraitReadOnly<>>> svmAllocReadOnly;
+    cl::SVMAllocator<Foo, cl::SVMTraitCoarse<cl::SVMTraitReadOnly<>>> svmAllocReadOnly;
     auto fooPointer = cl::allocate_pointer<Foo>(svmAllocReadOnly);
     fooPointer->bar = anSVMInt.get();
     cl::SVMAllocator<int, cl::SVMTraitCoarse<>> svmAlloc;

@@ -3538,7 +3583,7 @@ template <class T, class Alloc, class... Args>
 cl::pointer<T, detail::Deleter<Alloc>> allocate_pointer(const Alloc &alloc_, Args&&... args)
 {
     Alloc alloc(alloc_);
-    static const size_t copies = 1;
+    static const size_type copies = 1;
 
     // Ensure that creation of the management block and the
     // object are dealt with separately such that we only provide a deleter

@@ -6520,7 +6565,7 @@ inline cl_int cl::Program::getInfo(cl_program_info name, vector<vector<unsigned
 
     // Resize the parameter array and constituent arrays
     param->resize(numBinaries);
-    for (int i = 0; i < numBinaries; ++i) {
+    for (size_type i = 0; i < numBinaries; ++i) {
         (*param)[i].resize(sizes[i]);
     }
 

@@ -7107,7 +7152,7 @@ public:
         size_type buffer_slice_pitch,
         size_type host_row_pitch,
         size_type host_slice_pitch,
-        void *ptr,
+        const void *ptr,
         const vector<Event>* events = NULL,
         Event* event = NULL) const
     {

@@ -7255,7 +7300,7 @@ public:
         const array<size_type, 3>& region,
         size_type row_pitch,
         size_type slice_pitch,
-        void* ptr,
+        const void* ptr,
         const vector<Event>* events = NULL,
         Event* event = NULL) const
     {

@@ -7845,7 +7890,7 @@ public:
     CL_EXT_PREFIX__VERSION_1_2_DEPRECATED cl_int enqueueTask(
         const Kernel& kernel,
         const vector<Event>* events = NULL,
-        Event* event = NULL) CL_EXT_SUFFIX__VERSION_1_2_DEPRECATED const
+        Event* event = NULL) const CL_EXT_SUFFIX__VERSION_1_2_DEPRECATED
     {
         cl_event tmp;
         cl_int err = detail::errHandler(

@@ -8873,7 +8918,7 @@ inline cl_int enqueueWriteBufferRect(
     size_type buffer_slice_pitch,
     size_type host_row_pitch,
     size_type host_slice_pitch,
-    void *ptr,
+    const void *ptr,
     const vector<Event>* events = NULL,
     Event* event = NULL)
 {

@@ -8971,7 +9016,7 @@ inline cl_int enqueueWriteImage(
     const array<size_type, 3>& region,
     size_type row_pitch,
     size_type slice_pitch,
-    void* ptr,
+    const void* ptr,
     const vector<Event>* events = NULL,
     Event* event = NULL)
 {
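Note: the kernel1/kernel2 hunk above replaces per-line quoted fragments with C++11 raw string literals. A minimal sketch of the syntax (the CLC delimiter is arbitrary):

#include <string>

// R"delim( ... )delim" takes everything between the parentheses
// verbatim: newlines, quotes and backslashes need no escaping, which
// is what lets multi-line OpenCL source read like normal code.
const std::string src{R"CLC(
kernel void scale(global float *x)
{
  x[get_global_id(0)] *= 2.0f;
}
)CLC"};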
CMakeLists.txt (new file, 164 lines)

cmake_minimum_required(VERSION 3.2)

set(CMAKE_CXX_STANDARD 11)
set(CMAKE_CXX_STANDARD_REQUIRED ON)

project(gpu-stream)

include(CheckIncludeFileCXX)
include(CheckCXXCompilerFlag)

set(gpu-stream_VERSION_MAJOR 2)
set(gpu-stream_VERSION_MINOR 0)

configure_file(common.h.in common.h)

# If using the Cray compiler, manually add the C++11 flag because setting the
# standard through CMake as above doesn't set this flag with Cray
if ("${CMAKE_CXX_COMPILER_ID}" MATCHES "Cray")
  list(APPEND CMAKE_CXX_FLAGS -hstd=c++11)
endif ()

#-------------------------------------------------------------------------------
# CUDA
#-------------------------------------------------------------------------------
find_package(CUDA 7.0 QUIET)
set(FLAG True)
if ("${CMAKE_SYSTEM_NAME}" MATCHES "Darwin")
  execute_process(COMMAND xcodebuild -version COMMAND head -n 1 OUTPUT_VARIABLE XCODE_VERSION)
  if ("${XCODE_VERSION}" MATCHES "Xcode 7.3.1")
    message("Xcode version not supported by CUDA")
    set(FLAG False)
  endif ()
endif ()
if (${FLAG} AND ${CUDA_FOUND})
  list(APPEND CUDA_NVCC_FLAGS --std=c++11)
  cuda_add_executable(gpu-stream-cuda main.cpp CUDAStream.cu)
  target_compile_definitions(gpu-stream-cuda PUBLIC CUDA)
else ()
  message("Skipping CUDA...")
endif ()

#-------------------------------------------------------------------------------
# OpenCL
#-------------------------------------------------------------------------------
find_package(OpenCL QUIET)
if (${OpenCL_FOUND})
  add_executable(gpu-stream-ocl main.cpp OCLStream.cpp)
  target_compile_definitions(gpu-stream-ocl PUBLIC OCL)
  target_link_libraries(gpu-stream-ocl ${OpenCL_LIBRARY})
else ()
  message("Skipping OpenCL...")
endif ()

#-------------------------------------------------------------------------------
# OpenACC
#-------------------------------------------------------------------------------
# Check compiler supports an OpenACC flag
include(CheckCXXCompilerFlag)
if ("${CMAKE_CXX_COMPILER_ID}" MATCHES "GNU")
  CHECK_CXX_COMPILER_FLAG(-fopenacc OPENACC)
  if (OPENACC)
    list (APPEND CMAKE_EXE_LINKER_FLAGS -fopenacc)
  endif ()
elseif ("${CMAKE_CXX_COMPILER_ID}" MATCHES "PGI")
  CHECK_CXX_COMPILER_FLAG(-acc OPENACC)
elseif ("${CMAKE_CXX_COMPILER_ID}" MATCHES "Cray")
  CHECK_CXX_COMPILER_FLAG(-hacc=openacc OPENACC)
endif ()

if (OPENACC)
  add_executable(gpu-stream-acc main.cpp ACCStream.cpp)
  target_compile_definitions(gpu-stream-acc PUBLIC ACC)
else ()
  message("Skipping OpenACC...")
endif ()

#-------------------------------------------------------------------------------
# OpenMP 3.0
#-------------------------------------------------------------------------------
find_package(OpenMP QUIET)
if (${OpenMP_FOUND})
  add_executable(gpu-stream-omp3 main.cpp OMP3Stream.cpp)
  target_compile_definitions(gpu-stream-omp3 PUBLIC OMP3)
else ()
  message("Skipping OpenMP 3...")
endif ()

#-------------------------------------------------------------------------------
# OpenMP 4.5
#-------------------------------------------------------------------------------
if ("${CMAKE_CXX_COMPILER_ID}" MATCHES "Cray")
  if (CMAKE_CXX_COMPILER_VERSION VERSION_GREATER 4.5)
    add_executable(gpu-stream-omp45 main.cpp OMP45Stream.cpp)
    target_compile_definitions(gpu-stream-omp45 PUBLIC OMP45)
  endif ()
endif ()

#-------------------------------------------------------------------------------
# RAJA
#-------------------------------------------------------------------------------

if (RAJA_PATH)
  find_package(OpenMP)
  find_package(CUDA 7.5)
  list(APPEND CUDA_NVCC_FLAGS "-arch compute_35")
  list(APPEND CUDA_NVCC_FLAGS --expt-extended-lambda)
  list(APPEND CUDA_NVCC_FLAGS -Xcompiler ${OpenMP_CXX_FLAGS})
  list(APPEND CUDA_NVCC_FLAGS -DUSE_RAJA)
  cuda_include_directories(${RAJA_PATH}/include)
  set_source_files_properties(RAJAStream.cpp PROPERTIES CUDA_SOURCE_PROPERTY_FORMAT OBJ)
  cuda_add_executable(gpu-stream-raja main.cpp RAJAStream.cpp)
  target_compile_definitions(gpu-stream-raja PUBLIC USE_RAJA)
  target_link_libraries(gpu-stream-raja "-L${RAJA_PATH}/lib -lRAJA")
else()
  message("Skipping RAJA... (use -DRAJA_PATH=/path/to/raja to opt in)")
endif()

#-------------------------------------------------------------------------------
# Kokkos
#-------------------------------------------------------------------------------
if (KOKKOS_PATH)
  if ("${CMAKE_SYSTEM_NAME}" MATCHES "Linux")
    add_custom_target(gpu-stream-kokkos COMMAND make -f KokkosMakefile KOKKOS_PATH=${KOKKOS_PATH})
  else()
    message("Skipping Kokkos (requires Linux)")
  endif()
else()
  message("Skipping Kokkos... (use -DKOKKOS_PATH=/path/to/kokkos to opt in)")
endif()

#-------------------------------------------------------------------------------
# SYCL
#-------------------------------------------------------------------------------
if ("${CMAKE_CXX_COMPILER_ID}" MATCHES "Clang" OR
    "${CMAKE_CXX_COMPILER_ID}" MATCHES "GNU")
  # Use C++14 if available, otherwise drop back to C++11
  check_cxx_compiler_flag("-std=c++14" CXX14)
  if (CXX14)
    set(CMAKE_REQUIRED_FLAGS "-std=c++14")
  else()
    set(CMAKE_REQUIRED_FLAGS "-std=c++11")
  endif()
endif()

check_include_file_cxx("CL/sycl.hpp" HAS_SYCL)
if (HAS_SYCL)
  add_executable(gpu-stream-sycl main.cpp SYCLStream.cpp)
  target_compile_definitions(gpu-stream-sycl PUBLIC SYCL)

  find_program(COMPUTECPP "compute++")
  if (COMPUTECPP)
    message(STATUS "Using ComputeCpp for SYCL compilation")
    add_custom_target(SYCLStream.sycl COMMAND ${COMPUTECPP} ${CMAKE_CURRENT_SOURCE_DIR}/SYCLStream.cpp -sycl -no-serial-memop -O2 -emit-llvm -c)
    add_dependencies(gpu-stream-sycl SYCLStream.sycl)
    target_compile_options(gpu-stream-sycl PUBLIC -include SYCLStream.sycl)
    target_link_libraries(gpu-stream-sycl SYCL OpenCL)
  else()
    message(STATUS "Using header-only SYCL implementation")
    set_property(TARGET gpu-stream-sycl PROPERTY CXX_STANDARD 14)
  endif(COMPUTECPP)
else ()
  message("Skipping SYCL...")
endif (HAS_SYCL)
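Note: each target above gets its own compile definition (CUDA, OCL, ACC, OMP3, OMP45, SYCL, USE_RAJA). main.cpp is not shown in this diff, but those definitions suggest it selects the Stream implementation with the preprocessor, along these lines (sketch only, not the actual main.cpp):

// Hypothetical dispatch on the per-target compile definitions.
#if defined(CUDA)
  #include "CUDAStream.h"
  typedef CUDAStream<double> StreamImpl;
#elif defined(OCL)
  #include "OCLStream.h"
  typedef OCLStream<double> StreamImpl;
#elif defined(ACC)
  #include "ACCStream.h"
  typedef ACCStream<double> StreamImpl;
#endif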
CUDAStream.cu (new file, 214 lines)

// Copyright (c) 2015-16 Tom Deakin, Simon McIntosh-Smith,
// University of Bristol HPC
//
// For full license terms please see the LICENSE file distributed with this
// source code


#include "CUDAStream.h"

#define TBSIZE 1024

void check_error(void)
{
  cudaError_t err = cudaGetLastError();
  if (err != cudaSuccess)
  {
    std::cerr << "Error: " << cudaGetErrorString(err) << std::endl;
    exit(err);
  }
}

template <class T>
CUDAStream<T>::CUDAStream(const unsigned int ARRAY_SIZE, const int device_index)
{
  // The array size must be divisible by TBSIZE for kernel launches
  if (ARRAY_SIZE % TBSIZE != 0)
  {
    std::stringstream ss;
    ss << "Array size must be a multiple of " << TBSIZE;
    throw std::runtime_error(ss.str());
  }

  // Set device
  int count;
  cudaGetDeviceCount(&count);
  check_error();
  if (device_index >= count)
    throw std::runtime_error("Invalid device index");
  cudaSetDevice(device_index);
  check_error();

  // Print out device information
  std::cout << "Using CUDA device " << getDeviceName(device_index) << std::endl;
  std::cout << "Driver: " << getDeviceDriver(device_index) << std::endl;

  array_size = ARRAY_SIZE;

  // Check buffers fit on the device
  cudaDeviceProp props;
  cudaGetDeviceProperties(&props, device_index);
  if (props.totalGlobalMem < 3*ARRAY_SIZE*sizeof(T))
    throw std::runtime_error("Device does not have enough memory for all 3 buffers");

  // Create device buffers
  cudaMalloc(&d_a, ARRAY_SIZE*sizeof(T));
  check_error();
  cudaMalloc(&d_b, ARRAY_SIZE*sizeof(T));
  check_error();
  cudaMalloc(&d_c, ARRAY_SIZE*sizeof(T));
  check_error();
}


template <class T>
CUDAStream<T>::~CUDAStream()
{
  cudaFree(d_a);
  check_error();
  cudaFree(d_b);
  check_error();
  cudaFree(d_c);
  check_error();
}

template <class T>
void CUDAStream<T>::write_arrays(const std::vector<T>& a, const std::vector<T>& b, const std::vector<T>& c)
{
  // Copy host memory to device
  cudaMemcpy(d_a, a.data(), a.size()*sizeof(T), cudaMemcpyHostToDevice);
  check_error();
  cudaMemcpy(d_b, b.data(), b.size()*sizeof(T), cudaMemcpyHostToDevice);
  check_error();
  cudaMemcpy(d_c, c.data(), c.size()*sizeof(T), cudaMemcpyHostToDevice);
  check_error();
}

template <class T>
void CUDAStream<T>::read_arrays(std::vector<T>& a, std::vector<T>& b, std::vector<T>& c)
{
  // Copy device memory to host
  cudaMemcpy(a.data(), d_a, a.size()*sizeof(T), cudaMemcpyDeviceToHost);
  check_error();
  cudaMemcpy(b.data(), d_b, b.size()*sizeof(T), cudaMemcpyDeviceToHost);
  check_error();
  cudaMemcpy(c.data(), d_c, c.size()*sizeof(T), cudaMemcpyDeviceToHost);
  check_error();
}


template <typename T>
__global__ void copy_kernel(const T * a, T * c)
{
  const int i = blockDim.x * blockIdx.x + threadIdx.x;
  c[i] = a[i];
}

template <class T>
void CUDAStream<T>::copy()
{
  copy_kernel<<<array_size/TBSIZE, TBSIZE>>>(d_a, d_c);
  check_error();
  cudaDeviceSynchronize();
  check_error();
}

template <typename T>
__global__ void mul_kernel(T * b, const T * c)
{
  const T scalar = 0.3;
  const int i = blockDim.x * blockIdx.x + threadIdx.x;
  b[i] = scalar * c[i];
}

template <class T>
void CUDAStream<T>::mul()
{
  mul_kernel<<<array_size/TBSIZE, TBSIZE>>>(d_b, d_c);
  check_error();
  cudaDeviceSynchronize();
  check_error();
}

template <typename T>
__global__ void add_kernel(const T * a, const T * b, T * c)
{
  const int i = blockDim.x * blockIdx.x + threadIdx.x;
  c[i] = a[i] + b[i];
}

template <class T>
void CUDAStream<T>::add()
{
  add_kernel<<<array_size/TBSIZE, TBSIZE>>>(d_a, d_b, d_c);
  check_error();
  cudaDeviceSynchronize();
  check_error();
}

template <typename T>
__global__ void triad_kernel(T * a, const T * b, const T * c)
{
  const T scalar = 0.3;
  const int i = blockDim.x * blockIdx.x + threadIdx.x;
  a[i] = b[i] + scalar * c[i];
}

template <class T>
void CUDAStream<T>::triad()
{
  triad_kernel<<<array_size/TBSIZE, TBSIZE>>>(d_a, d_b, d_c);
  check_error();
  cudaDeviceSynchronize();
  check_error();
}


void listDevices(void)
{
  // Get number of devices
  int count;
  cudaGetDeviceCount(&count);
  check_error();

  // Print device names
  if (count == 0)
  {
    std::cerr << "No devices found." << std::endl;
  }
  else
  {
    std::cout << std::endl;
    std::cout << "Devices:" << std::endl;
    for (int i = 0; i < count; i++)
    {
      std::cout << i << ": " << getDeviceName(i) << std::endl;
    }
    std::cout << std::endl;
  }
}


std::string getDeviceName(const int device)
{
  cudaDeviceProp props;
  cudaGetDeviceProperties(&props, device);
  check_error();
  return std::string(props.name);
}


std::string getDeviceDriver(const int device)
{
  cudaSetDevice(device);
  check_error();
  int driver;
  cudaDriverGetVersion(&driver);
  check_error();
  return std::to_string(driver);
}

template class CUDAStream<float>;
template class CUDAStream<double>;
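Note: the kernels above index with blockDim.x * blockIdx.x + threadIdx.x and perform no bounds check. That is only safe because the constructor rejects any ARRAY_SIZE that is not a multiple of TBSIZE; a sketch of the invariant (helper name hypothetical):

// With array_size % TBSIZE == 0 (enforced in the constructor), a
// launch of (array_size/TBSIZE) blocks of TBSIZE threads generates
// exactly one thread per element, so no thread can index out of range.
constexpr unsigned int TBSIZE_EXAMPLE = 1024;

unsigned int blocks_for(unsigned int array_size)
{
  return array_size / TBSIZE_EXAMPLE;  // exact: no remainder, no stragglers
}
// e.g. blocks_for(33554432) == 32768 blocks of 1024 threads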
CUDAStream.h (new file, 43 lines)

// Copyright (c) 2015-16 Tom Deakin, Simon McIntosh-Smith,
// University of Bristol HPC
//
// For full license terms please see the LICENSE file distributed with this
// source code

#pragma once

#include <iostream>
#include <stdexcept>
#include <sstream>

#include "Stream.h"

#define IMPLEMENTATION_STRING "CUDA"

template <class T>
class CUDAStream : public Stream<T>
{
  protected:
    // Size of arrays
    unsigned int array_size;
    // Device side pointers to arrays
    T *d_a;
    T *d_b;
    T *d_c;

  public:

    CUDAStream(const unsigned int, const int);
    ~CUDAStream();

    virtual void copy() override;
    virtual void add() override;
    virtual void mul() override;
    virtual void triad() override;

    virtual void write_arrays(const std::vector<T>& a, const std::vector<T>& b, const std::vector<T>& c) override;
    virtual void read_arrays(std::vector<T>& a, std::vector<T>& b, std::vector<T>& c) override;

};
KOKKOSStream.cpp (new file, 142 lines)

// Copyright (c) 2015-16 Tom Deakin, Simon McIntosh-Smith,
// University of Bristol HPC
//
// For full license terms please see the LICENSE file distributed with this
// source code


#include "KOKKOSStream.hpp"

using namespace Kokkos;

template <class T>
KOKKOSStream<T>::KOKKOSStream(
        const unsigned int ARRAY_SIZE, const int device_index)
    : array_size(ARRAY_SIZE)
{
  Kokkos::initialize();

  d_a = new View<double*, DEVICE>("d_a", ARRAY_SIZE);
  d_b = new View<double*, DEVICE>("d_b", ARRAY_SIZE);
  d_c = new View<double*, DEVICE>("d_c", ARRAY_SIZE);
  hm_a = new View<double*, DEVICE>::HostMirror();
  hm_b = new View<double*, DEVICE>::HostMirror();
  hm_c = new View<double*, DEVICE>::HostMirror();
  *hm_a = create_mirror_view(*d_a);
  *hm_b = create_mirror_view(*d_b);
  *hm_c = create_mirror_view(*d_c);
}

template <class T>
KOKKOSStream<T>::~KOKKOSStream()
{
  finalize();
}

template <class T>
void KOKKOSStream<T>::write_arrays(
        const std::vector<T>& a, const std::vector<T>& b, const std::vector<T>& c)
{
  for(int ii = 0; ii < array_size; ++ii)
  {
    (*hm_a)(ii) = a[ii];
    (*hm_b)(ii) = b[ii];
    (*hm_c)(ii) = c[ii];
  }
  deep_copy(*d_a, *hm_a);
  deep_copy(*d_b, *hm_b);
  deep_copy(*d_c, *hm_c);
}

template <class T>
void KOKKOSStream<T>::read_arrays(
        std::vector<T>& a, std::vector<T>& b, std::vector<T>& c)
{
  deep_copy(*hm_a, *d_a);
  deep_copy(*hm_b, *d_b);
  deep_copy(*hm_c, *d_c);
  for(int ii = 0; ii < array_size; ++ii)
  {
    a[ii] = (*hm_a)(ii);
    b[ii] = (*hm_b)(ii);
    c[ii] = (*hm_c)(ii);
  }
}

template <class T>
void KOKKOSStream<T>::copy()
{
  View<double*, DEVICE> a(*d_a);
  View<double*, DEVICE> b(*d_b);
  View<double*, DEVICE> c(*d_c);

  parallel_for(array_size, KOKKOS_LAMBDA (const int index)
  {
    c[index] = a[index];
  });
  Kokkos::fence();
}

template <class T>
void KOKKOSStream<T>::mul()
{
  View<double*, DEVICE> a(*d_a);
  View<double*, DEVICE> b(*d_b);
  View<double*, DEVICE> c(*d_c);

  const T scalar = 0.3;
  parallel_for(array_size, KOKKOS_LAMBDA (const int index)
  {
    b[index] = scalar*c[index];
  });
  Kokkos::fence();
}

template <class T>
void KOKKOSStream<T>::add()
{
  View<double*, DEVICE> a(*d_a);
  View<double*, DEVICE> b(*d_b);
  View<double*, DEVICE> c(*d_c);

  parallel_for(array_size, KOKKOS_LAMBDA (const int index)
  {
    c[index] = a[index] + b[index];
  });
  Kokkos::fence();
}

template <class T>
void KOKKOSStream<T>::triad()
{
  View<double*, DEVICE> a(*d_a);
  View<double*, DEVICE> b(*d_b);
  View<double*, DEVICE> c(*d_c);

  const T scalar = 0.3;
  parallel_for(array_size, KOKKOS_LAMBDA (const int index)
  {
    a[index] = b[index] + scalar*c[index];
  });
  Kokkos::fence();
}

void listDevices(void)
{
  std::cout << "This is not the device you are looking for.";
}


std::string getDeviceName(const int device)
{
  return "Kokkos";
}


std::string getDeviceDriver(const int device)
{
  return "Kokkos";
}

//template class KOKKOSStream<float>;
template class KOKKOSStream<double>;
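Note: each kernel above first copies *d_a, *d_b, *d_c into local View variables. Kokkos Views are reference-counted handles, so the copies are shallow, and they give KOKKOS_LAMBDA something it can capture by value; capturing the members through this would hand a host pointer to device code. Sketch of the pattern (function name hypothetical):

void zero_example(Kokkos::View<double*, DEVICE>* d_a, unsigned int n)
{
  Kokkos::View<double*, DEVICE> a(*d_a);  // shallow handle copy
  Kokkos::parallel_for(n, KOKKOS_LAMBDA (const int i)
  {
    a[i] = 0.0;  // safe: 'a' was captured by value, not via 'this'
  });
  Kokkos::fence();  // parallel_for is asynchronous; wait before timing
}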
KOKKOSStream.hpp (new file, 56 lines)

// Copyright (c) 2015-16 Tom Deakin, Simon McIntosh-Smith,
// University of Bristol HPC
//
// For full license terms please see the LICENSE file distributed with this
// source code

#pragma once

#include <iostream>
#include <stdexcept>

#include <Kokkos_Core.hpp>
#include <Kokkos_Parallel.hpp>
#include <Kokkos_View.hpp>

#include "Stream.h"

#define IMPLEMENTATION_STRING "KOKKOS"

#ifdef KOKKOS_TARGET_CPU
  #define DEVICE Kokkos::OpenMP
#else
  #define DEVICE Kokkos::Cuda
#endif

template <class T>
class KOKKOSStream : public Stream<T>
{
  protected:
    // Size of arrays
    unsigned int array_size;

    // Device side pointers to arrays
    Kokkos::View<double*, DEVICE>* d_a;
    Kokkos::View<double*, DEVICE>* d_b;
    Kokkos::View<double*, DEVICE>* d_c;
    Kokkos::View<double*>::HostMirror* hm_a;
    Kokkos::View<double*>::HostMirror* hm_b;
    Kokkos::View<double*>::HostMirror* hm_c;

  public:

    KOKKOSStream(const unsigned int, const int);
    ~KOKKOSStream();

    virtual void copy() override;
    virtual void add() override;
    virtual void mul() override;
    virtual void triad() override;

    virtual void write_arrays(
            const std::vector<T>& a, const std::vector<T>& b, const std::vector<T>& c) override;
    virtual void read_arrays(
            std::vector<T>& a, std::vector<T>& b, std::vector<T>& c) override;
};
KokkosMakefile (new file, 11 lines)

default: gpu-stream-kokkos

include $(KOKKOS_PATH)/Makefile.kokkos

gpu-stream-kokkos: main.o KOKKOSStream.o
	$(CXX) $(KOKKOS_LDFLAGS) $^ $(KOKKOS_LIBS) -o $@ -DKOKKOS

%.o:%.cpp $(KOKKOS_CPP_DEPENDS)
	$(NVCC_WRAPPER) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) -c $< -DKOKKOS
LICENSE (2 lines changed)

@@ -1,6 +1,6 @@
 *==============================================================================
 *------------------------------------------------------------------------------
-* Copyright 2015: Tom Deakin, Simon McIntosh-Smith, University of Bristol HPC
+* Copyright 2015-16: Tom Deakin, Simon McIntosh-Smith, University of Bristol HPC
 * Based on John D. McCalpin’s original STREAM benchmark for CPUs
 *------------------------------------------------------------------------------
 * License:
Makefile (deleted, 41 lines)

LDLIBS = -l OpenCL
CXXFLAGS = -std=c++11 -O3

PLATFORM = $(shell uname -s)
ifeq ($(PLATFORM), Darwin)
LDLIBS = -framework OpenCL
endif

all: gpu-stream-ocl gpu-stream-cuda gpu-stream-hip

gpu-stream-ocl: ocl-stream.cpp common.o Makefile
	$(CXX) $(CXXFLAGS) -Wno-deprecated-declarations common.o $< -o $@ $(LDLIBS)

common.o: common.cpp common.h Makefile

gpu-stream-cuda: cuda-stream.cu common.o Makefile
ifeq ($(shell which nvcc > /dev/null; echo $$?), 0)
	nvcc $(CXXFLAGS) common.o $< -o $@
else
	$(error "Cannot find nvcc, please install CUDA toolkit")
endif

HIP_PATH?=../../..
HIPCC=$(HIP_PATH)/bin/hipcc

hip-stream.o : hip-stream.cpp
	$(HIPCC) $(CXXFLAGS) -c $< -o $@

gpu-stream-hip: hip-stream.o common.o Makefile
ifeq ($(shell which $(HIPCC) > /dev/null; echo $$?), 0)
	$(HIPCC) $(CXXFLAGS) common.o $< -lm -o $@
else
	$(error "Cannot find $(HIPCC), please install HIP toolkit")
endif

.PHONY: clean

clean:
	rm -f gpu-stream-ocl gpu-stream-cuda gpu-stream-hip *.o
OCLStream.cpp (new file, 255 lines)

// Copyright (c) 2015-16 Tom Deakin, Simon McIntosh-Smith,
// University of Bristol HPC
//
// For full license terms please see the LICENSE file distributed with this
// source code

#include "OCLStream.h"

// Cache list of devices
bool cached = false;
std::vector<cl::Device> devices;
void getDeviceList(void);

std::string kernels{R"CLC(

  constant TYPE scalar = 0.3;

  kernel void copy(
    global const TYPE * restrict a,
    global TYPE * restrict c)
  {
    const size_t i = get_global_id(0);
    c[i] = a[i];
  }

  kernel void mul(
    global TYPE * restrict b,
    global const TYPE * restrict c)
  {
    const size_t i = get_global_id(0);
    b[i] = scalar * c[i];
  }

  kernel void add(
    global const TYPE * restrict a,
    global const TYPE * restrict b,
    global TYPE * restrict c)
  {
    const size_t i = get_global_id(0);
    c[i] = a[i] + b[i];
  }

  kernel void triad(
    global TYPE * restrict a,
    global const TYPE * restrict b,
    global const TYPE * restrict c)
  {
    const size_t i = get_global_id(0);
    a[i] = b[i] + scalar * c[i];
  }

)CLC"};


template <class T>
OCLStream<T>::OCLStream(const unsigned int ARRAY_SIZE, const int device_index)
{
  if (!cached)
    getDeviceList();

  // Setup default OpenCL GPU
  if (device_index >= devices.size())
    throw std::runtime_error("Invalid device index");
  device = devices[device_index];

  // Print out device information
  std::cout << "Using OpenCL device " << getDeviceName(device_index) << std::endl;
  std::cout << "Driver: " << getDeviceDriver(device_index) << std::endl;

  context = cl::Context(device);
  queue = cl::CommandQueue(context);

  // Create program
  cl::Program program(context, kernels);
  if (sizeof(T) == sizeof(double))
  {
    // Check device can do double
    if (!device.getInfo<CL_DEVICE_DOUBLE_FP_CONFIG>())
      throw std::runtime_error("Device does not support double precision, please use --float");
    program.build("-DTYPE=double");
  }
  else if (sizeof(T) == sizeof(float))
    program.build("-DTYPE=float");

  // Create kernels
  copy_kernel = new cl::KernelFunctor<cl::Buffer, cl::Buffer>(program, "copy");
  mul_kernel = new cl::KernelFunctor<cl::Buffer, cl::Buffer>(program, "mul");
  add_kernel = new cl::KernelFunctor<cl::Buffer, cl::Buffer, cl::Buffer>(program, "add");
  triad_kernel = new cl::KernelFunctor<cl::Buffer, cl::Buffer, cl::Buffer>(program, "triad");

  array_size = ARRAY_SIZE;

  // Check buffers fit on the device
  cl_ulong totalmem = device.getInfo<CL_DEVICE_GLOBAL_MEM_SIZE>();
  cl_ulong maxbuffer = device.getInfo<CL_DEVICE_MAX_MEM_ALLOC_SIZE>();
  if (maxbuffer < sizeof(T)*ARRAY_SIZE)
    throw std::runtime_error("Device cannot allocate a buffer big enough");
  if (totalmem < 3*sizeof(T)*ARRAY_SIZE)
    throw std::runtime_error("Device does not have enough memory for all 3 buffers");

  // Create buffers
  d_a = cl::Buffer(context, CL_MEM_READ_WRITE, sizeof(T) * ARRAY_SIZE);
  d_b = cl::Buffer(context, CL_MEM_READ_WRITE, sizeof(T) * ARRAY_SIZE);
  d_c = cl::Buffer(context, CL_MEM_READ_WRITE, sizeof(T) * ARRAY_SIZE);
}

template <class T>
OCLStream<T>::~OCLStream()
{
  delete copy_kernel;
  delete mul_kernel;
  delete add_kernel;
  delete triad_kernel;
}

template <class T>
void OCLStream<T>::copy()
{
  (*copy_kernel)(
    cl::EnqueueArgs(queue, cl::NDRange(array_size)),
    d_a, d_c
  );
  queue.finish();
}

template <class T>
void OCLStream<T>::mul()
{
  (*mul_kernel)(
    cl::EnqueueArgs(queue, cl::NDRange(array_size)),
    d_b, d_c
  );
  queue.finish();
}

template <class T>
void OCLStream<T>::add()
{
  (*add_kernel)(
    cl::EnqueueArgs(queue, cl::NDRange(array_size)),
    d_a, d_b, d_c
  );
  queue.finish();
}

template <class T>
void OCLStream<T>::triad()
{
  (*triad_kernel)(
    cl::EnqueueArgs(queue, cl::NDRange(array_size)),
    d_a, d_b, d_c
  );
  queue.finish();
}

template <class T>
void OCLStream<T>::write_arrays(const std::vector<T>& a, const std::vector<T>& b, const std::vector<T>& c)
{
  cl::copy(queue, a.begin(), a.end(), d_a);
  cl::copy(queue, b.begin(), b.end(), d_b);
  cl::copy(queue, c.begin(), c.end(), d_c);
}

template <class T>
void OCLStream<T>::read_arrays(std::vector<T>& a, std::vector<T>& b, std::vector<T>& c)
{
  cl::copy(queue, d_a, a.begin(), a.end());
  cl::copy(queue, d_b, b.begin(), b.end());
  cl::copy(queue, d_c, c.begin(), c.end());
}

void getDeviceList(void)
{
  // Get list of platforms
  std::vector<cl::Platform> platforms;
  cl::Platform::get(&platforms);

  // Enumerate devices
  for (unsigned i = 0; i < platforms.size(); i++)
  {
    std::vector<cl::Device> plat_devices;
    platforms[i].getDevices(CL_DEVICE_TYPE_ALL, &plat_devices);
    devices.insert(devices.end(), plat_devices.begin(), plat_devices.end());
  }
  cached = true;
}

void listDevices(void)
{
  getDeviceList();

  // Print device names
  if (devices.size() == 0)
  {
    std::cerr << "No devices found." << std::endl;
  }
  else
  {
    std::cout << std::endl;
    std::cout << "Devices:" << std::endl;
    for (int i = 0; i < devices.size(); i++)
    {
      std::cout << i << ": " << getDeviceName(i) << std::endl;
    }
    std::cout << std::endl;
  }
}

std::string getDeviceName(const int device)
{
  if (!cached)
    getDeviceList();

  std::string name;
  cl_device_info info = CL_DEVICE_NAME;

  if (device < devices.size())
  {
    devices[device].getInfo(info, &name);
  }
  else
  {
    throw std::runtime_error("Error asking for name for non-existent device");
  }

  return name;
}

std::string getDeviceDriver(const int device)
{
  if (!cached)
    getDeviceList();

  std::string driver;

  if (device < devices.size())
  {
    devices[device].getInfo(CL_DRIVER_VERSION, &driver);
  }
  else
  {
    throw std::runtime_error("Error asking for driver for non-existent device");
  }

  return driver;
}


template class OCLStream<float>;
template class OCLStream<double>;
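Note: OCLStream.h (below) defines CL_HPP_ENABLE_EXCEPTIONS before including cl2.hpp, so failing OpenCL calls throw cl::Error instead of returning status codes. A hypothetical caller would wrap construction like this (size and device index are made up):

#include <iostream>
#include "OCLStream.h"

void example()
{
  try
  {
    OCLStream<double> stream(33554432, 0);
    stream.copy();
  }
  catch (cl::Error& err)
  {
    // err.err() carries the raw OpenCL status code
    std::cerr << "OpenCL error: " << err.what()
              << " (" << err.err() << ")" << std::endl;
  }
}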
OCLStream.h (new file, 61 lines)

// Copyright (c) 2015-16 Tom Deakin, Simon McIntosh-Smith,
// University of Bristol HPC
//
// For full license terms please see the LICENSE file distributed with this
// source code

#pragma once

#include <iostream>
#include <stdexcept>

#define CL_HPP_ENABLE_EXCEPTIONS
#define CL_HPP_TARGET_OPENCL_VERSION 120
#define CL_HPP_MINIMUM_OPENCL_VERSION 120

#include "CL/cl2.hpp"

#include "Stream.h"

#define IMPLEMENTATION_STRING "OpenCL"

template <class T>
class OCLStream : public Stream<T>
{
  protected:
    // Size of arrays
    unsigned int array_size;

    // Device side pointers to arrays
    cl::Buffer d_a;
    cl::Buffer d_b;
    cl::Buffer d_c;

    // OpenCL objects
    cl::Device device;
    cl::Context context;
    cl::CommandQueue queue;

    cl::KernelFunctor<cl::Buffer, cl::Buffer> *copy_kernel;
    cl::KernelFunctor<cl::Buffer, cl::Buffer> *mul_kernel;
    cl::KernelFunctor<cl::Buffer, cl::Buffer, cl::Buffer> *add_kernel;
    cl::KernelFunctor<cl::Buffer, cl::Buffer, cl::Buffer> *triad_kernel;

  public:

    OCLStream(const unsigned int, const int);
    ~OCLStream();

    virtual void copy() override;
    virtual void add() override;
    virtual void mul() override;
    virtual void triad() override;

    virtual void write_arrays(const std::vector<T>& a, const std::vector<T>& b, const std::vector<T>& c) override;
    virtual void read_arrays(std::vector<T>& a, std::vector<T>& b, std::vector<T>& c) override;

};

// Populate the devices list
void getDeviceList(void);
OMP3Stream.cpp (new file, 111 lines)

// Copyright (c) 2015-16 Tom Deakin, Simon McIntosh-Smith,
// University of Bristol HPC
//
// For full license terms please see the LICENSE file distributed with this
// source code

#include "OMP3Stream.h"

template <class T>
OMP3Stream<T>::OMP3Stream(const unsigned int ARRAY_SIZE, T *a, T *b, T *c)
{
  array_size = ARRAY_SIZE;
  this->a = (T*)malloc(sizeof(T)*array_size);
  this->b = (T*)malloc(sizeof(T)*array_size);
  this->c = (T*)malloc(sizeof(T)*array_size);
}

template <class T>
OMP3Stream<T>::~OMP3Stream()
{
  free(a);
  free(b);
  free(c);
}


template <class T>
void OMP3Stream<T>::write_arrays(const std::vector<T>& h_a, const std::vector<T>& h_b, const std::vector<T>& h_c)
{
  #pragma omp parallel for
  for (int i = 0; i < array_size; i++)
  {
    a[i] = h_a[i];
    b[i] = h_b[i];
    c[i] = h_c[i];
  }
}

template <class T>
void OMP3Stream<T>::read_arrays(std::vector<T>& h_a, std::vector<T>& h_b, std::vector<T>& h_c)
{
  #pragma omp parallel for
  for (int i = 0; i < array_size; i++)
  {
    h_a[i] = a[i];
    h_b[i] = b[i];
    h_c[i] = c[i];
  }
}

template <class T>
void OMP3Stream<T>::copy()
{
  #pragma omp parallel for
  for (int i = 0; i < array_size; i++)
  {
    c[i] = a[i];
  }
}

template <class T>
void OMP3Stream<T>::mul()
{
  const T scalar = 0.3;
  #pragma omp parallel for
  for (int i = 0; i < array_size; i++)
  {
    b[i] = scalar * c[i];
  }
}

template <class T>
void OMP3Stream<T>::add()
{
  #pragma omp parallel for
  for (int i = 0; i < array_size; i++)
  {
    c[i] = a[i] + b[i];
  }
}

template <class T>
void OMP3Stream<T>::triad()
{
  const T scalar = 0.3;
  #pragma omp parallel for
  for (int i = 0; i < array_size; i++)
  {
    a[i] = b[i] + scalar * c[i];
  }
}

void listDevices(void)
{
  std::cout << "0: CPU" << std::endl;
}

std::string getDeviceName(const int)
{
  return std::string("Device name unavailable");
}

std::string getDeviceDriver(const int)
{
  return std::string("Device driver unavailable");
}


template class OMP3Stream<float>;
template class OMP3Stream<double>;
40  OMP3Stream.h  Normal file
@@ -0,0 +1,40 @@

// Copyright (c) 2015-16 Tom Deakin, Simon McIntosh-Smith,
// University of Bristol HPC
//
// For full license terms please see the LICENSE file distributed with this
// source code

#pragma once

#include <iostream>
#include <stdexcept>

#include "Stream.h"

#define IMPLEMENTATION_STRING "Reference OpenMP"

template <class T>
class OMP3Stream : public Stream<T>
{
  protected:
    // Size of arrays
    unsigned int array_size;
    // Device side pointers
    T *a;
    T *b;
    T *c;

  public:
    OMP3Stream(const unsigned int, T*, T*, T*);
    ~OMP3Stream();

    virtual void copy() override;
    virtual void add() override;
    virtual void mul() override;
    virtual void triad() override;

    virtual void write_arrays(const std::vector<T>& a, const std::vector<T>& b, const std::vector<T>& c) override;
    virtual void read_arrays(std::vector<T>& a, std::vector<T>& b, std::vector<T>& c) override;

};
140  OMP45Stream.cpp  Normal file
@@ -0,0 +1,140 @@

// Copyright (c) 2015-16 Tom Deakin, Simon McIntosh-Smith,
// University of Bristol HPC
//
// For full license terms please see the LICENSE file distributed with this
// source code

#include "OMP45Stream.h"

template <class T>
OMP45Stream<T>::OMP45Stream(const unsigned int ARRAY_SIZE, T *a, T *b, T *c, int device)
{
  omp_set_default_device(device);

  array_size = ARRAY_SIZE;

  // Set up data region on device
  this->a = a;
  this->b = b;
  this->c = c;
  #pragma omp target enter data map(to: a[0:array_size], b[0:array_size], c[0:array_size])
  {}
}

template <class T>
OMP45Stream<T>::~OMP45Stream()
{
  // End data region on device
  unsigned int array_size = this->array_size;
  T *a = this->a;
  T *b = this->b;
  T *c = this->c;
  #pragma omp target exit data map(release: a[0:array_size], b[0:array_size], c[0:array_size])
  {}
}

template <class T>
void OMP45Stream<T>::write_arrays(const std::vector<T>& h_a, const std::vector<T>& h_b, const std::vector<T>& h_c)
{
  T *a = this->a;
  T *b = this->b;
  T *c = this->c;
  #pragma omp target update to(a[0:array_size], b[0:array_size], c[0:array_size])
  {}
}

template <class T>
void OMP45Stream<T>::read_arrays(std::vector<T>& h_a, std::vector<T>& h_b, std::vector<T>& h_c)
{
  T *a = this->a;
  T *b = this->b;
  T *c = this->c;
  #pragma omp target update from(a[0:array_size], b[0:array_size], c[0:array_size])
  {}
}

template <class T>
void OMP45Stream<T>::copy()
{
  unsigned int array_size = this->array_size;
  T *a = this->a;
  T *c = this->c;
  // The arrays are already mapped by the constructor's "target enter data",
  // so the map(to: ...) clause finds the data present and does not re-copy it.
  #pragma omp target teams distribute parallel for simd map(to: a[0:array_size], c[0:array_size])
  for (int i = 0; i < array_size; i++)
  {
    c[i] = a[i];
  }
}

template <class T>
void OMP45Stream<T>::mul()
{
  const T scalar = 0.3;

  unsigned int array_size = this->array_size;
  T *b = this->b;
  T *c = this->c;
  #pragma omp target teams distribute parallel for simd map(to: b[0:array_size], c[0:array_size])
  for (int i = 0; i < array_size; i++)
  {
    b[i] = scalar * c[i];
  }
}

template <class T>
void OMP45Stream<T>::add()
{
  unsigned int array_size = this->array_size;
  T *a = this->a;
  T *b = this->b;
  T *c = this->c;
  #pragma omp target teams distribute parallel for simd map(to: a[0:array_size], b[0:array_size], c[0:array_size])
  for (int i = 0; i < array_size; i++)
  {
    c[i] = a[i] + b[i];
  }
}

template <class T>
void OMP45Stream<T>::triad()
{
  const T scalar = 0.3;

  unsigned int array_size = this->array_size;
  T *a = this->a;
  T *b = this->b;
  T *c = this->c;
  #pragma omp target teams distribute parallel for simd map(to: a[0:array_size], b[0:array_size], c[0:array_size])
  for (int i = 0; i < array_size; i++)
  {
    a[i] = b[i] + scalar * c[i];
  }
}

void listDevices(void)
{
  // Get number of devices
  int count = omp_get_num_devices();

  // Print device list
  if (count == 0)
  {
    std::cerr << "No devices found." << std::endl;
  }
  else
  {
    std::cout << "There are " << count << " devices." << std::endl;
  }
}

std::string getDeviceName(const int)
{
  return std::string("Device name unavailable");
}

std::string getDeviceDriver(const int)
{
  return std::string("Device driver unavailable");
}

template class OMP45Stream<float>;
template class OMP45Stream<double>;
45  OMP45Stream.h  Normal file
@@ -0,0 +1,45 @@

// Copyright (c) 2015-16 Tom Deakin, Simon McIntosh-Smith,
// University of Bristol HPC
//
// For full license terms please see the LICENSE file distributed with this
// source code

#pragma once

#include <iostream>
#include <stdexcept>

#include "Stream.h"

#include <omp.h>

#define IMPLEMENTATION_STRING "OpenMP 4.5"

template <class T>
class OMP45Stream : public Stream<T>
{
  protected:
    // Size of arrays
    unsigned int array_size;

    // Device side pointers
    T *a;
    T *b;
    T *c;

  public:
    OMP45Stream(const unsigned int, T*, T*, T*, int);
    ~OMP45Stream();

    virtual void copy() override;
    virtual void add() override;
    virtual void mul() override;
    virtual void triad() override;

    virtual void write_arrays(const std::vector<T>& a, const std::vector<T>& b, const std::vector<T>& c) override;
    virtual void read_arrays(std::vector<T>& a, std::vector<T>& b, std::vector<T>& c) override;

};
130  RAJAStream.cpp  Normal file
@@ -0,0 +1,130 @@

// Copyright (c) 2015-16 Tom Deakin, Simon McIntosh-Smith,
// University of Bristol HPC
//
// For full license terms please see the LICENSE file distributed with this
// source code

#include "RAJAStream.hpp"

using RAJA::forall;
using RAJA::RangeSegment;

template <class T>
RAJAStream<T>::RAJAStream(const unsigned int ARRAY_SIZE, const int device_index)
    : array_size(ARRAY_SIZE)
{
  RangeSegment seg(0, ARRAY_SIZE);
  index_set.push_back(seg);

#ifdef RAJA_TARGET_CPU
  d_a = new T[ARRAY_SIZE];
  d_b = new T[ARRAY_SIZE];
  d_c = new T[ARRAY_SIZE];
#else
  cudaMallocManaged((void**)&d_a, sizeof(T)*ARRAY_SIZE, cudaMemAttachGlobal);
  cudaMallocManaged((void**)&d_b, sizeof(T)*ARRAY_SIZE, cudaMemAttachGlobal);
  cudaMallocManaged((void**)&d_c, sizeof(T)*ARRAY_SIZE, cudaMemAttachGlobal);
  cudaDeviceSynchronize();
#endif
}

template <class T>
RAJAStream<T>::~RAJAStream()
{
#ifdef RAJA_TARGET_CPU
  delete[] d_a;
  delete[] d_b;
  delete[] d_c;
#else
  cudaFree(d_a);
  cudaFree(d_b);
  cudaFree(d_c);
#endif
}

template <class T>
void RAJAStream<T>::write_arrays(
    const std::vector<T>& a, const std::vector<T>& b, const std::vector<T>& c)
{
  std::copy(a.begin(), a.end(), d_a);
  std::copy(b.begin(), b.end(), d_b);
  std::copy(c.begin(), c.end(), d_c);
}

template <class T>
void RAJAStream<T>::read_arrays(
    std::vector<T>& a, std::vector<T>& b, std::vector<T>& c)
{
  std::copy(d_a, d_a + array_size, a.data());
  std::copy(d_b, d_b + array_size, b.data());
  std::copy(d_c, d_c + array_size, c.data());
}

template <class T>
void RAJAStream<T>::copy()
{
  T* a = d_a;
  T* c = d_c;
  forall<policy>(index_set, [=] RAJA_DEVICE (int index)
  {
    c[index] = a[index];
  });
}

template <class T>
void RAJAStream<T>::mul()
{
  T* b = d_b;
  T* c = d_c;
  const T scalar = 0.3;
  forall<policy>(index_set, [=] RAJA_DEVICE (int index)
  {
    b[index] = scalar*c[index];
  });
}

template <class T>
void RAJAStream<T>::add()
{
  T* a = d_a;
  T* b = d_b;
  T* c = d_c;
  forall<policy>(index_set, [=] RAJA_DEVICE (int index)
  {
    c[index] = a[index] + b[index];
  });
}

template <class T>
void RAJAStream<T>::triad()
{
  T* a = d_a;
  T* b = d_b;
  T* c = d_c;
  const T scalar = 0.3;
  forall<policy>(index_set, [=] RAJA_DEVICE (int index)
  {
    a[index] = b[index] + scalar*c[index];
  });
}

void listDevices(void)
{
  std::cout << "This is not the device you are looking for.";
}

std::string getDeviceName(const int device)
{
  return "RAJA";
}

std::string getDeviceDriver(const int device)
{
  return "RAJA";
}

template class RAJAStream<float>;
template class RAJAStream<double>;
58  RAJAStream.hpp  Normal file
@@ -0,0 +1,58 @@

// Copyright (c) 2015-16 Tom Deakin, Simon McIntosh-Smith,
// University of Bristol HPC
//
// For full license terms please see the LICENSE file distributed with this
// source code

#pragma once

#include <iostream>
#include <stdexcept>
#include "RAJA/RAJA.hxx"

#include "Stream.h"

#define IMPLEMENTATION_STRING "RAJA"

#ifdef RAJA_TARGET_CPU
typedef RAJA::IndexSet::ExecPolicy<
    RAJA::seq_segit,
    RAJA::omp_parallel_for_exec> policy;
#else
const size_t block_size = 128;
typedef RAJA::IndexSet::ExecPolicy<
    RAJA::seq_segit,
    RAJA::cuda_exec<block_size>> policy;
#endif

template <class T>
class RAJAStream : public Stream<T>
{
  protected:
    // Size of arrays
    unsigned int array_size;

    // Contains iteration space
    RAJA::IndexSet index_set;

    // Device side pointers to arrays
    T* d_a;
    T* d_b;
    T* d_c;

  public:

    RAJAStream(const unsigned int, const int);
    ~RAJAStream();

    virtual void copy() override;
    virtual void add() override;
    virtual void mul() override;
    virtual void triad() override;

    virtual void write_arrays(
        const std::vector<T>& a, const std::vector<T>& b, const std::vector<T>& c) override;
    virtual void read_arrays(
        std::vector<T>& a, std::vector<T>& b, std::vector<T>& c) override;
};
29  README.md
@@ -6,24 +6,26 @@ This benchmark is similar in spirit, and based on, the STREAM benchmark [1] for

 Unlike other GPU memory bandwidth benchmarks this does *not* include the PCIe transfer time.

+There are multiple implementations of this benchmark in a variety of programming models.
+Currently implemented are:
+ - OpenCL
+ - CUDA
+ - OpenACC
+ - OpenMP 3 and 4.5
+ - Kokkos
+ - RAJA
+ - SYCL
+
 Usage
 -----

-Build the OpenCL and CUDA binaries with `make` (CUDA version requires CUDA >= v6.5)
+CMake 3.2 or above is required.
+You will also need the drivers, compiler and other software applicable to whichever implementations you would like to build; the build system is designed to build only implementations in programming models that your system supports.

-Run the OpenCL version with `./gpu-stream-ocl` and the CUDA version with `./gpu-stream-cuda`
+Generate the Makefile with `cmake .`

-For the HIP version, follow the instructions on the following blog to properly install the ROCK and ROCR drivers:
-http://gpuopen.com/getting-started-with-boltzmann-components-platforms-installation/
-Install the HCC compiler:
-https://bitbucket.org/multicoreware/hcc/wiki/Home
-Install HIP:
-https://github.com/GPUOpen-ProfessionalCompute-Tools/HIP
-
-Build the HIP binaries with `make gpu-stream-hip`, run with `./gpu-stream-hip`
-
-Android
--------
-
+Android (outdated instructions)
+-------------------------------

 Assuming you have a recent Android NDK available, you can use the
 toolchain that it provides to build GPU-STREAM. You should first
@@ -57,6 +59,7 @@ Run GPU-STREAM from an adb shell:

 # Use float if device doesn't support double, and reduce array size
 ./gpu-stream-ocl --float -n 6 -s 10000000

+
 Results
 -------
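To make the new CMake flow concrete, a typical session might look like this (binary names are taken from .gitignore; the device index and iteration count are illustrative, not prescribed by the diff):

# Illustrative session, assuming the relevant toolchains are installed:
cmake .                      # CMake 3.2+; generates the Makefile
make                         # builds only the models your system supports
./gpu-stream-ocl --list      # enumerate available devices
./gpu-stream-ocl --device 0 -n 10 -s 52428800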
231  SYCLStream.cpp  Normal file
@@ -0,0 +1,231 @@

// Copyright (c) 2015-16 Tom Deakin, Simon McIntosh-Smith,
// University of Bristol HPC
//
// For full license terms please see the LICENSE file distributed with this
// source code

#include "SYCLStream.h"

#include <iostream>

using namespace cl::sycl;

#define WGSIZE 64

// Cache list of devices
bool cached = false;
std::vector<device> devices;
void getDeviceList(void);

template <class T>
SYCLStream<T>::SYCLStream(const unsigned int ARRAY_SIZE, const int device_index)
{
  if (!cached)
    getDeviceList();

  // The array size must be divisible by WGSIZE
  if (ARRAY_SIZE % WGSIZE != 0)
  {
    std::stringstream ss;
    ss << "Array size must be a multiple of " << WGSIZE;
    throw std::runtime_error(ss.str());
  }

  array_size = ARRAY_SIZE;

  if (device_index >= devices.size())
    throw std::runtime_error("Invalid device index");
  device dev = devices[device_index];

  // Print out device information
  std::cout << "Using SYCL device " << getDeviceName(device_index) << std::endl;
  std::cout << "Driver: " << getDeviceDriver(device_index) << std::endl;

  queue = new cl::sycl::queue(dev);

  // Create buffers
  d_a = new buffer<T>(array_size);
  d_b = new buffer<T>(array_size);
  d_c = new buffer<T>(array_size);
}

template <class T>
SYCLStream<T>::~SYCLStream()
{
  delete d_a;
  delete d_b;
  delete d_c;

  delete queue;
}

template <class T>
void SYCLStream<T>::copy()
{
  queue->submit([&](handler &cgh)
  {
    auto ka = d_a->template get_access<access::mode::read>(cgh);
    auto kc = d_c->template get_access<access::mode::write>(cgh);
    cgh.parallel_for<class copy>(nd_range<1>{array_size, WGSIZE}, [=](nd_item<1> item)
    {
      kc[item.get_global()] = ka[item.get_global()];
    });
  });
  queue->wait();
}

template <class T>
void SYCLStream<T>::mul()
{
  const T scalar = 0.3;
  queue->submit([&](handler &cgh)
  {
    auto kb = d_b->template get_access<access::mode::write>(cgh);
    auto kc = d_c->template get_access<access::mode::read>(cgh);
    cgh.parallel_for<class mul>(nd_range<1>{array_size, WGSIZE}, [=](nd_item<1> item)
    {
      kb[item.get_global()] = scalar * kc[item.get_global()];
    });
  });
  queue->wait();
}

template <class T>
void SYCLStream<T>::add()
{
  queue->submit([&](handler &cgh)
  {
    auto ka = d_a->template get_access<access::mode::read>(cgh);
    auto kb = d_b->template get_access<access::mode::read>(cgh);
    auto kc = d_c->template get_access<access::mode::write>(cgh);
    cgh.parallel_for<class add>(nd_range<1>{array_size, WGSIZE}, [=](nd_item<1> item)
    {
      kc[item.get_global()] = ka[item.get_global()] + kb[item.get_global()];
    });
  });
  queue->wait();
}

template <class T>
void SYCLStream<T>::triad()
{
  const T scalar = 0.3;
  queue->submit([&](handler &cgh)
  {
    auto ka = d_a->template get_access<access::mode::write>(cgh);
    auto kb = d_b->template get_access<access::mode::read>(cgh);
    auto kc = d_c->template get_access<access::mode::read>(cgh);
    cgh.parallel_for<class triad>(nd_range<1>{array_size, WGSIZE}, [=](nd_item<1> item)
    {
      ka[item.get_global()] = kb[item.get_global()] + scalar * kc[item.get_global()];
    });
  });
  queue->wait();
}

template <class T>
void SYCLStream<T>::write_arrays(const std::vector<T>& a, const std::vector<T>& b, const std::vector<T>& c)
{
  auto _a = d_a->template get_access<access::mode::write, access::target::host_buffer>();
  auto _b = d_b->template get_access<access::mode::write, access::target::host_buffer>();
  auto _c = d_c->template get_access<access::mode::write, access::target::host_buffer>();
  for (int i = 0; i < array_size; i++)
  {
    _a[i] = a[i];
    _b[i] = b[i];
    _c[i] = c[i];
  }
}

template <class T>
void SYCLStream<T>::read_arrays(std::vector<T>& a, std::vector<T>& b, std::vector<T>& c)
{
  auto _a = d_a->template get_access<access::mode::read, access::target::host_buffer>();
  auto _b = d_b->template get_access<access::mode::read, access::target::host_buffer>();
  auto _c = d_c->template get_access<access::mode::read, access::target::host_buffer>();
  for (int i = 0; i < array_size; i++)
  {
    a[i] = _a[i];
    b[i] = _b[i];
    c[i] = _c[i];
  }
}

void getDeviceList(void)
{
  // Get list of platforms
  std::vector<platform> platforms = platform::get_platforms();

  // Enumerate devices
  for (unsigned i = 0; i < platforms.size(); i++)
  {
    std::vector<device> plat_devices = platforms[i].get_devices();
    devices.insert(devices.end(), plat_devices.begin(), plat_devices.end());
  }
  cached = true;
}

void listDevices(void)
{
  getDeviceList();

  // Print device names
  if (devices.size() == 0)
  {
    std::cerr << "No devices found." << std::endl;
  }
  else
  {
    std::cout << std::endl;
    std::cout << "Devices:" << std::endl;
    for (int i = 0; i < devices.size(); i++)
    {
      std::cout << i << ": " << getDeviceName(i) << std::endl;
    }
    std::cout << std::endl;
  }
}

std::string getDeviceName(const int device)
{
  if (!cached)
    getDeviceList();

  std::string name;

  if (device < devices.size())
  {
    name = devices[device].get_info<info::device::name>();
  }
  else
  {
    throw std::runtime_error("Error asking for name for non-existent device");
  }

  return name;
}

std::string getDeviceDriver(const int device)
{
  if (!cached)
    getDeviceList();

  std::string driver;

  if (device < devices.size())
  {
    driver = devices[device].get_info<info::device::driver_version>();
  }
  else
  {
    throw std::runtime_error("Error asking for driver for non-existent device");
  }

  return driver;
}


// TODO: Fix kernel names to allow multiple template specializations
//template class SYCLStream<float>;
template class SYCLStream<double>;
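On the kernel-naming TODO directly above: `class copy`, `class mul`, etc. serve as the kernel name types, so a second template instantiation of SYCLStream would reuse the same names. A common remedy, sketched here as an assumption rather than this commit's code, is to template the name types on T:

// Hypothetical sketch of the usual fix, not part of this commit:
// forward-declare kernel-name types templated on the element type,
// so SYCLStream<float> and SYCLStream<double> get distinct names.
template <typename T> class copy_kernel_name;

// ... then inside SYCLStream<T>::copy():
cgh.parallel_for<copy_kernel_name<T>>(
    nd_range<1>{array_size, WGSIZE}, [=](nd_item<1> item)
{
  kc[item.get_global()] = ka[item.get_global()];
});
// With distinct names per T, the float instantiation could be re-enabled.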
47  SYCLStream.h  Normal file
@@ -0,0 +1,47 @@

// Copyright (c) 2015-16 Tom Deakin, Simon McIntosh-Smith,
// University of Bristol HPC
//
// For full license terms please see the LICENSE file distributed with this
// source code

#pragma once

#include <sstream>

#include "Stream.h"

#include "CL/sycl.hpp"

#define IMPLEMENTATION_STRING "SYCL"

template <class T>
class SYCLStream : public Stream<T>
{
  protected:
    // Size of arrays
    unsigned int array_size;

    // SYCL objects
    cl::sycl::queue *queue;
    cl::sycl::buffer<T> *d_a;
    cl::sycl::buffer<T> *d_b;
    cl::sycl::buffer<T> *d_c;

  public:

    SYCLStream(const unsigned int, const int);
    ~SYCLStream();

    virtual void copy() override;
    virtual void add() override;
    virtual void mul() override;
    virtual void triad() override;

    virtual void write_arrays(const std::vector<T>& a, const std::vector<T>& b, const std::vector<T>& c) override;
    virtual void read_arrays(std::vector<T>& a, std::vector<T>& b, std::vector<T>& c) override;

};

// Populate the devices list
void getDeviceList(void);
37  Stream.h  Normal file
@@ -0,0 +1,37 @@

// Copyright (c) 2015-16 Tom Deakin, Simon McIntosh-Smith,
// University of Bristol HPC
//
// For full license terms please see the LICENSE file distributed with this
// source code

#pragma once

#include <vector>
#include <string>

template <class T>
class Stream
{
  public:

    virtual ~Stream(){}

    // Kernels
    // These must be blocking calls
    virtual void copy() = 0;
    virtual void mul() = 0;
    virtual void add() = 0;
    virtual void triad() = 0;

    // Copy memory between host and device
    virtual void write_arrays(const std::vector<T>& a, const std::vector<T>& b, const std::vector<T>& c) = 0;
    virtual void read_arrays(std::vector<T>& a, std::vector<T>& b, std::vector<T>& c) = 0;

};


// Implementation specific device functions
void listDevices(void);
std::string getDeviceName(const int);
std::string getDeviceDriver(const int);
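To show how this interface is meant to be driven, here is a hypothetical host loop (the benchmark's real driver is not part of this hunk; OMP3Stream is used purely as an example, and N and NTIMES are assumed values):

// Hypothetical driver sketch for Stream<T> (assumed names N, NTIMES):
const unsigned int N = 1024 * 1024;
const unsigned int NTIMES = 10;
std::vector<double> a(N, 1.0), b(N, 2.0), c(N, 0.0);

Stream<double> *stream = new OMP3Stream<double>(N, a.data(), b.data(), c.data());
stream->write_arrays(a, b, c);       // host -> device
for (unsigned int k = 0; k < NTIMES; k++)
{
  stream->copy();    // each call is blocking, so it can be
  stream->mul();     // timed directly from the host
  stream->add();
  stream->triad();
}
stream->read_arrays(a, b, c);        // device -> host, then validate
delete stream;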
165  common.cpp
@@ -1,165 +0,0 @@

/*=============================================================================
*------------------------------------------------------------------------------
* Copyright 2015: Tom Deakin, Simon McIntosh-Smith, University of Bristol HPC
* Based on John D. McCalpin’s original STREAM benchmark for CPUs
*------------------------------------------------------------------------------
* License:
*  1. You are free to use this program and/or to redistribute
*     this program.
*  2. You are free to modify this program for your own use,
*     including commercial use, subject to the publication
*     restrictions in item 3.
*  3. You are free to publish results obtained from running this
*     program, or from works that you derive from this program,
*     with the following limitations:
*     3a. In order to be referred to as "GPU-STREAM benchmark results",
*         published results must be in conformance to the GPU-STREAM
*         Run Rules published at
*         http://github.com/UoB-HPC/GPU-STREAM/wiki/Run-Rules
*         and incorporated herein by reference.
*         The copyright holders retain the
*         right to determine conformity with the Run Rules.
*     3b. Results based on modified source code or on runs not in
*         accordance with the GPU-STREAM Run Rules must be clearly
*         labelled whenever they are published. Examples of
*         proper labelling include:
*         "tuned GPU-STREAM benchmark results"
*         "based on a variant of the GPU-STREAM benchmark code"
*         Other comparable, clear and reasonable labelling is
*         acceptable.
*     3c. Submission of results to the GPU-STREAM benchmark web site
*         is encouraged, but not required.
*  4. Use of this program or creation of derived works based on this
*     program constitutes acceptance of these licensing restrictions.
*  5. Absolutely no warranty is expressed or implied.
*-----------------------------------------------------------------------------*/

#include "common.h"

// Default array size 50 * 2^20 (50*8 Mebibytes double precision)
// Use binary powers of two so divides 1024
unsigned int ARRAY_SIZE = 52428800;
size_t ARRAY_PAD_BYTES = 0;

unsigned int NTIMES = 10;

bool useFloat = false;
unsigned int groups = 0;
unsigned int groupSize = 1024;

unsigned int deviceIndex = 0;

int parseUInt(const char *str, unsigned int *output)
{
  char *next;
  *output = strtoul(str, &next, 10);
  return !strlen(next);
}

int parseSize(const char *str, size_t *output)
{
  char *next;
  *output = strtoull(str, &next, 0);
  int l = strlen(str);
  if (l) {
    char c = str[l-1]; // last char.
    if ((c == 'k') || (c == 'K')) {
      *output *= 1024;
    }
    if ((c == 'm') || (c == 'M')) {
      *output *= (1024*1024);
    }
  }
  return !strlen(next);
}

void parseArguments(int argc, char *argv[])
{
  for (int i = 1; i < argc; i++)
  {
    if (!strcmp(argv[i], "--list"))
    {
      listDevices();
      exit(0);
    }
    else if (!strcmp(argv[i], "--device"))
    {
      if (++i >= argc || !parseUInt(argv[i], &deviceIndex))
      {
        std::cout << "Invalid device index" << std::endl;
        exit(1);
      }
    }
    else if (!strcmp(argv[i], "--arraysize") || !strcmp(argv[i], "-s"))
    {
      if (++i >= argc || !parseUInt(argv[i], &ARRAY_SIZE))
      {
        std::cout << "Invalid array size" << std::endl;
        exit(1);
      }
    }
    else if (!strcmp(argv[i], "--numtimes") || !strcmp(argv[i], "-n"))
    {
      if (++i >= argc || !parseUInt(argv[i], &NTIMES))
      {
        std::cout << "Invalid number of times" << std::endl;
        exit(1);
      }
    }
    else if (!strcmp(argv[i], "--groups"))
    {
      if (++i >= argc || !parseUInt(argv[i], &groups))
      {
        std::cout << "Invalid group number" << std::endl;
        exit(1);
      }
    }
    else if (!strcmp(argv[i], "--groupSize"))
    {
      if (++i >= argc || !parseUInt(argv[i], &groupSize))
      {
        std::cout << "Invalid group size" << std::endl;
        exit(1);
      }
    }
    else if (!strcmp(argv[i], "--pad"))
    {
      if (++i >= argc || !parseSize(argv[i], &ARRAY_PAD_BYTES))
      {
        std::cout << "Invalid size" << std::endl;
        exit(1);
      }
    }
    else if (!strcmp(argv[i], "--float"))
    {
      useFloat = true;
      std::cout << "Warning: with 8 or more iterations, expect rounding errors in single precision (this does not apply to AMD devices)" << std::endl;
    }
    else if (!strcmp(argv[i], "--help") || !strcmp(argv[i], "-h"))
    {
      std::cout << std::endl;
      std::cout << "Usage: ./gpu-stream-cuda [OPTIONS]" << std::endl << std::endl;
      std::cout << "Options:" << std::endl;
      std::cout << "  -h  --help               Print this message" << std::endl;
      std::cout << "      --list               List available devices" << std::endl;
      std::cout << "      --device     INDEX   Select device at INDEX" << std::endl;
      std::cout << "  -s  --arraysize  SIZE    Use SIZE elements in the array" << std::endl;
      std::cout << "  -n  --numtimes   NUM     Run the test NUM times (NUM >= 2)" << std::endl;
      std::cout << "      --groups             Set number of groups to launch - each work-item processes multiple array items" << std::endl;
      std::cout << "      --groupSize          Set size of each group (default 1024)" << std::endl;
      std::cout << "      --pad                Add additional array padding. Can use trailing K (KB) or M (MB)" << std::endl;
      std::cout << "      --float              Use floats (rather than doubles)" << std::endl;
      std::cout << std::endl;
      exit(0);
    }
    else
    {
      std::cout << "Unrecognized argument '" << argv[i] << "' (try '--help')"
                << std::endl;
      exit(1);
    }
  }
}
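One behavioural note on parseSize, illustrated with assumed inputs: the multiplier is applied before the trailing-character check, so a suffixed value is scaled but still reported as invalid by the `!strlen(next)` return.

// Assumed inputs illustrating parseSize above:
size_t bytes;
parseSize("4096", &bytes);  // bytes == 4096, returns 1 (accepted)
parseSize("64K",  &bytes);  // bytes == 65536, but returns 0: 'next' still
                            // points at the suffix, so "--pad 64K" is
                            // rejected despite the help text advertising K/M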
115  common.h
@@ -1,115 +0,0 @@

/*=============================================================================
*------------------------------------------------------------------------------
* Copyright 2015: Tom Deakin, Simon McIntosh-Smith, University of Bristol HPC
* Based on John D. McCalpin’s original STREAM benchmark for CPUs
*------------------------------------------------------------------------------
* License:
*  1. You are free to use this program and/or to redistribute
*     this program.
*  2. You are free to modify this program for your own use,
*     including commercial use, subject to the publication
*     restrictions in item 3.
*  3. You are free to publish results obtained from running this
*     program, or from works that you derive from this program,
*     with the following limitations:
*     3a. In order to be referred to as "GPU-STREAM benchmark results",
*         published results must be in conformance to the GPU-STREAM
*         Run Rules published at
*         http://github.com/UoB-HPC/GPU-STREAM/wiki/Run-Rules
*         and incorporated herein by reference.
*         The copyright holders retain the
*         right to determine conformity with the Run Rules.
*     3b. Results based on modified source code or on runs not in
*         accordance with the GPU-STREAM Run Rules must be clearly
*         labelled whenever they are published. Examples of
*         proper labelling include:
*         "tuned GPU-STREAM benchmark results"
*         "based on a variant of the GPU-STREAM benchmark code"
*         Other comparable, clear and reasonable labelling is
*         acceptable.
*     3c. Submission of results to the GPU-STREAM benchmark web site
*         is encouraged, but not required.
*  4. Use of this program or creation of derived works based on this
*     program constitutes acceptance of these licensing restrictions.
*  5. Absolutely no warranty is expressed or implied.
*-----------------------------------------------------------------------------*/

#include <iomanip>
#include <iostream>
#include <cstdlib>
#include <cstring>
#include <limits>
#include <stdexcept>

#define VERSION_STRING "1.0"

extern void parseArguments(int argc, char *argv[]);

extern void listDevices(void);

extern unsigned int ARRAY_SIZE;
extern size_t ARRAY_PAD_BYTES;
extern unsigned int NTIMES;

extern unsigned int groups;
extern unsigned int groupSize;
extern bool useFloat;

extern unsigned int deviceIndex;


template < typename T >
void check_solution(void* a_in, void* b_in, void* c_in)
{
  // Generate correct solution
  T golda = 1.0;
  T goldb = 2.0;
  T goldc = 0.0;

  T * a = static_cast<T*>(a_in);
  T * b = static_cast<T*>(b_in);
  T * c = static_cast<T*>(c_in);

  const T scalar = 3.0;

  for (unsigned int i = 0; i < NTIMES; i++)
  {
    // Double
    goldc = golda;
    goldb = scalar * goldc;
    goldc = golda + goldb;
    golda = goldb + scalar * goldc;
  }

  // Calculate average error
  double erra = 0.0;
  double errb = 0.0;
  double errc = 0.0;

  for (unsigned int i = 0; i < ARRAY_SIZE; i++)
  {
    erra += fabs(a[i] - golda);
    errb += fabs(b[i] - goldb);
    errc += fabs(c[i] - goldc);
  }

  erra /= ARRAY_SIZE;
  errb /= ARRAY_SIZE;
  errc /= ARRAY_SIZE;

  double epsi = std::numeric_limits<T>::epsilon() * 100;

  if (erra > epsi)
    std::cout
      << "Validation failed on a[]. Average error " << erra
      << std::endl;
  if (errb > epsi)
    std::cout
      << "Validation failed on b[]. Average error " << errb
      << std::endl;
  if (errc > epsi)
    std::cout
      << "Validation failed on c[]. Average error " << errc
      << std::endl;
}
9  common.h.in  Normal file
@@ -0,0 +1,9 @@

// Copyright (c) 2015-16 Tom Deakin, Simon McIntosh-Smith,
// University of Bristol HPC
//
// For full license terms please see the LICENSE file distributed with this
// source code

#define VERSION_STRING "@gpu-stream_VERSION_MAJOR@.@gpu-stream_VERSION_MINOR@"
507
cuda-stream.cu
507
cuda-stream.cu
@ -1,507 +0,0 @@
|
|||||||
/*=============================================================================
|
|
||||||
*------------------------------------------------------------------------------
|
|
||||||
* Copyright 2015: Tom Deakin, Simon McIntosh-Smith, University of Bristol HPC
|
|
||||||
* Based on John D. McCalpin’s original STREAM benchmark for CPUs
|
|
||||||
*------------------------------------------------------------------------------
|
|
||||||
* License:
|
|
||||||
* 1. You are free to use this program and/or to redistribute
|
|
||||||
* this program.
|
|
||||||
* 2. You are free to modify this program for your own use,
|
|
||||||
* including commercial use, subject to the publication
|
|
||||||
* restrictions in item 3.
|
|
||||||
* 3. You are free to publish results obtained from running this
|
|
||||||
* program, or from works that you derive from this program,
|
|
||||||
* with the following limitations:
|
|
||||||
* 3a. In order to be referred to as "GPU-STREAM benchmark results",
|
|
||||||
* published results must be in conformance to the GPU-STREAM
|
|
||||||
* Run Rules published at
|
|
||||||
* http://github.com/UoB-HPC/GPU-STREAM/wiki/Run-Rules
|
|
||||||
* and incorporated herein by reference.
|
|
||||||
* The copyright holders retain the
|
|
||||||
* right to determine conformity with the Run Rules.
|
|
||||||
* 3b. Results based on modified source code or on runs not in
|
|
||||||
* accordance with the GPU-STREAM Run Rules must be clearly
|
|
||||||
* labelled whenever they are published. Examples of
|
|
||||||
* proper labelling include:
|
|
||||||
* "tuned GPU-STREAM benchmark results"
|
|
||||||
* "based on a variant of the GPU-STREAM benchmark code"
|
|
||||||
* Other comparable, clear and reasonable labelling is
|
|
||||||
* acceptable.
|
|
||||||
* 3c. Submission of results to the GPU-STREAM benchmark web site
|
|
||||||
* is encouraged, but not required.
|
|
||||||
* 4. Use of this program or creation of derived works based on this
|
|
||||||
* program constitutes acceptance of these licensing restrictions.
|
|
||||||
* 5. Absolutely no warranty is expressed or implied.
|
|
||||||
*———————————————————————————————————-----------------------------------------*/
|
|
||||||
|
|
||||||
|
|
||||||
#include <iostream>
|
|
||||||
#include <fstream>
|
|
||||||
#include <vector>
|
|
||||||
#include <chrono>
|
|
||||||
#include <cfloat>
|
|
||||||
#include <cmath>
|
|
||||||
|
|
||||||
#include <cuda.h>
|
|
||||||
#include "common.h"
|
|
||||||
|
|
||||||
std::string getDeviceName(int device);
|
|
||||||
int getDriver(void);
|
|
||||||
|
|
||||||
// Code to check CUDA errors
|
|
||||||
void check_cuda_error(void)
|
|
||||||
{
|
|
||||||
cudaError_t err = cudaGetLastError();
|
|
||||||
if (err != cudaSuccess)
|
|
||||||
{
|
|
||||||
std::cerr
|
|
||||||
<< "Error: "
|
|
||||||
<< cudaGetErrorString(err)
|
|
||||||
<< std::endl;
|
|
||||||
exit(err);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
// looper function place more work inside each work item.
|
|
||||||
// Goal is reduce the dispatch overhead for each group, and also give more controlover the order of memory operations
|
|
||||||
template <typename T>
|
|
||||||
__global__ void
|
|
||||||
copy_looper(const T * a, T * c, int ARRAY_SIZE)
|
|
||||||
{
|
|
||||||
int offset = (blockDim.x * blockIdx.x + threadIdx.x);
|
|
||||||
int stride = blockDim.x * gridDim.x;
|
|
||||||
|
|
||||||
for (int i=offset; i<ARRAY_SIZE; i+=stride) {
|
|
||||||
c[i] = a[i];
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
template <typename T>
|
|
||||||
__global__ void
|
|
||||||
mul_looper(T * b, const T * c, int ARRAY_SIZE)
|
|
||||||
{
|
|
||||||
int offset = blockDim.x * blockIdx.x + threadIdx.x;
|
|
||||||
int stride = blockDim.x * gridDim.x;
|
|
||||||
const T scalar = 3.0;
|
|
||||||
|
|
||||||
for (int i=offset; i<ARRAY_SIZE; i+=stride) {
|
|
||||||
b[i] = scalar * c[i];
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
template <typename T>
|
|
||||||
__global__ void
|
|
||||||
add_looper(const T * a, const T * b, T * c, int ARRAY_SIZE)
|
|
||||||
{
|
|
||||||
int offset = blockDim.x * blockIdx.x + threadIdx.x;
|
|
||||||
int stride = blockDim.x * gridDim.x;
|
|
||||||
|
|
||||||
for (int i=offset; i<ARRAY_SIZE; i+=stride) {
|
|
||||||
c[i] = a[i] + b[i];
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
template <typename T>
|
|
||||||
__global__ void
|
|
||||||
triad_looper( T * a, const T * b, const T * c, int ARRAY_SIZE)
|
|
||||||
{
|
|
||||||
int offset = blockDim.x * blockIdx.x + threadIdx.x;
|
|
||||||
int stride = blockDim.x * gridDim.x;
|
|
||||||
const T scalar = 3.0;
|
|
||||||
|
|
||||||
for (int i=offset; i<ARRAY_SIZE; i+=stride) {
|
|
||||||
a[i] = b[i] + scalar * c[i];
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
template <typename T>
|
|
||||||
__global__ void copy(const T * a, T * c)
|
|
||||||
{
|
|
||||||
const int i = blockDim.x * blockIdx.x + threadIdx.x;
|
|
||||||
c[i] = a[i];
|
|
||||||
}
|
|
||||||
|
|
||||||
template <typename T>
|
|
||||||
__global__ void mul(T * b, const T * c)
|
|
||||||
{
|
|
||||||
const T scalar = 3.0;
|
|
||||||
const int i = blockDim.x * blockIdx.x + threadIdx.x;
|
|
||||||
b[i] = scalar * c[i];
|
|
||||||
}
|
|
||||||
|
|
||||||
template <typename T>
|
|
||||||
__global__ void add(const T * a, const T * b, T * c)
|
|
||||||
{
|
|
||||||
const int i = blockDim.x * blockIdx.x + threadIdx.x;
|
|
||||||
c[i] = a[i] + b[i];
|
|
||||||
}
|
|
||||||
|
|
||||||
template <typename T>
|
|
||||||
__global__ void triad(T * a, const T * b, const T * c)
|
|
||||||
{
|
|
||||||
const T scalar = 3.0;
|
|
||||||
const int i = blockDim.x * blockIdx.x + threadIdx.x;
|
|
||||||
a[i] = b[i] + scalar * c[i];
|
|
||||||
}
|
|
||||||
|
|
||||||
int main(int argc, char *argv[])
|
|
||||||
{
|
|
||||||
|
|
||||||
// Print out run information
|
|
||||||
std::cout
|
|
||||||
<< "GPU-STREAM" << std::endl
|
|
||||||
<< "Version: " << VERSION_STRING << std::endl
|
|
||||||
<< "Implementation: CUDA" << std::endl;
|
|
||||||
|
|
||||||
parseArguments(argc, argv);
|
|
||||||
|
|
||||||
if (NTIMES < 2)
|
|
||||||
throw std::runtime_error("Chosen number of times is invalid, must be >= 2");
|
|
||||||
|
|
||||||
// Config grid size and group size for kernel launching
|
|
||||||
int gridSize;
|
|
||||||
if (groups) {
|
|
||||||
gridSize = groups * groupSize;
|
|
||||||
} else {
|
|
||||||
gridSize = ARRAY_SIZE;
|
|
||||||
}
|
|
||||||
|
|
||||||
float operationsPerWorkitem = (float)ARRAY_SIZE / (float)gridSize;
|
|
||||||
std::cout << "GridSize: " << gridSize << " work-items" << std::endl;
|
|
||||||
std::cout << "GroupSize: " << groupSize << " work-items" << std::endl;
|
|
||||||
std::cout << "Operations/Work-item: " << operationsPerWorkitem << std::endl;
|
|
||||||
if (groups) std::cout << "Using looper kernels:" << std::endl;
|
|
||||||
|
|
||||||
std::cout << "Precision: ";
|
|
||||||
if (useFloat) std::cout << "float";
|
|
||||||
else std::cout << "double";
|
|
||||||
std::cout << std::endl << std::endl;
|
|
||||||
|
|
||||||
std::cout << "Running kernels " << NTIMES << " times" << std::endl;
|
|
||||||
|
|
||||||
if (ARRAY_SIZE % 1024 != 0)
|
|
||||||
{
|
|
||||||
unsigned int OLD_ARRAY_SIZE = ARRAY_SIZE;
|
|
||||||
ARRAY_SIZE -= ARRAY_SIZE % 1024;
|
|
||||||
std::cout
|
|
||||||
<< "Warning: array size must divide 1024" << std::endl
|
|
||||||
<< "Resizing array from " << OLD_ARRAY_SIZE
|
|
||||||
<< " to " << ARRAY_SIZE << std::endl;
|
|
||||||
if (ARRAY_SIZE == 0)
|
|
||||||
throw std::runtime_error("Array size must be >= 1024");
|
|
||||||
}
|
|
||||||
|
|
||||||
// Get precision (used to reset later)
|
|
||||||
std::streamsize ss = std::cout.precision();
|
|
||||||
|
|
||||||
size_t DATATYPE_SIZE;
|
|
||||||
|
|
||||||
if (useFloat)
|
|
||||||
{
|
|
||||||
DATATYPE_SIZE = sizeof(float);
|
|
||||||
}
|
|
||||||
else
|
|
||||||
{
|
|
||||||
DATATYPE_SIZE = sizeof(double);
|
|
||||||
}
|
|
||||||
|
|
||||||
// Display number of bytes in array
|
|
||||||
std::cout << std::setprecision(1) << std::fixed
|
|
||||||
<< "Array size: " << ARRAY_SIZE*DATATYPE_SIZE/1024.0/1024.0 << " MB"
|
|
||||||
<< " (=" << ARRAY_SIZE*DATATYPE_SIZE/1024.0/1024.0/1024.0 << " GB)"
|
|
||||||
<< std::endl;
|
|
||||||
std::cout << "Total size: " << 3.0*ARRAY_SIZE*DATATYPE_SIZE/1024.0/1024.0 << " MB"
|
|
||||||
<< " (=" << 3.0*ARRAY_SIZE*DATATYPE_SIZE/1024.0/1024.0/1024.0 << " GB)"
|
|
||||||
<< std::endl;
|
|
||||||
|
|
||||||
// Reset precision
|
|
||||||
std::cout.precision(ss);
|
|
||||||
|
|
||||||
// Check device index is in range
|
|
||||||
int count;
|
|
||||||
cudaGetDeviceCount(&count);
|
|
||||||
check_cuda_error();
|
|
||||||
if (deviceIndex >= count)
|
|
||||||
throw std::runtime_error("Chosen device index is invalid");
|
|
||||||
cudaSetDevice(deviceIndex);
|
|
||||||
check_cuda_error();
|
|
||||||
|
|
||||||
// Print out device name
|
|
||||||
std::cout << "Using CUDA device " << getDeviceName(deviceIndex) << std::endl;
|
|
||||||
|
|
||||||
// Print out device CUDA driver version
|
|
||||||
std::cout << "Driver: " << getDriver() << std::endl;
|
|
||||||
|
|
||||||
// Check buffers fit on the device
|
|
||||||
cudaDeviceProp props;
|
|
||||||
cudaGetDeviceProperties(&props, deviceIndex);
|
|
||||||
if (props.totalGlobalMem < 3*DATATYPE_SIZE*ARRAY_SIZE)
|
|
||||||
throw std::runtime_error("Device does not have enough memory for all 3 buffers");
|
|
||||||
|
|
||||||
// Create host vectors
|
|
||||||
void * h_a = malloc(ARRAY_SIZE*DATATYPE_SIZE);
|
|
||||||
void * h_b = malloc(ARRAY_SIZE*DATATYPE_SIZE);
|
|
||||||
void * h_c = malloc(ARRAY_SIZE*DATATYPE_SIZE);
|
|
||||||
|
|
||||||
// Initilise arrays
|
|
||||||
for (unsigned int i = 0; i < ARRAY_SIZE; i++)
|
|
||||||
{
|
|
||||||
if (useFloat)
|
|
||||||
{
|
|
||||||
((float*)h_a)[i] = 1.0f;
|
|
||||||
((float*)h_b)[i] = 2.0f;
|
|
||||||
((float*)h_c)[i] = 0.0f;
|
|
||||||
}
|
|
||||||
else
|
|
||||||
{
|
|
||||||
((double*)h_a)[i] = 1.0;
|
|
||||||
((double*)h_b)[i] = 2.0;
|
|
||||||
((double*)h_c)[i] = 0.0;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
// Create device buffers
|
|
||||||
void * d_a, * d_b, *d_c;
|
|
||||||
cudaMalloc(&d_a, ARRAY_SIZE*DATATYPE_SIZE);
|
|
||||||
check_cuda_error();
|
|
||||||
cudaMalloc(&d_b, ARRAY_SIZE*DATATYPE_SIZE);
|
|
||||||
check_cuda_error();
|
|
||||||
cudaMalloc(&d_c, ARRAY_SIZE*DATATYPE_SIZE);
|
|
||||||
check_cuda_error();
|
|
||||||
|
|
||||||
// Copy host memory to device
|
|
||||||
cudaMemcpy(d_a, h_a, ARRAY_SIZE*DATATYPE_SIZE, cudaMemcpyHostToDevice);
|
|
||||||
check_cuda_error();
|
|
||||||
cudaMemcpy(d_b, h_b, ARRAY_SIZE*DATATYPE_SIZE, cudaMemcpyHostToDevice);
|
|
||||||
check_cuda_error();
|
|
||||||
cudaMemcpy(d_c, h_c, ARRAY_SIZE*DATATYPE_SIZE, cudaMemcpyHostToDevice);
|
|
||||||
check_cuda_error();
|
|
||||||
|
|
||||||
std::cout << "d_a=" << (void*)d_a << std::endl;
|
|
||||||
std::cout << "d_b=" << (void*)d_b << std::endl;
|
|
||||||
std::cout << "d_c=" << (void*)d_c << std::endl;
|
|
||||||
|
|
||||||
// Make sure the copies are finished
|
|
||||||
cudaDeviceSynchronize();
|
|
||||||
check_cuda_error();
|
|
||||||
|
|
||||||
// List of times
|
|
||||||
std::vector< std::vector<double> > timings;
|
|
||||||
|
|
||||||
// Declare timers
|
|
||||||
std::chrono::high_resolution_clock::time_point t1, t2;
|
|
||||||
|
|
||||||
// Main loop
|
|
||||||
for (unsigned int k = 0; k < NTIMES; k++)
|
|
||||||
{
|
|
||||||
std::vector<double> times;
|
|
||||||
t1 = std::chrono::high_resolution_clock::now();
|
|
||||||
if (groups) {
|
|
||||||
if (useFloat)
|
|
||||||
copy_looper<float><<<gridSize,groupSize>>>((float*)d_a, (float*)d_c, ARRAY_SIZE);
|
|
||||||
else
|
|
||||||
copy_looper<double><<<gridSize,groupSize>>>((double*)d_a, (double*)d_c, ARRAY_SIZE);
|
|
||||||
} else {
|
|
||||||
if (useFloat)
|
|
||||||
copy<<<ARRAY_SIZE/1024, 1024>>>((float*)d_a, (float*)d_c);
|
|
||||||
else
|
|
||||||
copy<<<ARRAY_SIZE/1024, 1024>>>((double*)d_a, (double*)d_c);
|
|
||||||
}
|
|
||||||
|
|
||||||
check_cuda_error();
|
|
||||||
cudaDeviceSynchronize();
|
|
||||||
check_cuda_error();
|
|
||||||
t2 = std::chrono::high_resolution_clock::now();
|
|
||||||
times.push_back(std::chrono::duration_cast<std::chrono::duration<double> >(t2 - t1).count());
|
|
||||||
|
|
||||||
|
|
||||||
t1 = std::chrono::high_resolution_clock::now();
|
|
||||||
if (groups) {
|
|
||||||
if (useFloat)
|
|
||||||
mul_looper<float><<<gridSize,groupSize>>>((float*)d_b, (float*)d_c, ARRAY_SIZE);
|
|
      else
        mul_looper<double><<<gridSize,groupSize>>>((double*)d_b, (double*)d_c, ARRAY_SIZE);
    } else {
      if (useFloat)
        mul<<<ARRAY_SIZE/1024, 1024>>>((float*)d_b, (float*)d_c);
      else
        mul<<<ARRAY_SIZE/1024, 1024>>>((double*)d_b, (double*)d_c);
    }
    check_cuda_error();
    cudaDeviceSynchronize();
    check_cuda_error();
    t2 = std::chrono::high_resolution_clock::now();
    times.push_back(std::chrono::duration_cast<std::chrono::duration<double> >(t2 - t1).count());

    t1 = std::chrono::high_resolution_clock::now();
    if (groups) {
      if (useFloat)
        add_looper<float><<<gridSize,groupSize>>>((float*)d_a, (float*)d_b, (float*)d_c, ARRAY_SIZE);
      else
        add_looper<double><<<gridSize,groupSize>>>((double*)d_a, (double*)d_b, (double*)d_c, ARRAY_SIZE);
    } else {
      if (useFloat)
        add<<<ARRAY_SIZE/1024, 1024>>>((float*)d_a, (float*)d_b, (float*)d_c);
      else
        add<<<ARRAY_SIZE/1024, 1024>>>((double*)d_a, (double*)d_b, (double*)d_c);
    }
    check_cuda_error();
    cudaDeviceSynchronize();
    check_cuda_error();
    t2 = std::chrono::high_resolution_clock::now();
    times.push_back(std::chrono::duration_cast<std::chrono::duration<double> >(t2 - t1).count());

    t1 = std::chrono::high_resolution_clock::now();
    if (groups) {
      if (useFloat)
        triad_looper<float><<<gridSize,groupSize>>>((float*)d_a, (float*)d_b, (float*)d_c, ARRAY_SIZE);
      else
        triad_looper<double><<<gridSize,groupSize>>>((double*)d_a, (double*)d_b, (double*)d_c, ARRAY_SIZE);
    } else {
      if (useFloat)
        triad<<<ARRAY_SIZE/1024, 1024>>>((float*)d_a, (float*)d_b, (float*)d_c);
      else
        triad<<<ARRAY_SIZE/1024, 1024>>>((double*)d_a, (double*)d_b, (double*)d_c);
    }
    check_cuda_error();
    cudaDeviceSynchronize();
    check_cuda_error();
    t2 = std::chrono::high_resolution_clock::now();
    times.push_back(std::chrono::duration_cast<std::chrono::duration<double> >(t2 - t1).count());

    timings.push_back(times);

  }

  // Check solutions
  cudaMemcpy(h_a, d_a, ARRAY_SIZE*DATATYPE_SIZE, cudaMemcpyDeviceToHost);
  check_cuda_error();
  cudaMemcpy(h_b, d_b, ARRAY_SIZE*DATATYPE_SIZE, cudaMemcpyDeviceToHost);
  check_cuda_error();
  cudaMemcpy(h_c, d_c, ARRAY_SIZE*DATATYPE_SIZE, cudaMemcpyDeviceToHost);
  check_cuda_error();

  if (useFloat)
  {
    check_solution<float>(h_a, h_b, h_c);
  }
  else
  {
    check_solution<double>(h_a, h_b, h_c);
  }

  // Crunch results
  size_t sizes[4] = {
    2 * DATATYPE_SIZE * ARRAY_SIZE,
    2 * DATATYPE_SIZE * ARRAY_SIZE,
    3 * DATATYPE_SIZE * ARRAY_SIZE,
    3 * DATATYPE_SIZE * ARRAY_SIZE
  };
  double min[4] = {DBL_MAX, DBL_MAX, DBL_MAX, DBL_MAX};
  double max[4] = {0.0, 0.0, 0.0, 0.0};
  double avg[4] = {0.0, 0.0, 0.0, 0.0};

  // Ignore first result
  for (unsigned int i = 1; i < NTIMES; i++)
  {
    for (int j = 0; j < 4; j++)
    {
      avg[j] += timings[i][j];
      min[j] = std::min(min[j], timings[i][j]);
      max[j] = std::max(max[j], timings[i][j]);
    }
  }

  for (int j = 0; j < 4; j++)
    avg[j] /= (double)(NTIMES-1);

  double geomean = 1.0;
  for (int j = 0; j < 4; j++) {
    geomean *= (sizes[j]/min[j]);
  }
  geomean = pow(geomean, 0.25);
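  // Note: the GEOMEAN figure printed below is the geometric mean of the four
  // best-case bandwidths, i.e. (prod_j sizes[j]/min[j])^(1/4), scaled to
  // MBytes/sec the same way as the per-kernel results.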
  // Display results
  std::string labels[] = {"Copy", "Mul", "Add", "Triad"};
  std::cout
    << std::left << std::setw(12) << "Function"
    << std::left << std::setw(12) << "MBytes/sec"
    << std::left << std::setw(12) << "Min (sec)"
    << std::left << std::setw(12) << "Max"
    << std::left << std::setw(12) << "Average"
    << std::endl;

  for (int j = 0; j < 4; j++)
  {
    std::cout
      << std::left << std::setw(12) << labels[j]
      << std::left << std::setw(12) << std::setprecision(3) << 1.0E-06 * sizes[j]/min[j]
      << std::left << std::setw(12) << std::setprecision(5) << min[j]
      << std::left << std::setw(12) << std::setprecision(5) << max[j]
      << std::left << std::setw(12) << std::setprecision(5) << avg[j]
      << std::endl;
  }
  std::cout
    << std::left << std::setw(12) << "GEOMEAN"
    << std::left << std::setw(12) << std::setprecision(3) << 1.0E-06 * geomean
    << std::endl;

  // Free host vectors
  free(h_a);
  free(h_b);
  free(h_c);

  // Free cuda buffers
  cudaFree(d_a);
  check_cuda_error();
  cudaFree(d_b);
  check_cuda_error();
  cudaFree(d_c);
  check_cuda_error();

}

std::string getDeviceName(int device)
{
  struct cudaDeviceProp prop;
  cudaGetDeviceProperties(&prop, device);
  check_cuda_error();
  return std::string(prop.name);
}

int getDriver(void)
{
  int driver;
  cudaDriverGetVersion(&driver);
  check_cuda_error();
  return driver;
}

void listDevices(void)
{
  // Get number of devices
  int count;
  cudaGetDeviceCount(&count);
  check_cuda_error();

  // Print device names
  if (count == 0)
  {
    std::cout << "No devices found." << std::endl;
  }
  else
  {
    std::cout << std::endl;
    std::cout << "Devices:" << std::endl;
    for (int i = 0; i < count; i++)
    {
      std::cout << i << ": " << getDeviceName(i) << std::endl;
      check_cuda_error();
    }
    std::cout << std::endl;
  }
}
531
hip-stream.cpp
@ -1,531 +0,0 @@
#include "hip_runtime.h"
/*=============================================================================
*------------------------------------------------------------------------------
* Copyright 2015: Tom Deakin, Simon McIntosh-Smith, University of Bristol HPC
* Based on John D. McCalpin’s original STREAM benchmark for CPUs
*------------------------------------------------------------------------------
* License:
*   1. You are free to use this program and/or to redistribute
*      this program.
*   2. You are free to modify this program for your own use,
*      including commercial use, subject to the publication
*      restrictions in item 3.
*   3. You are free to publish results obtained from running this
*      program, or from works that you derive from this program,
*      with the following limitations:
*      3a. In order to be referred to as "GPU-STREAM benchmark results",
*          published results must be in conformance to the GPU-STREAM
*          Run Rules published at
*          http://github.com/UoB-HPC/GPU-STREAM/wiki/Run-Rules
*          and incorporated herein by reference.
*          The copyright holders retain the
*          right to determine conformity with the Run Rules.
*      3b. Results based on modified source code or on runs not in
*          accordance with the GPU-STREAM Run Rules must be clearly
*          labelled whenever they are published. Examples of
*          proper labelling include:
*          "tuned GPU-STREAM benchmark results"
*          "based on a variant of the GPU-STREAM benchmark code"
*          Other comparable, clear and reasonable labelling is
*          acceptable.
*      3c. Submission of results to the GPU-STREAM benchmark web site
*          is encouraged, but not required.
*   4. Use of this program or creation of derived works based on this
*      program constitutes acceptance of these licensing restrictions.
*   5. Absolutely no warranty is expressed or implied.
*———————————————————————————————————-----------------------------------------*/

#include <iostream>
#include <fstream>
#include <vector>
#include <chrono>
#include <cfloat>
#include <cmath>

//#include <cuda.h>
#include "common.h"

std::string getDeviceName(int device);
int getDriver(void);

// Code to check HIP errors
void check_cuda_error(void)
{
  hipError_t err = hipGetLastError();
  if (err != hipSuccess)
  {
    std::cerr
      << "Error: "
      << hipGetErrorString(err)
      << std::endl;
    exit(err);
  }
}

// The looper functions place more work inside each work-item.
// The goal is to reduce the dispatch overhead for each group, and also to
// give more control over the order of memory operations.
template <typename T>
__global__ void
copy_looper(hipLaunchParm lp, const T * a, T * c, int ARRAY_SIZE)
{
  int offset = (hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x);
  int stride = hipBlockDim_x * hipGridDim_x;

  for (int i=offset; i<ARRAY_SIZE; i+=stride) {
    c[i] = a[i];
  }
}

template <typename T>
__global__ void
mul_looper(hipLaunchParm lp, T * b, const T * c, int ARRAY_SIZE)
{
  int offset = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;
  int stride = hipBlockDim_x * hipGridDim_x;
  const T scalar = 3.0;

  for (int i=offset; i<ARRAY_SIZE; i+=stride) {
    b[i] = scalar * c[i];
  }
}

template <typename T>
__global__ void
add_looper(hipLaunchParm lp, const T * a, const T * b, T * c, int ARRAY_SIZE)
{
  int offset = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;
  int stride = hipBlockDim_x * hipGridDim_x;

  for (int i=offset; i<ARRAY_SIZE; i+=stride) {
    c[i] = a[i] + b[i];
  }
}

template <typename T>
__global__ void
triad_looper(hipLaunchParm lp, T * a, const T * b, const T * c, int ARRAY_SIZE)
{
  int offset = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;
  int stride = hipBlockDim_x * hipGridDim_x;
  const T scalar = 3.0;

  for (int i=offset; i<ARRAY_SIZE; i+=stride) {
    a[i] = b[i] + scalar * c[i];
  }
}
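// All four *_looper kernels above use the same grid-stride idiom: each
// work-item starts at its global id and advances by the total number of
// launched work-items, so any grid size covers the whole array. A minimal
// sketch of the pattern (a hypothetical standalone kernel, not part of the
// benchmark):
//
//   template <typename T>
//   __global__ void
//   scale_looper(hipLaunchParm lp, T * x, T s, int ARRAY_SIZE)
//   {
//     int offset = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;
//     int stride = hipBlockDim_x * hipGridDim_x;
//     for (int i=offset; i<ARRAY_SIZE; i+=stride)
//       x[i] = s * x[i];
//   }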
template <typename T>
__global__ void
copy(hipLaunchParm lp, const T * a, T * c)
{
  const int i = hipBlockDim_x * hipBlockIdx_x + hipThreadIdx_x;
  c[i] = a[i];
}

template <typename T>
__global__ void
mul(hipLaunchParm lp, T * b, const T * c)
{
  const T scalar = 3.0;
  const int i = hipBlockDim_x * hipBlockIdx_x + hipThreadIdx_x;
  b[i] = scalar * c[i];
}

template <typename T>
__global__ void
add(hipLaunchParm lp, const T * a, const T * b, T * c)
{
  const int i = hipBlockDim_x * hipBlockIdx_x + hipThreadIdx_x;
  c[i] = a[i] + b[i];
}

template <typename T>
__global__ void
triad(hipLaunchParm lp, T * a, const T * b, const T * c)
{
  const T scalar = 3.0;
  const int i = hipBlockDim_x * hipBlockIdx_x + hipThreadIdx_x;
  a[i] = b[i] + scalar * c[i];
}

int main(int argc, char *argv[])
{

  // Print out run information
  std::cout
    << "GPU-STREAM" << std::endl
    << "Version: " << VERSION_STRING << std::endl
    << "Implementation: HIP" << std::endl;

  parseArguments(argc, argv);

  if (NTIMES < 2)
    throw std::runtime_error("Chosen number of times is invalid, must be >= 2");

  // Config grid size and group size for kernel launching
  int gridSize;
  if (groups) {
    gridSize = groups * groupSize;
  } else {
    gridSize = ARRAY_SIZE;
  }

  float operationsPerWorkitem = (float)ARRAY_SIZE / (float)gridSize;
  std::cout << "GridSize: " << gridSize << " work-items" << std::endl;
  std::cout << "GroupSize: " << groupSize << " work-items" << std::endl;
  std::cout << "Operations/Work-item: " << operationsPerWorkitem << std::endl;
  if (groups) std::cout << "Using looper kernels" << std::endl;

  std::cout << "Precision: ";
  if (useFloat) std::cout << "float";
  else std::cout << "double";
  std::cout << std::endl << std::endl;

  std::cout << "Running kernels " << NTIMES << " times" << std::endl;

  if (ARRAY_SIZE % 1024 != 0)
  {
    unsigned int OLD_ARRAY_SIZE = ARRAY_SIZE;
    ARRAY_SIZE -= ARRAY_SIZE % 1024;
    std::cout
      << "Warning: array size must be a multiple of 1024" << std::endl
      << "Resizing array from " << OLD_ARRAY_SIZE
      << " to " << ARRAY_SIZE << std::endl;
    if (ARRAY_SIZE == 0)
      throw std::runtime_error("Array size must be >= 1024");
  }

  // Get precision (used to reset later)
  std::streamsize ss = std::cout.precision();

  size_t DATATYPE_SIZE;

  if (useFloat)
  {
    DATATYPE_SIZE = sizeof(float);
  }
  else
  {
    DATATYPE_SIZE = sizeof(double);
  }

  // Display number of bytes in array
  std::cout << std::setprecision(1) << std::fixed
    << "Array size: " << ARRAY_SIZE*DATATYPE_SIZE/1024.0/1024.0 << " MB"
    << " (=" << ARRAY_SIZE*DATATYPE_SIZE/1024.0/1024.0/1024.0 << " GB)"
    << " " << ARRAY_PAD_BYTES << " bytes padding"
    << std::endl;
  std::cout << "Total size: " << 3.0*(ARRAY_SIZE*DATATYPE_SIZE + ARRAY_PAD_BYTES) /1024.0/1024.0 << " MB"
    << " (=" << 3.0*(ARRAY_SIZE*DATATYPE_SIZE + ARRAY_PAD_BYTES) /1024.0/1024.0/1024.0 << " GB)"
    << std::endl;

  // Reset precision
  std::cout.precision(ss);

  // Check device index is in range
  int count;
  hipGetDeviceCount(&count);
  check_cuda_error();
  if (deviceIndex >= count)
    throw std::runtime_error("Chosen device index is invalid");
  hipSetDevice(deviceIndex);
  check_cuda_error();

  hipDeviceProp_t props;
  hipGetDeviceProperties(&props, deviceIndex);

  // Print out device name
  std::cout << "Using HIP device " << getDeviceName(deviceIndex) << " (compute_units=" << props.multiProcessorCount << ")" << std::endl;

  // Print out device HIP driver version
  std::cout << "Driver: " << getDriver() << std::endl;

  // Check buffers fit on the device
  if (props.totalGlobalMem < 3*DATATYPE_SIZE*ARRAY_SIZE)
    throw std::runtime_error("Device does not have enough memory for all 3 buffers");

  //int cus = props.multiProcessorCount;

  // Create host vectors
  void * h_a = malloc(ARRAY_SIZE*DATATYPE_SIZE);
  void * h_b = malloc(ARRAY_SIZE*DATATYPE_SIZE);
  void * h_c = malloc(ARRAY_SIZE*DATATYPE_SIZE);

  // Initialise arrays
  for (unsigned int i = 0; i < ARRAY_SIZE; i++)
  {
    if (useFloat)
    {
      ((float*)h_a)[i] = 1.0f;
      ((float*)h_b)[i] = 2.0f;
      ((float*)h_c)[i] = 0.0f;
    }
    else
    {
      ((double*)h_a)[i] = 1.0;
      ((double*)h_b)[i] = 2.0;
      ((double*)h_c)[i] = 0.0;
    }
  }

  // Create device buffers
  char * d_a, * d_b, *d_c;
  hipMalloc(&d_a, ARRAY_SIZE*DATATYPE_SIZE + ARRAY_PAD_BYTES);
  check_cuda_error();
  hipMalloc(&d_b, ARRAY_SIZE*DATATYPE_SIZE + ARRAY_PAD_BYTES);
  d_b += ARRAY_PAD_BYTES;
  check_cuda_error();
  hipMalloc(&d_c, ARRAY_SIZE*DATATYPE_SIZE + ARRAY_PAD_BYTES);
  d_c += ARRAY_PAD_BYTES;
  check_cuda_error();
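  // Note: only d_b and d_c are advanced by ARRAY_PAD_BYTES after allocation,
  // so b and c are staggered relative to a; presumably this is to vary the
  // relative alignment of the three buffers (an assumption, not stated in
  // the source).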
  // Copy host memory to device
  hipMemcpy(d_a, h_a, ARRAY_SIZE*DATATYPE_SIZE, hipMemcpyHostToDevice);
  check_cuda_error();
  hipMemcpy(d_b, h_b, ARRAY_SIZE*DATATYPE_SIZE, hipMemcpyHostToDevice);
  check_cuda_error();
  hipMemcpy(d_c, h_c, ARRAY_SIZE*DATATYPE_SIZE, hipMemcpyHostToDevice);
  check_cuda_error();

  std::cout << "d_a=" << (void*)d_a << std::endl;
  std::cout << "d_b=" << (void*)d_b << std::endl;
  std::cout << "d_c=" << (void*)d_c << std::endl;

  // Make sure the copies are finished
  hipDeviceSynchronize();
  check_cuda_error();

  // List of times
  std::vector< std::vector<double> > timings;

  // Declare timers
  std::chrono::high_resolution_clock::time_point t1, t2;

  // Main loop
  for (unsigned int k = 0; k < NTIMES; k++)
  {
    std::vector<double> times;
    t1 = std::chrono::high_resolution_clock::now();
    if (groups) {
      if (useFloat)
        hipLaunchKernel(HIP_KERNEL_NAME(copy_looper<float>), dim3(gridSize), dim3(groupSize), 0, 0, (float*)d_a, (float*)d_c, ARRAY_SIZE);
      else
        hipLaunchKernel(HIP_KERNEL_NAME(copy_looper<double>), dim3(gridSize), dim3(groupSize), 0, 0, (double*)d_a, (double*)d_c, ARRAY_SIZE);
    } else {
      if (useFloat)
        hipLaunchKernel(HIP_KERNEL_NAME(copy), dim3(ARRAY_SIZE/groupSize), dim3(groupSize), 0, 0, (float*)d_a, (float*)d_c);
      else
        hipLaunchKernel(HIP_KERNEL_NAME(copy), dim3(ARRAY_SIZE/groupSize), dim3(groupSize), 0, 0, (double*)d_a, (double*)d_c);
    }
    check_cuda_error();
    hipDeviceSynchronize();
    check_cuda_error();
    t2 = std::chrono::high_resolution_clock::now();
    times.push_back(std::chrono::duration_cast<std::chrono::duration<double> >(t2 - t1).count());

    t1 = std::chrono::high_resolution_clock::now();
    if (groups) {
      if (useFloat)
        hipLaunchKernel(HIP_KERNEL_NAME(mul_looper), dim3(gridSize), dim3(groupSize), 0, 0, (float*)d_b, (float*)d_c, ARRAY_SIZE);
      else
        hipLaunchKernel(HIP_KERNEL_NAME(mul_looper), dim3(gridSize), dim3(groupSize), 0, 0, (double*)d_b, (double*)d_c, ARRAY_SIZE);
    } else {
      if (useFloat)
        hipLaunchKernel(HIP_KERNEL_NAME(mul), dim3(ARRAY_SIZE/groupSize), dim3(groupSize), 0, 0, (float*)d_b, (float*)d_c);
      else
        hipLaunchKernel(HIP_KERNEL_NAME(mul), dim3(ARRAY_SIZE/groupSize), dim3(groupSize), 0, 0, (double*)d_b, (double*)d_c);
    }
    check_cuda_error();
    hipDeviceSynchronize();
    check_cuda_error();
    t2 = std::chrono::high_resolution_clock::now();
    times.push_back(std::chrono::duration_cast<std::chrono::duration<double> >(t2 - t1).count());

    t1 = std::chrono::high_resolution_clock::now();
    if (groups) {
      if (useFloat)
        hipLaunchKernel(HIP_KERNEL_NAME(add_looper), dim3(gridSize), dim3(groupSize), 0, 0, (float*)d_a, (float*)d_b, (float*)d_c, ARRAY_SIZE);
      else
        hipLaunchKernel(HIP_KERNEL_NAME(add_looper), dim3(gridSize), dim3(groupSize), 0, 0, (double*)d_a, (double*)d_b, (double*)d_c, ARRAY_SIZE);
    } else {
      if (useFloat)
        hipLaunchKernel(HIP_KERNEL_NAME(add), dim3(ARRAY_SIZE/groupSize), dim3(groupSize), 0, 0, (float*)d_a, (float*)d_b, (float*)d_c);
      else
        hipLaunchKernel(HIP_KERNEL_NAME(add), dim3(ARRAY_SIZE/groupSize), dim3(groupSize), 0, 0, (double*)d_a, (double*)d_b, (double*)d_c);
    }
    check_cuda_error();
    hipDeviceSynchronize();
    check_cuda_error();
    t2 = std::chrono::high_resolution_clock::now();
    times.push_back(std::chrono::duration_cast<std::chrono::duration<double> >(t2 - t1).count());

    t1 = std::chrono::high_resolution_clock::now();
    if (groups) {
      if (useFloat)
        hipLaunchKernel(HIP_KERNEL_NAME(triad_looper), dim3(gridSize), dim3(groupSize), 0, 0, (float*)d_a, (float*)d_b, (float*)d_c, ARRAY_SIZE);
      else
        hipLaunchKernel(HIP_KERNEL_NAME(triad_looper), dim3(gridSize), dim3(groupSize), 0, 0, (double*)d_a, (double*)d_b, (double*)d_c, ARRAY_SIZE);
    } else {
      if (useFloat)
        hipLaunchKernel(HIP_KERNEL_NAME(triad), dim3(ARRAY_SIZE/groupSize), dim3(groupSize), 0, 0, (float*)d_a, (float*)d_b, (float*)d_c);
      else
        hipLaunchKernel(HIP_KERNEL_NAME(triad), dim3(ARRAY_SIZE/groupSize), dim3(groupSize), 0, 0, (double*)d_a, (double*)d_b, (double*)d_c);
    }

    check_cuda_error();
    hipDeviceSynchronize();
    check_cuda_error();
    t2 = std::chrono::high_resolution_clock::now();
    times.push_back(std::chrono::duration_cast<std::chrono::duration<double> >(t2 - t1).count());

    timings.push_back(times);

  }

  // Check solutions
  hipMemcpy(h_a, d_a, ARRAY_SIZE*DATATYPE_SIZE, hipMemcpyDeviceToHost);
  check_cuda_error();
  hipMemcpy(h_b, d_b, ARRAY_SIZE*DATATYPE_SIZE, hipMemcpyDeviceToHost);
  check_cuda_error();
  hipMemcpy(h_c, d_c, ARRAY_SIZE*DATATYPE_SIZE, hipMemcpyDeviceToHost);
  check_cuda_error();

  if (useFloat)
  {
    check_solution<float>(h_a, h_b, h_c);
  }
  else
  {
    check_solution<double>(h_a, h_b, h_c);
  }

  // Crunch results
  size_t sizes[4] = {
    2 * DATATYPE_SIZE * ARRAY_SIZE,
    2 * DATATYPE_SIZE * ARRAY_SIZE,
    3 * DATATYPE_SIZE * ARRAY_SIZE,
    3 * DATATYPE_SIZE * ARRAY_SIZE
  };
  double min[4] = {DBL_MAX, DBL_MAX, DBL_MAX, DBL_MAX};
  double max[4] = {0.0, 0.0, 0.0, 0.0};
  double avg[4] = {0.0, 0.0, 0.0, 0.0};

  // Ignore first result
  for (unsigned int i = 1; i < NTIMES; i++)
  {
    for (int j = 0; j < 4; j++)
    {
      avg[j] += timings[i][j];
      min[j] = std::min(min[j], timings[i][j]);
      max[j] = std::max(max[j], timings[i][j]);
    }
  }

  for (int j = 0; j < 4; j++) {
    avg[j] /= (double)(NTIMES-1);
  }

  double geomean = 1.0;
  for (int j = 0; j < 4; j++) {
    geomean *= (sizes[j]/min[j]);
  }
  geomean = pow(geomean, 0.25);

  // Display results
  std::string labels[] = {"Copy", "Mul", "Add", "Triad"};
  std::cout
    << std::left << std::setw(12) << "Function"
    << std::left << std::setw(12) << "MBytes/sec"
    << std::left << std::setw(12) << "Min (sec)"
    << std::left << std::setw(12) << "Max"
    << std::left << std::setw(12) << "Average"
    << std::endl;

  for (int j = 0; j < 4; j++)
  {
    std::cout
      << std::left << std::setw(12) << labels[j]
      << std::left << std::setw(12) << std::setprecision(3) << 1.0E-06 * sizes[j]/min[j]
      << std::left << std::setw(12) << std::setprecision(5) << min[j]
      << std::left << std::setw(12) << std::setprecision(5) << max[j]
      << std::left << std::setw(12) << std::setprecision(5) << avg[j]
      << std::endl;
  }
  std::cout
    << std::left << std::setw(12) << "GEOMEAN"
    << std::left << std::setw(12) << std::setprecision(3) << 1.0E-06 * geomean
    << std::endl;

  // Free host vectors
  free(h_a);
  free(h_b);
  free(h_c);

  // Free HIP buffers
  hipFree(d_a);
  check_cuda_error();
  hipFree(d_b);
  check_cuda_error();
  hipFree(d_c);
  check_cuda_error();

}

std::string getDeviceName(int device)
{
  struct hipDeviceProp_t prop;
  hipGetDeviceProperties(&prop, device);
  check_cuda_error();
  return std::string(prop.name);
}

int getDriver(void)
{
  int driver;
  hipDriverGetVersion(&driver);
  check_cuda_error();
  return driver;
}

void listDevices(void)
{
  // Get number of devices
  int count;
  hipGetDeviceCount(&count);
  check_cuda_error();

  // Print device names
  if (count == 0)
  {
    std::cout << "No devices found." << std::endl;
  }
  else
  {
    std::cout << std::endl;
    std::cout << "Devices:" << std::endl;
    for (int i = 0; i < count; i++)
    {
      std::cout << i << ": " << getDeviceName(i) << std::endl;
      check_cuda_error();
    }
    std::cout << std::endl;
  }
}
332
main.cpp
Normal file
@ -0,0 +1,332 @@
// Copyright (c) 2015-16 Tom Deakin, Simon McIntosh-Smith,
// University of Bristol HPC
//
// For full license terms please see the LICENSE file distributed with this
// source code

#include <iostream>
#include <vector>
#include <numeric>
#include <cmath>
#include <limits>
#include <chrono>
#include <algorithm>
#include <iomanip>
#include <cstring>

#include "common.h"
#include "Stream.h"

#if defined(CUDA)
#include "CUDAStream.h"
#elif defined(OCL)
#include "OCLStream.h"
#elif defined(USE_RAJA)
#include "RAJAStream.hpp"
#elif defined(KOKKOS)
#include "KOKKOSStream.hpp"
#elif defined(ACC)
#include "ACCStream.h"
#elif defined(SYCL)
#include "SYCLStream.h"
#elif defined(OMP3)
#include "OMP3Stream.h"
#elif defined(OMP45)
#include "OMP45Stream.h"
#endif

// Default size of 2^25
unsigned int ARRAY_SIZE = 33554432;
unsigned int num_times = 100;
unsigned int deviceIndex = 0;
bool use_float = false;

template <typename T>
void check_solution(const unsigned int ntimes, std::vector<T>& a, std::vector<T>& b, std::vector<T>& c);

template <typename T>
void run();

void parseArguments(int argc, char *argv[]);

int main(int argc, char *argv[])
{
  std::cout
    << "GPU-STREAM" << std::endl
    << "Version: " << VERSION_STRING << std::endl
    << "Implementation: " << IMPLEMENTATION_STRING << std::endl;

  parseArguments(argc, argv);

  // TODO: Fix SYCL to allow multiple template specializations
#ifndef SYCL
#ifndef KOKKOS
  if (use_float)
    run<float>();
  else
#endif
#endif
    run<double>();

}

template <typename T>
void run()
{
  std::cout << "Running kernels " << num_times << " times" << std::endl;

  if (sizeof(T) == sizeof(float))
    std::cout << "Precision: float" << std::endl;
  else
    std::cout << "Precision: double" << std::endl;

  // Create host vectors
  std::vector<T> a(ARRAY_SIZE, 0.1);
  std::vector<T> b(ARRAY_SIZE, 0.2);
  std::vector<T> c(ARRAY_SIZE, 0.0);
  std::streamsize ss = std::cout.precision();
  std::cout << std::setprecision(1) << std::fixed
    << "Array size: " << ARRAY_SIZE*sizeof(T)*1.0E-6 << " MB"
    << " (=" << ARRAY_SIZE*sizeof(T)*1.0E-9 << " GB)" << std::endl;
  std::cout << "Total size: " << 3.0*ARRAY_SIZE*sizeof(T)*1.0E-6 << " MB"
    << " (=" << 3.0*ARRAY_SIZE*sizeof(T)*1.0E-9 << " GB)" << std::endl;
  std::cout.precision(ss);

  Stream<T> *stream;

#if defined(CUDA)
  // Use the CUDA implementation
  stream = new CUDAStream<T>(ARRAY_SIZE, deviceIndex);

#elif defined(OCL)
  // Use the OpenCL implementation
  stream = new OCLStream<T>(ARRAY_SIZE, deviceIndex);

#elif defined(USE_RAJA)
  // Use the RAJA implementation
  stream = new RAJAStream<T>(ARRAY_SIZE, deviceIndex);

#elif defined(KOKKOS)
  // Use the Kokkos implementation
  stream = new KOKKOSStream<T>(ARRAY_SIZE, deviceIndex);

#elif defined(ACC)
  // Use the OpenACC implementation
  stream = new ACCStream<T>(ARRAY_SIZE, a.data(), b.data(), c.data(), deviceIndex);

#elif defined(SYCL)
  // Use the SYCL implementation
  stream = new SYCLStream<T>(ARRAY_SIZE, deviceIndex);

#elif defined(OMP3)
  // Use the "reference" OpenMP 3 implementation
  stream = new OMP3Stream<T>(ARRAY_SIZE, a.data(), b.data(), c.data());

#elif defined(OMP45)
  // Use the OpenMP 4.5 implementation
  stream = new OMP45Stream<T>(ARRAY_SIZE, a.data(), b.data(), c.data(), deviceIndex);

#endif

  stream->write_arrays(a, b, c);

  // List of times
  std::vector<std::vector<double>> timings(4);

  // Declare timers
  std::chrono::high_resolution_clock::time_point t1, t2;

  // Main loop
  for (unsigned int k = 0; k < num_times; k++)
  {
    // Execute Copy
    t1 = std::chrono::high_resolution_clock::now();
    stream->copy();
    t2 = std::chrono::high_resolution_clock::now();
    timings[0].push_back(std::chrono::duration_cast<std::chrono::duration<double> >(t2 - t1).count());

    // Execute Mul
    t1 = std::chrono::high_resolution_clock::now();
    stream->mul();
    t2 = std::chrono::high_resolution_clock::now();
    timings[1].push_back(std::chrono::duration_cast<std::chrono::duration<double> >(t2 - t1).count());

    // Execute Add
    t1 = std::chrono::high_resolution_clock::now();
    stream->add();
    t2 = std::chrono::high_resolution_clock::now();
    timings[2].push_back(std::chrono::duration_cast<std::chrono::duration<double> >(t2 - t1).count());

    // Execute Triad
    t1 = std::chrono::high_resolution_clock::now();
    stream->triad();
    t2 = std::chrono::high_resolution_clock::now();
    timings[3].push_back(std::chrono::duration_cast<std::chrono::duration<double> >(t2 - t1).count());

  }

  // Check solutions
  stream->read_arrays(a, b, c);
  check_solution<T>(num_times, a, b, c);

  // Display timing results
  std::cout
    << std::left << std::setw(12) << "Function"
    << std::left << std::setw(12) << "MBytes/sec"
    << std::left << std::setw(12) << "Min (sec)"
    << std::left << std::setw(12) << "Max"
    << std::left << std::setw(12) << "Average" << std::endl;

  std::cout << std::fixed;

  std::string labels[4] = {"Copy", "Mul", "Add", "Triad"};
  size_t sizes[4] = {
    2 * sizeof(T) * ARRAY_SIZE,
    2 * sizeof(T) * ARRAY_SIZE,
    3 * sizeof(T) * ARRAY_SIZE,
    3 * sizeof(T) * ARRAY_SIZE
  };

  for (int i = 0; i < 4; i++)
  {
    // Get min/max; ignore the first result
    auto minmax = std::minmax_element(timings[i].begin()+1, timings[i].end());

    // Calculate average; ignore the first result
    double average = std::accumulate(timings[i].begin()+1, timings[i].end(), 0.0) / (double)(num_times - 1);

    // Display results
    std::cout
      << std::left << std::setw(12) << labels[i]
      << std::left << std::setw(12) << std::setprecision(3) << 1.0E-6 * sizes[i] / (*minmax.first)
      << std::left << std::setw(12) << std::setprecision(5) << *minmax.first
      << std::left << std::setw(12) << std::setprecision(5) << *minmax.second
      << std::left << std::setw(12) << std::setprecision(5) << average
      << std::endl;

  }

  delete stream;

}

template <typename T>
void check_solution(const unsigned int ntimes, std::vector<T>& a, std::vector<T>& b, std::vector<T>& c)
{
  // Generate correct solution
  T goldA = 0.1;
  T goldB = 0.2;
  T goldC = 0.0;

  const T scalar = 0.3;

  for (unsigned int i = 0; i < ntimes; i++)
  {
    // Do STREAM!
    goldC = goldA;
    goldB = scalar * goldC;
    goldC = goldA + goldB;
    goldA = goldB + scalar * goldC;
  }
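  // The gold values are produced by applying the same four STREAM operations
  // to scalars for the same number of iterations, so on success every array
  // element should match its corresponding gold value.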
  // Calculate the average error
  double errA = std::accumulate(a.begin(), a.end(), 0.0, [&](double sum, const T val){ return sum + fabs(val - goldA); });
  errA /= a.size();
  double errB = std::accumulate(b.begin(), b.end(), 0.0, [&](double sum, const T val){ return sum + fabs(val - goldB); });
  errB /= b.size();
  double errC = std::accumulate(c.begin(), c.end(), 0.0, [&](double sum, const T val){ return sum + fabs(val - goldC); });
  errC /= c.size();

  double epsi = std::numeric_limits<T>::epsilon() * 100.0;

  if (errA > epsi)
    std::cerr
      << "Validation failed on a[]. Average error " << errA
      << std::endl;
  if (errB > epsi)
    std::cerr
      << "Validation failed on b[]. Average error " << errB
      << std::endl;
  if (errC > epsi)
    std::cerr
      << "Validation failed on c[]. Average error " << errC
      << std::endl;

}
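// Returns non-zero only when the whole string parses as a base-10 unsigned
// integer (strtoul must consume every character).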
int parseUInt(const char *str, unsigned int *output)
{
  char *next;
  *output = strtoul(str, &next, 10);
  return !strlen(next);
}

void parseArguments(int argc, char *argv[])
{
  for (int i = 1; i < argc; i++)
  {
    if (!std::string("--list").compare(argv[i]))
    {
      listDevices();
      exit(EXIT_SUCCESS);
    }
    else if (!std::string("--device").compare(argv[i]))
    {
      if (++i >= argc || !parseUInt(argv[i], &deviceIndex))
      {
        std::cerr << "Invalid device index." << std::endl;
        exit(EXIT_FAILURE);
      }
    }
    else if (!std::string("--arraysize").compare(argv[i]) ||
             !std::string("-s").compare(argv[i]))
    {
      if (++i >= argc || !parseUInt(argv[i], &ARRAY_SIZE))
      {
        std::cerr << "Invalid array size." << std::endl;
        exit(EXIT_FAILURE);
      }
    }
    else if (!std::string("--numtimes").compare(argv[i]) ||
             !std::string("-n").compare(argv[i]))
    {
      if (++i >= argc || !parseUInt(argv[i], &num_times))
      {
        std::cerr << "Invalid number of times." << std::endl;
        exit(EXIT_FAILURE);
      }
      if (num_times < 2)
      {
        std::cerr << "Number of times must be 2 or more" << std::endl;
        exit(EXIT_FAILURE);
      }
    }
    else if (!std::string("--float").compare(argv[i]))
    {
      use_float = true;
    }
    else if (!std::string("--help").compare(argv[i]) ||
             !std::string("-h").compare(argv[i]))
    {
      std::cout << std::endl;
      std::cout << "Usage: " << argv[0] << " [OPTIONS]" << std::endl << std::endl;
      std::cout << "Options:" << std::endl;
      std::cout << "  -h  --help               Print this message" << std::endl;
      std::cout << "      --list               List available devices" << std::endl;
      std::cout << "      --device     INDEX   Select device at INDEX" << std::endl;
      std::cout << "  -s  --arraysize  SIZE    Use SIZE elements in the array" << std::endl;
      std::cout << "  -n  --numtimes   NUM     Run the test NUM times (NUM >= 2)" << std::endl;
      std::cout << "      --float              Use floats (rather than doubles)" << std::endl;
      std::cout << std::endl;
      exit(EXIT_SUCCESS);
    }
    else
    {
      std::cerr << "Unrecognized argument '" << argv[i] << "' (try '--help')"
                << std::endl;
      exit(EXIT_FAILURE);
    }
  }
}
@ -1,70 +0,0 @@
/*=============================================================================
*------------------------------------------------------------------------------
* Copyright 2015: Tom Deakin, Simon McIntosh-Smith, University of Bristol HPC
* Based on John D. McCalpin’s original STREAM benchmark for CPUs
*------------------------------------------------------------------------------
* License:
*   1. You are free to use this program and/or to redistribute
*      this program.
*   2. You are free to modify this program for your own use,
*      including commercial use, subject to the publication
*      restrictions in item 3.
*   3. You are free to publish results obtained from running this
*      program, or from works that you derive from this program,
*      with the following limitations:
*      3a. In order to be referred to as "GPU-STREAM benchmark results",
*          published results must be in conformance to the GPU-STREAM
*          Run Rules published at
*          http://github.com/UoB-HPC/GPU-STREAM/wiki/Run-Rules
*          and incorporated herein by reference.
*          The copyright holders retain the
*          right to determine conformity with the Run Rules.
*      3b. Results based on modified source code or on runs not in
*          accordance with the GPU-STREAM Run Rules must be clearly
*          labelled whenever they are published. Examples of
*          proper labelling include:
*          "tuned GPU-STREAM benchmark results"
*          "based on a variant of the GPU-STREAM benchmark code"
*          Other comparable, clear and reasonable labelling is
*          acceptable.
*      3c. Submission of results to the GPU-STREAM benchmark web site
*          is encouraged, but not required.
*   4. Use of this program or creation of derived works based on this
*      program constitutes acceptance of these licensing restrictions.
*   5. Absolutely no warranty is expressed or implied.
*———————————————————————————————————-----------------------------------------*/

#ifdef FLOAT
#define DATATYPE float
constant DATATYPE scalar = 3.0f;
#else
#pragma OPENCL EXTENSION cl_khr_fp64 : enable
#define DATATYPE double
constant DATATYPE scalar = 3.0;
#endif
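// The host side (see ocl-stream.cpp) builds this source with "-DFLOAT" when
// --float is requested; otherwise the double-precision branch above is used,
// which also enables the cl_khr_fp64 extension.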

kernel void copy(global const DATATYPE * restrict a, global DATATYPE * restrict c)
{
  const size_t i = get_global_id(0);
  c[i] = a[i];
}

kernel void mul(global DATATYPE * restrict b, global const DATATYPE * restrict c)
{
  const size_t i = get_global_id(0);
  b[i] = scalar * c[i];
}

kernel void add(global const DATATYPE * restrict a, global const DATATYPE * restrict b, global DATATYPE * restrict c)
{
  const size_t i = get_global_id(0);
  c[i] = a[i] + b[i];
}

kernel void triad(global DATATYPE * restrict a, global const DATATYPE * restrict b, global const DATATYPE * restrict c)
{
  const size_t i = get_global_id(0);
  a[i] = b[i] + scalar * c[i];
}
488
ocl-stream.cpp
@ -1,488 +0,0 @@
/*=============================================================================
*------------------------------------------------------------------------------
* Copyright 2015: Tom Deakin, Simon McIntosh-Smith, University of Bristol HPC
* Based on John D. McCalpin’s original STREAM benchmark for CPUs
*------------------------------------------------------------------------------
* License:
*   1. You are free to use this program and/or to redistribute
*      this program.
*   2. You are free to modify this program for your own use,
*      including commercial use, subject to the publication
*      restrictions in item 3.
*   3. You are free to publish results obtained from running this
*      program, or from works that you derive from this program,
*      with the following limitations:
*      3a. In order to be referred to as "GPU-STREAM benchmark results",
*          published results must be in conformance to the GPU-STREAM
*          Run Rules published at
*          http://github.com/UoB-HPC/GPU-STREAM/wiki/Run-Rules
*          and incorporated herein by reference.
*          The copyright holders retain the
*          right to determine conformity with the Run Rules.
*      3b. Results based on modified source code or on runs not in
*          accordance with the GPU-STREAM Run Rules must be clearly
*          labelled whenever they are published. Examples of
*          proper labelling include:
*          "tuned GPU-STREAM benchmark results"
*          "based on a variant of the GPU-STREAM benchmark code"
*          Other comparable, clear and reasonable labelling is
*          acceptable.
*      3c. Submission of results to the GPU-STREAM benchmark web site
*          is encouraged, but not required.
*   4. Use of this program or creation of derived works based on this
*      program constitutes acceptance of these licensing restrictions.
*   5. Absolutely no warranty is expressed or implied.
*———————————————————————————————————-----------------------------------------*/

#include <iostream>
#include <fstream>
#include <vector>
#include <chrono>
#include <cfloat>
#include <cmath>

#define CL_HPP_ENABLE_EXCEPTIONS
#define CL_HPP_MINIMUM_OPENCL_VERSION 110
#define CL_HPP_TARGET_OPENCL_VERSION 110
#include "CL/cl2.hpp"
#include "common.h"

std::string getDeviceName(const cl::Device& device);
std::string getDeviceDriver(const cl::Device& device);
unsigned getDeviceList(std::vector<cl::Device>& devices);

// Print error and exit
void die(std::string msg, cl::Error& e)
{
  std::cerr
    << "Error: "
    << msg
    << ": " << e.what()
    << "(" << e.err() << ")"
    << std::endl;
  exit(e.err());
}

int main(int argc, char *argv[])
{
  // Print out run information
  std::cout
    << "GPU-STREAM" << std::endl
    << "Version: " << VERSION_STRING << std::endl
    << "Implementation: OpenCL" << std::endl;

  std::string status;

  try
  {
    parseArguments(argc, argv);
    if (NTIMES < 2)
      throw std::runtime_error("Chosen number of times is invalid, must be >= 2");

    std::cout << "Precision: ";
    if (useFloat) std::cout << "float";
    else std::cout << "double";
    std::cout << std::endl << std::endl;

    std::cout << "Running kernels " << NTIMES << " times" << std::endl;

    if (ARRAY_SIZE % 1024 != 0)
    {
      unsigned int OLD_ARRAY_SIZE = ARRAY_SIZE;
      ARRAY_SIZE -= ARRAY_SIZE % 1024;
      std::cout
        << "Warning: array size must be a multiple of 1024" << std::endl
        << "Resizing array from " << OLD_ARRAY_SIZE
        << " to " << ARRAY_SIZE << std::endl;
      if (ARRAY_SIZE == 0)
        throw std::runtime_error("Array size must be >= 1024");
    }

    // Get precision (used to reset later)
    std::streamsize ss = std::cout.precision();

    size_t DATATYPE_SIZE;

    if (useFloat)
    {
      DATATYPE_SIZE = sizeof(float);
    }
    else
    {
      DATATYPE_SIZE = sizeof(double);
    }

    // Display number of bytes in array
    std::cout << std::setprecision(1) << std::fixed
      << "Array size: " << ARRAY_SIZE*DATATYPE_SIZE/1024.0/1024.0 << " MB"
      << " (=" << ARRAY_SIZE*DATATYPE_SIZE/1024.0/1024.0/1024.0 << " GB)"
      << std::endl;
    std::cout << "Total size: " << 3.0*ARRAY_SIZE*DATATYPE_SIZE/1024.0/1024.0 << " MB"
      << " (=" << 3.0*ARRAY_SIZE*DATATYPE_SIZE/1024.0/1024.0/1024.0 << " GB)"
      << std::endl;

    // Reset precision
    std::cout.precision(ss);

    // Open the Kernel source
    std::string kernels;
    {
      std::ifstream in("ocl-stream-kernels.cl");
      if (!in.is_open())
        throw std::runtime_error("Cannot open kernel file");
      kernels = std::string (std::istreambuf_iterator<char>(in), (std::istreambuf_iterator<char>()));
    }

    // Setup OpenCL

    // Get list of devices
    std::vector<cl::Device> devices;
    getDeviceList(devices);

    // Check device index is in range
    if (deviceIndex >= devices.size())
      throw std::runtime_error("Chosen device index is invalid");

    cl::Device device = devices[deviceIndex];

    status = "Creating context";
    cl::Context context(device);

    status = "Creating queue";
    cl::CommandQueue queue(context);

    status = "Creating program";
    cl::Program program(context, kernels);

    // Print out device name
    std::string name = getDeviceName(device);
    std::cout << "Using OpenCL device " << name << std::endl;

    // Print out OpenCL driver version for this device
    std::string driver = getDeviceDriver(device);
    std::cout << "Driver: " << driver << std::endl;

    // Check device can do double precision if requested
    if (!useFloat && !device.getInfo<CL_DEVICE_DOUBLE_FP_CONFIG>())
      throw std::runtime_error("Device does not support double precision, please use --float");

    // Check buffers fit on the device
    status = "Getting device memory sizes";
    cl_ulong totalmem = device.getInfo<CL_DEVICE_GLOBAL_MEM_SIZE>();
    cl_ulong maxbuffer = device.getInfo<CL_DEVICE_MAX_MEM_ALLOC_SIZE>();
    if (maxbuffer < DATATYPE_SIZE*ARRAY_SIZE)
      throw std::runtime_error("Device cannot allocate a buffer big enough");
    if (totalmem < 3*DATATYPE_SIZE*ARRAY_SIZE)
      throw std::runtime_error("Device does not have enough memory for all 3 buffers");

    try
    {
      std::string options = "";
      if (useFloat)
        options = "-DFLOAT";
      program.build(options.c_str());
    }
    catch (cl::Error& e)
    {
      std::vector<cl::Device> devices = context.getInfo<CL_CONTEXT_DEVICES>();
      std::string buildlog = program.getBuildInfo<CL_PROGRAM_BUILD_LOG>(devices[0]);
      std::cerr
        << "Build error:"
        << buildlog
        << std::endl;
      throw e;
    }

    status = "Making kernel copy";
    auto copy = cl::KernelFunctor<cl::Buffer&, cl::Buffer&>(program, "copy");
    status = "Making kernel mul";
    auto mul = cl::KernelFunctor<cl::Buffer&, cl::Buffer&>(program, "mul");
    status = "Making kernel add";
    auto add = cl::KernelFunctor<cl::Buffer&, cl::Buffer&, cl::Buffer&>(program, "add");
    status = "Making kernel triad";
    auto triad = cl::KernelFunctor<cl::Buffer&, cl::Buffer&, cl::Buffer&>(program, "triad");
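    // Each cl::KernelFunctor wraps a program kernel so it can be invoked like
    // a function: the cl::EnqueueArgs argument supplies the queue and NDRange,
    // and the remaining arguments are bound as kernel arguments before the
    // kernel is enqueued (see the calls in the main loop below).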
    // Create host vectors
    void *h_a = malloc(ARRAY_SIZE * DATATYPE_SIZE);
    void *h_b = malloc(ARRAY_SIZE * DATATYPE_SIZE);
    void *h_c = malloc(ARRAY_SIZE * DATATYPE_SIZE);

    // Initialise arrays
    for (unsigned int i = 0; i < ARRAY_SIZE; i++)
    {
      if (useFloat)
      {
        ((float*)h_a)[i] = 1.0f;
        ((float*)h_b)[i] = 2.0f;
        ((float*)h_c)[i] = 0.0f;
      }
      else
      {
        ((double*)h_a)[i] = 1.0;
        ((double*)h_b)[i] = 2.0;
        ((double*)h_c)[i] = 0.0;
      }
    }

    // Create device buffers
    status = "Creating buffers";
    cl::Buffer d_a(context, CL_MEM_READ_WRITE, DATATYPE_SIZE * ARRAY_SIZE);
    cl::Buffer d_b(context, CL_MEM_READ_WRITE, DATATYPE_SIZE * ARRAY_SIZE);
    cl::Buffer d_c(context, CL_MEM_READ_WRITE, DATATYPE_SIZE * ARRAY_SIZE);

    // Copy host memory to device
    status = "Copying buffers";
    queue.enqueueWriteBuffer(d_a, CL_FALSE, 0, ARRAY_SIZE*DATATYPE_SIZE, h_a);
    queue.enqueueWriteBuffer(d_b, CL_FALSE, 0, ARRAY_SIZE*DATATYPE_SIZE, h_b);
    queue.enqueueWriteBuffer(d_c, CL_FALSE, 0, ARRAY_SIZE*DATATYPE_SIZE, h_c);

    // Make sure the copies are finished
    queue.finish();

    // List of times
    std::vector< std::vector<double> > timings;

    // Declare timers
    std::chrono::high_resolution_clock::time_point t1, t2;

    // Main loop
    for (unsigned int k = 0; k < NTIMES; k++)
    {
      status = "Executing copy";
      std::vector<double> times;
      t1 = std::chrono::high_resolution_clock::now();
      copy(
        cl::EnqueueArgs(
          queue,
          cl::NDRange(ARRAY_SIZE)),
        d_a, d_c);
      queue.finish();
      t2 = std::chrono::high_resolution_clock::now();
      times.push_back(std::chrono::duration_cast<std::chrono::duration<double> >(t2 - t1).count());

      status = "Executing mul";
      t1 = std::chrono::high_resolution_clock::now();
      mul(
        cl::EnqueueArgs(
          queue,
          cl::NDRange(ARRAY_SIZE)),
        d_b, d_c);
      queue.finish();
      t2 = std::chrono::high_resolution_clock::now();
      times.push_back(std::chrono::duration_cast<std::chrono::duration<double> >(t2 - t1).count());

      status = "Executing add";
      t1 = std::chrono::high_resolution_clock::now();
      add(
        cl::EnqueueArgs(
          queue,
          cl::NDRange(ARRAY_SIZE)),
        d_a, d_b, d_c);
      queue.finish();
      t2 = std::chrono::high_resolution_clock::now();
      times.push_back(std::chrono::duration_cast<std::chrono::duration<double> >(t2 - t1).count());

      status = "Executing triad";
      t1 = std::chrono::high_resolution_clock::now();
      triad(
        cl::EnqueueArgs(
          queue,
          cl::NDRange(ARRAY_SIZE)),
        d_a, d_b, d_c);
      queue.finish();
      t2 = std::chrono::high_resolution_clock::now();
      times.push_back(std::chrono::duration_cast<std::chrono::duration<double> >(t2 - t1).count());

      timings.push_back(times);

    }

    // Check solutions
    status = "Copying back buffers";
    queue.enqueueReadBuffer(d_a, CL_FALSE, 0, ARRAY_SIZE*DATATYPE_SIZE, h_a);
    queue.enqueueReadBuffer(d_b, CL_FALSE, 0, ARRAY_SIZE*DATATYPE_SIZE, h_b);
    queue.enqueueReadBuffer(d_c, CL_FALSE, 0, ARRAY_SIZE*DATATYPE_SIZE, h_c);
    queue.finish();

    if (useFloat)
    {
      check_solution<float>(h_a, h_b, h_c);
    }
    else
    {
      check_solution<double>(h_a, h_b, h_c);
    }

    // Crunch results
    size_t sizes[4] = {
      2 * DATATYPE_SIZE * ARRAY_SIZE,
      2 * DATATYPE_SIZE * ARRAY_SIZE,
      3 * DATATYPE_SIZE * ARRAY_SIZE,
      3 * DATATYPE_SIZE * ARRAY_SIZE
    };
    double min[4] = {DBL_MAX, DBL_MAX, DBL_MAX, DBL_MAX};
    double max[4] = {0.0, 0.0, 0.0, 0.0};
    double avg[4] = {0.0, 0.0, 0.0, 0.0};
    // Ignore first result
    for (unsigned int i = 1; i < NTIMES; i++)
    {
      for (int j = 0; j < 4; j++)
      {
        avg[j] += timings[i][j];
        min[j] = std::min(min[j], timings[i][j]);
        max[j] = std::max(max[j], timings[i][j]);
      }
    }
    for (int j = 0; j < 4; j++)
      avg[j] /= (double)(NTIMES-1);

    // Display results
    std::string labels[] = {"Copy", "Mul", "Add", "Triad"};
    std::cout
      << std::left << std::setw(12) << "Function"
      << std::left << std::setw(12) << "MBytes/sec"
      << std::left << std::setw(12) << "Min (sec)"
      << std::left << std::setw(12) << "Max"
      << std::left << std::setw(12) << "Average"
      << std::endl;
    for (int j = 0; j < 4; j++)
    {
      std::cout
        << std::left << std::setw(12) << labels[j]
        << std::left << std::setw(12) << std::setprecision(3) << 1.0E-06 * sizes[j]/min[j]
        << std::left << std::setw(12) << std::setprecision(5) << min[j]
        << std::left << std::setw(12) << std::setprecision(5) << max[j]
        << std::left << std::setw(12) << std::setprecision(5) << avg[j]
        << std::endl;
    }

    // Free host vectors
    free(h_a);
    free(h_b);
    free(h_c);

  }
  catch (cl::Error &e)
  {
    die(status, e);
  }
  catch (std::exception& e)
  {
    std::cerr
      << "Error: "
      << e.what()
      << std::endl;
    exit(EXIT_FAILURE);
  }

}

unsigned getDeviceList(std::vector<cl::Device>& devices)
{
  // Get list of platforms
  std::vector<cl::Platform> platforms;
  try
  {
    cl::Platform::get(&platforms);
  }
  catch (cl::Error &e)
  {
    die("Getting platforms", e);
  }

  // Enumerate devices
  for (unsigned int i = 0; i < platforms.size(); i++)
  {
    std::vector<cl::Device> plat_devices;
    try
    {
      platforms[i].getDevices(CL_DEVICE_TYPE_ALL, &plat_devices);
    }
    catch (cl::Error &e)
    {
      die("Getting devices", e);
    }
    devices.insert(devices.end(), plat_devices.begin(), plat_devices.end());
  }

  return devices.size();
}

std::string getDeviceName(const cl::Device& device)
{
  std::string name;
  cl_device_info info = CL_DEVICE_NAME;

  try
  {
    // Special case for AMD
#ifdef CL_DEVICE_BOARD_NAME_AMD
    device.getInfo(CL_DEVICE_VENDOR, &name);
    if (strstr(name.c_str(), "Advanced Micro Devices"))
      info = CL_DEVICE_BOARD_NAME_AMD;
#endif

    device.getInfo(info, &name);
  }
  catch (cl::Error &e)
  {
    die("Getting device name", e);
  }

  return name;
}

std::string getDeviceDriver(const cl::Device& device)
{
  std::string driver;
  try
  {
    device.getInfo(CL_DRIVER_VERSION, &driver);
  }
  catch (cl::Error &e)
  {
    die("Getting device driver", e);
  }

  return driver;
}

void listDevices(void)
{
  // Get list of devices
  std::vector<cl::Device> devices;
  getDeviceList(devices);

  // Print device names
  if (devices.size() == 0)
  {
    std::cout << "No devices found." << std::endl;
  }
  else
  {
    std::cout << std::endl;
    std::cout << "Devices:" << std::endl;
    for (unsigned i = 0; i < devices.size(); i++)
    {
      std::cout << i << ": " << getDeviceName(devices[i]) << std::endl;
    }
    std::cout << std::endl;
  }
}
@ -1,4 +0,0 @@
./gpu-stream-cuda
./gpu-stream-cuda --groups 64 --groupSize 256
./gpu-stream-cuda --float
./gpu-stream-cuda --float --groups 64 --groupSize 256