Merge branch 'master' into cuda-memory

2017-05-11 11:28:33 +01:00 · 2017-05-11 11:28:33 +01:00 · 3bd65a0716
commit 3bd65a0716
parent 8d66a27131 dafc63030f
23 changed files with 485 additions and 562 deletions
--- a/.gitignore
+++ b/.gitignore
@ -1,18 +1,19 @@
-gpu-stream-cuda
+cuda-stream
-gpu-stream-ocl
+ocl-stream
-gpu-stream-acc
+omp-stream
-gpu-stream-omp3
+acc-stream
-gpu-stream-omp45
+raja-stream
-gpu-stream-sycl
+kokkos-stream
 sycl-stream
 hip-stream
 *.o
 *.bc
 *.sycl
 *.tar
 *.gz
 .DS_Store
 CMakeCache.txt
 CMakeFiles/
 cmake_install.cmake
 Makefile
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@ -1,198 +0,0 @@
 # Top-level project setup: locate HIP, select the C++ standard, generate the
 # version header and choose a default build type.
 cmake_minimum_required(VERSION 3.2)
 # Resolve HIP_PATH: an explicit -DHIP_PATH wins, then the HIP_PATH environment
 # variable, and finally the hard-coded default /opt/rocm/hip.
 if(NOT DEFINED HIP_PATH)
    if(NOT DEFINED ENV{HIP_PATH})
        set(HIP_PATH "/opt/rocm/hip" CACHE PATH "Path to which HIP has been installed")
    else()
        set(HIP_PATH $ENV{HIP_PATH} CACHE PATH "Path to which HIP has been installed")
    endif()
 endif()
 # Put HIP's cmake directory at the front of the module search path so that
 # find_package(HIP) further down can locate its find module.
 set(CMAKE_MODULE_PATH "${HIP_PATH}/cmake" ${CMAKE_MODULE_PATH})
 # Require C++11 for every target declared in this file.
 set(CMAKE_CXX_STANDARD 11)
 set(CMAKE_CXX_STANDARD_REQUIRED ON)
 project(gpu-stream)
 include(CheckIncludeFileCXX)
 include(CheckCXXCompilerFlag)
 # Version numbers; presumably substituted into common.h by configure_file
 # below - TODO confirm against common.h.in.
 set(gpu-stream_VERSION_MAJOR 2)
 set(gpu-stream_VERSION_MINOR 2)
 configure_file(common.h.in common.h)
 # Make the build directory an include directory for all targets.
 include_directories(${CMAKE_BINARY_DIR})
 # Use 'Release' if no build type specified
 if (NOT CMAKE_BUILD_TYPE)
   message("No CMAKE_BUILD_TYPE specified, defaulting to 'Release'")
   set(CMAKE_BUILD_TYPE "Release")
 endif()
 # If using the Cray compiler, manually add the C++11 flag because setting the
 # standard through CMake as above doesn't set this flag with Cray.
 # CMAKE_CXX_FLAGS is one space-separated string, not a ;-separated list, so
 # the flag is appended with set(): list(APPEND) would insert a ';' into the
 # compiler command line whenever CMAKE_CXX_FLAGS was already non-empty.
 if ("${CMAKE_CXX_COMPILER_ID}" MATCHES "Cray")
   set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -hstd=c++11")
 endif ()
 #-------------------------------------------------------------------------------
 # HIP
 #-------------------------------------------------------------------------------
 # Build the HIP binary only when a HIP installation is found; otherwise skip
 # it with a message.
 find_package(HIP QUIET)
 # Test the variable by name: if() dereferences it itself, and an unset or
 # empty HIP_FOUND is simply false instead of producing a malformed condition.
 if(HIP_FOUND)
   list(APPEND HIP_HIPCC_FLAGS --std=c++11)
   hip_add_executable(gpu-stream-hip main.cpp HIPStream.cu)
   target_compile_definitions(gpu-stream-hip PUBLIC HIP)
 else()
   message("Skipping HIP...")
 endif()
 #-------------------------------------------------------------------------------
 # CUDA
 #-------------------------------------------------------------------------------
 # Build the CUDA binary when CUDA >= 7.0 is found, except on macOS with
 # Xcode 7.3.1, which is reported below as unsupported by CUDA.
 find_package(CUDA 7.0 QUIET)
 # FLAG stays True unless the unsupported Xcode version is detected.
 set(FLAG True)
 if ("${CMAKE_SYSTEM_NAME}" MATCHES "Darwin")
   # The first line of `xcodebuild -version` carries the Xcode version string.
   execute_process(COMMAND xcodebuild -version COMMAND head -n 1 OUTPUT_VARIABLE XCODE_VERSION)
   if ("${XCODE_VERSION}" MATCHES "Xcode 7.3.1")
     message("Xcode version not supported by CUDA")
     set(FLAG False)
   endif ()
 endif ()
 # Name the variables rather than expanding them: an empty CUDA_FOUND would
 # otherwise leave a dangling AND and abort configuration.
 if (FLAG AND CUDA_FOUND)
   list(APPEND CUDA_NVCC_FLAGS --std=c++11)
   cuda_add_executable(gpu-stream-cuda main.cpp CUDAStream.cu)
   target_compile_definitions(gpu-stream-cuda PUBLIC CUDA)
 else ()
   message("Skipping CUDA...")
 endif ()
 #-------------------------------------------------------------------------------
 # OpenCL
 #-------------------------------------------------------------------------------
 # Build the OpenCL binary when an OpenCL installation is found.
 find_package(OpenCL QUIET)
 if (OpenCL_FOUND)
   add_executable(gpu-stream-ocl main.cpp OCLStream.cpp)
   target_compile_definitions(gpu-stream-ocl PUBLIC OCL)
   # Add the header location reported by FindOpenCL; without it the build only
   # works when the OpenCL headers sit on the compiler's default search path.
   target_include_directories(gpu-stream-ocl PRIVATE ${OpenCL_INCLUDE_DIRS})
   target_link_libraries(gpu-stream-ocl PRIVATE ${OpenCL_LIBRARIES})
 else ()
   message("Skipping OpenCL...")
 endif ()
 #-------------------------------------------------------------------------------
 # OpenACC
 #-------------------------------------------------------------------------------
 # Pick the OpenACC flag spelling for the active compiler, check that the
 # compiler accepts it, and apply it to the OpenACC target only.
 include(CheckCXXCompilerFlag)
 if ("${CMAKE_CXX_COMPILER_ID}" MATCHES "GNU")
   set(OPENACC_FLAG -fopenacc)
 elseif ("${CMAKE_CXX_COMPILER_ID}" MATCHES "PGI")
   set(OPENACC_FLAG -acc)
 elseif ("${CMAKE_CXX_COMPILER_ID}" MATCHES "Cray")
   set(OPENACC_FLAG -hacc=openacc)
 endif ()
 if (OPENACC_FLAG)
   CHECK_CXX_COMPILER_FLAG(${OPENACC_FLAG} OPENACC)
 endif ()
 if (OPENACC)
   add_executable(gpu-stream-acc main.cpp ACCStream.cpp)
   target_compile_definitions(gpu-stream-acc PUBLIC ACC)
   # Apply the flag per target for both compiling and linking. The previous
   # list(APPEND CMAKE_EXE_LINKER_FLAGS ...) treated a space-separated string
   # as a ;-list and changed the link line of every executable in the project.
   set_target_properties(gpu-stream-acc PROPERTIES
     COMPILE_FLAGS "${OPENACC_FLAG}"
     LINK_FLAGS "${OPENACC_FLAG}")
 else ()
   message("Skipping OpenACC...")
 endif ()
 #-------------------------------------------------------------------------------
 # OpenMP 3.0
 #-------------------------------------------------------------------------------
 # Build the OpenMP 3 binary when the compiler supports OpenMP.
 find_package(OpenMP QUIET)
 # Accept either spelling of the result variable.
 # NOTE(review): older FindOpenMP releases are believed to set only the
 # upper-case OPENMP_FOUND - confirm against the minimum CMake version in use.
 if (OpenMP_FOUND OR OPENMP_FOUND)
   add_executable(gpu-stream-omp3 main.cpp OMP3Stream.cpp)
   target_compile_definitions(gpu-stream-omp3 PUBLIC OMP3)
   # Finding OpenMP does not enable it: the flags have to be handed to the
   # target for both the compile and the link step.
   set_target_properties(gpu-stream-omp3 PROPERTIES
     COMPILE_FLAGS "${OpenMP_CXX_FLAGS}"
     LINK_FLAGS "${OpenMP_CXX_FLAGS}")
 else ()
   message("Skipping OpenMP 3...")
 endif ()
 #-------------------------------------------------------------------------------
 # OpenMP 4.5
 #-------------------------------------------------------------------------------
 # Built only with the Cray compiler, and only when the compiler's own version
 # number is greater than 4.5. Unlike the other sections, nothing is printed
 # when this implementation is skipped.
 # NOTE(review): the comparison is against the compiler version, not an OpenMP
 # specification version - confirm this is the intended test.
 if ("${CMAKE_CXX_COMPILER_ID}" MATCHES "Cray")
   if (CMAKE_CXX_COMPILER_VERSION VERSION_GREATER 4.5)
     add_executable(gpu-stream-omp45 main.cpp OMP45Stream.cpp)
     target_compile_definitions(gpu-stream-omp45 PUBLIC OMP45)
   endif ()
 endif ()
 #-------------------------------------------------------------------------------
 # RAJA
 #-------------------------------------------------------------------------------
 # Opt-in: built only when the user passes -DRAJA_PATH=/path/to/raja. The RAJA
 # sources are compiled through the CUDA toolchain with OpenMP enabled for the
 # host compiler.
 if (RAJA_PATH)
   find_package(OpenMP)
   find_package(CUDA 7.5)
   # NOTE(review): "-arch compute_35" is appended as a single list element
   # containing a space - confirm it reaches nvcc as two arguments.
   list(APPEND CUDA_NVCC_FLAGS "-arch compute_35")
   list(APPEND CUDA_NVCC_FLAGS --expt-extended-lambda)
   # Forward the OpenMP flags to the host compiler invoked by nvcc.
   list(APPEND CUDA_NVCC_FLAGS -Xcompiler ${OpenMP_CXX_FLAGS})
   list(APPEND CUDA_NVCC_FLAGS -DUSE_RAJA)
   cuda_include_directories(${RAJA_PATH}/include)
   # Have RAJAStream.cpp (a .cpp, not a .cu file) compiled by nvcc as well.
   set_source_files_properties(RAJAStream.cpp PROPERTIES CUDA_SOURCE_PROPERTY_FORMAT OBJ)
   cuda_add_executable(gpu-stream-raja main.cpp RAJAStream.cpp)
   target_compile_definitions(gpu-stream-raja PUBLIC USE_RAJA)
   # Link against libRAJA from the user-supplied installation.
   target_link_libraries(gpu-stream-raja "-L${RAJA_PATH}/lib -lRAJA")
 else()
   message("Skipping RAJA... (use -DRAJA_PATH=/path/to/raja to opt in)")
 endif()
 #-------------------------------------------------------------------------------
 # Kokkos
 #-------------------------------------------------------------------------------
 # Opt-in: built only when the user passes -DKOKKOS_PATH=/path/to/kokkos, and
 # only on Linux. The build is delegated to a separate Makefile.
 if (KOKKOS_PATH)
   if ("${CMAKE_SYSTEM_NAME}" MATCHES "Linux")
     # Run make from the source directory: custom targets otherwise execute in
     # the build directory, where KokkosMakefile does not exist for
     # out-of-source builds. VERBATIM keeps argument escaping portable.
     # NOTE(review): assumes KokkosMakefile lives next to this CMakeLists.txt.
     add_custom_target(gpu-stream-kokkos
       COMMAND make -f KokkosMakefile KOKKOS_PATH=${KOKKOS_PATH}
       WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}
       VERBATIM)
   else()
     message("Skipping Kokkos (requires Linux)")
   endif()
 else()
   message("Skipping Kokkos... (use -DKOKKOS_PATH=/path/to/kokkos to opt in)")
 endif()
 #-------------------------------------------------------------------------------
 # SYCL
 #-------------------------------------------------------------------------------
 # Build the SYCL binary when CL/sycl.hpp can be included. With Clang or GNU
 # the header probe is run with an explicit C++ standard flag.
 if ("${CMAKE_CXX_COMPILER_ID}" MATCHES "Clang" OR
     "${CMAKE_CXX_COMPILER_ID}" MATCHES "GNU")
   # Use C++14 if available, otherwise drop back to C++11
   check_cxx_compiler_flag("-std=c++14" CXX14)
   if (CXX14)
     set(CMAKE_REQUIRED_FLAGS "-std=c++14")
   else()
     set(CMAKE_REQUIRED_FLAGS "-std=c++11")
   endif()
 endif()
 check_include_file_cxx("CL/sycl.hpp" HAS_SYCL)
 if (HAS_SYCL)
   add_executable(gpu-stream-sycl main.cpp SYCLStream.cpp)
   target_compile_definitions(gpu-stream-sycl PUBLIC SYCL)
   # The user must define this in order to use FindComputeCpp
   if (COMPUTECPP_PACKAGE_ROOT_DIR)
     message(STATUS "Using ComputeCpp for SYCL compilation")
     # FindComputeCpp.cmake ships in cmake/Modules of this repository, which is
     # not otherwise on the module path; add it so the include() resolves.
     list(APPEND CMAKE_MODULE_PATH "${CMAKE_CURRENT_SOURCE_DIR}/cmake/Modules")
     include(FindComputeCpp)
     include_directories(${COMPUTECPP_INCLUDE_DIRECTORY})
     set(SOURCE_NAME "SYCLStream")
     target_compile_options(gpu-stream-sycl PUBLIC ${HOST_COMPILER_OPTIONS})
     add_sycl_to_target(gpu-stream-sycl
                        ${CMAKE_CURRENT_SOURCE_DIR}/${SOURCE_NAME}.cpp
                        ${CMAKE_CURRENT_BINARY_DIR})
   else()
     message(STATUS "Using header-only SYCL implementation")
     set_property(TARGET gpu-stream-sycl PROPERTY CXX_STANDARD 14)
   endif()
 else ()
   message("Skipping SYCL...")
 endif ()
--- a/CUDA.make
+++ b/CUDA.make
@ -0,0 +1,10 @@
 # Builds the CUDA implementation as `cuda-stream`.
 # CXXFLAGS and CUDA_CXX can be overridden on the make command line;
 # EXTRA_FLAGS is appended to the compile line unchanged.
 CXXFLAGS=-O3
 CUDA_CXX=nvcc
 # Compile and link both sources in a single compiler invocation.
 cuda-stream: main.cpp CUDAStream.cu
 	$(CUDA_CXX) -std=c++11 $(CXXFLAGS) -DCUDA $^ $(EXTRA_FLAGS) -o $@
 .PHONY: clean
 clean:
 	rm -f cuda-stream
--- a/CUDAStream.cu
+++ b/CUDAStream.cu
@ -215,9 +215,7 @@ void CUDAStream<T>::triad()
 template <class T>
 __global__ void dot_kernel(const T * a, const T * b, T * sum, unsigned int array_size)
 {
-
+  __shared__ T tb_sum[TBSIZE];
  extern __shared__ __align__(sizeof(T)) unsigned char smem[];
  T *tb_sum = reinterpret_cast<T*>(smem);
  int i = blockDim.x * blockIdx.x + threadIdx.x;
  const size_t local_i = threadIdx.x;
@ -242,7 +240,7 @@ __global__ void dot_kernel(const T * a, const T * b, T * sum, unsigned int array
 template <class T>
 T CUDAStream<T>::dot()
 {
-  dot_kernel<<<DOT_NUM_BLOCKS, TBSIZE, sizeof(T)*TBSIZE>>>(d_a, d_b, d_sum, array_size);
+  dot_kernel<<<DOT_NUM_BLOCKS, TBSIZE>>>(d_a, d_b, d_sum, array_size);
  check_error();
 #if defined(MANAGED) || defined(PAGEFAULT)
--- a/HIP.make
+++ b/HIP.make
@ -0,0 +1,12 @@
 # Builds the HIP implementation as `hip-stream`.
 # TODO: HIP with HCC
 HIPCC = hipcc
 # Default to optimised builds, matching the other per-model Makefiles. `?=`
 # keeps any CXXFLAGS supplied through the environment or the command line.
 CXXFLAGS ?= -O3
 # Compile and link both sources in a single compiler invocation; EXTRA_FLAGS
 # is appended to the compile line unchanged.
 hip-stream: main.cpp HIPStream.cpp
 	$(HIPCC) $(CXXFLAGS) -std=c++11 -DHIP $^ $(EXTRA_FLAGS) -o $@
 .PHONY: clean
 clean:
 	rm -f hip-stream
--- a/HIPStream.cpp
+++ b/HIPStream.cpp
@ -9,6 +9,7 @@
 #include "hip/hip_runtime.h"
 #define TBSIZE 1024
 #define DOT_NUM_BLOCKS 256
 void check_error(void)
 {
@ -47,6 +48,9 @@ HIPStream<T>::HIPStream(const unsigned int ARRAY_SIZE, const int device_index)
  array_size = ARRAY_SIZE;
  // Allocate the host array for partial sums for dot kernels
  sums = (T*)malloc(sizeof(T) * DOT_NUM_BLOCKS);
  // Check buffers fit on the device
  hipDeviceProp_t props;
  hipGetDeviceProperties(&props, 0);
@ -60,20 +64,27 @@ HIPStream<T>::HIPStream(const unsigned int ARRAY_SIZE, const int device_index)
  check_error();
  hipMalloc(&d_c, ARRAY_SIZE*sizeof(T));
  check_error();
  hipMalloc(&d_sum, DOT_NUM_BLOCKS*sizeof(T));
  check_error();
 }
 // Destructor: frees the host-side partial-sum buffer, then each of the four
 // device buffers, calling check_error() after every device free.
 template <class T>
 HIPStream<T>::~HIPStream()
 {
   // sums was obtained with malloc() in the constructor
   free(sums);
   hipFree(d_a);
   check_error();
   hipFree(d_b);
   check_error();
   hipFree(d_c);
   check_error();
   hipFree(d_sum);
   check_error();
 }
 template <typename T>
 __global__ void init_kernel(hipLaunchParm lp, T * a, T * b, T * c, T initA, T initB, T initC)
 {
@ -171,6 +182,46 @@ void HIPStream<T>::triad()
  check_error();
 }
 // Dot-product kernel: every block reduces its share of a[i]*b[i] and writes
 // one partial sum to sum[hipBlockIdx_x].
 //   a, b        input arrays
 //   sum         output array, one element written per block
 //   array_size  number of elements read from a and b
 // NOTE(review): assumes the block size does not exceed TBSIZE and is a power
 // of two (the halving loop below would otherwise skip elements) - confirm
 // against the launch configuration.
 template <class T>
 __global__ void dot_kernel(hipLaunchParm lp, const T * a, const T * b, T * sum, unsigned int array_size)
 {
   // Per-block scratch buffer, one slot per thread
   __shared__ T tb_sum[TBSIZE];
   int i = hipBlockDim_x * hipBlockIdx_x + hipThreadIdx_x;
   const size_t local_i = hipThreadIdx_x;
   // Each thread accumulates its products, stepping by the total number of
   // threads in the grid
   tb_sum[local_i] = 0.0;
   for (; i < array_size; i += hipBlockDim_x*hipGridDim_x)
     tb_sum[local_i] += a[i] * b[i];
   // Pairwise reduction within the block: the offset halves every pass, and
   // the barrier at the top of each pass makes the previous writes visible
   for (int offset = hipBlockDim_x / 2; offset > 0; offset /= 2)
   {
     __syncthreads();
     if (local_i < offset)
     {
       tb_sum[local_i] += tb_sum[local_i+offset];
     }
   }
   // Thread 0 publishes the block's result
   if (local_i == 0)
     sum[hipBlockIdx_x] = tb_sum[local_i];
 }
 // Computes the dot product of the device arrays d_a and d_b.
 // The kernel leaves one partial sum per block in d_sum; these are copied
 // back into the host buffer `sums` and added up on the host.
 template <class T>
 T HIPStream<T>::dot()
 {
   // Stage 1: per-block partial sums on the device
   hipLaunchKernel(HIP_KERNEL_NAME(dot_kernel), dim3(DOT_NUM_BLOCKS), dim3(TBSIZE), 0, 0, d_a, d_b, d_sum, array_size);
   check_error();

   // Stage 2: bring the DOT_NUM_BLOCKS partial sums back to the host
   hipMemcpy(sums, d_sum, DOT_NUM_BLOCKS*sizeof(T), hipMemcpyDeviceToHost);
   check_error();

   // Stage 3: final accumulation, in the same index order as before
   T total = 0.0;
   const T * const last = sums + DOT_NUM_BLOCKS;
   for (const T *partial = sums; partial != last; ++partial)
     total += *partial;

   return total;
 }
 void listDevices(void)
 {
--- a/HIPStream.h
+++ b/HIPStream.h
@ -21,10 +21,15 @@ class HIPStream : public Stream<T>
  protected:
    // Size of arrays
    unsigned int array_size;
    // Host array for partial sums for dot kernel
    T *sums;
    // Device side pointers to arrays
    T *d_a;
    T *d_b;
    T *d_c;
    T *d_sum;
  public:
@ -36,6 +41,7 @@ class HIPStream : public Stream<T>
    virtual void add() override;
    virtual void mul() override;
    virtual void triad() override;
    virtual T dot() override;
    virtual void init_arrays(T initA, T initB, T initC) override;
    virtual void read_arrays(std::vector<T>& a, std::vector<T>& b, std::vector<T>& c) override;
--- a/KOKKOSStream.cpp
+++ b/KOKKOSStream.cpp
@ -39,10 +39,10 @@ void KOKKOSStream<T>::init_arrays(T initA, T initB, T initC)
  View<double*, DEVICE> a(*d_a);
  View<double*, DEVICE> b(*d_b);
  View<double*, DEVICE> c(*d_c);
-  parallel_for(array_size, KOKKOS_LAMBDA (const int index)
+  parallel_for(array_size, KOKKOS_LAMBDA (const long index)
  {
    a[index] = initA;
-    b[index] - initB;
+    b[index] = initB;
    c[index] = initC;
  });
  Kokkos::fence();
@ -70,7 +70,7 @@ void KOKKOSStream<T>::copy()
  View<double*, DEVICE> b(*d_b);
  View<double*, DEVICE> c(*d_c);
-  parallel_for(array_size, KOKKOS_LAMBDA (const int index)
+  parallel_for(array_size, KOKKOS_LAMBDA (const long index)
  {
    c[index] = a[index];
  });
@ -85,7 +85,7 @@ void KOKKOSStream<T>::mul()
  View<double*, DEVICE> c(*d_c);
  const T scalar = startScalar;
-  parallel_for(array_size, KOKKOS_LAMBDA (const int index)
+  parallel_for(array_size, KOKKOS_LAMBDA (const long index)
  {
    b[index] = scalar*c[index];
  });
@ -99,7 +99,7 @@ void KOKKOSStream<T>::add()
  View<double*, DEVICE> b(*d_b);
  View<double*, DEVICE> c(*d_c);
-  parallel_for(array_size, KOKKOS_LAMBDA (const int index)
+  parallel_for(array_size, KOKKOS_LAMBDA (const long index)
  {
    c[index] = a[index] + b[index];
  });
@ -114,7 +114,7 @@ void KOKKOSStream<T>::triad()
  View<double*, DEVICE> c(*d_c);
  const T scalar = startScalar;
-  parallel_for(array_size, KOKKOS_LAMBDA (const int index)
+  parallel_for(array_size, KOKKOS_LAMBDA (const long index)
  {
    a[index] = b[index] + scalar*c[index];
  });
@ -129,7 +129,7 @@ T KOKKOSStream<T>::dot()
  T sum = 0.0;
-  parallel_reduce(array_size, KOKKOS_LAMBDA (const int index, double &tmp)
+  parallel_reduce(array_size, KOKKOS_LAMBDA (const long index, double &tmp)
  {
    tmp += a[index] * b[index];
  }, sum);
--- a/Kokkos.make
+++ b/Kokkos.make
@ -0,0 +1,45 @@
 # Builds the Kokkos implementation as `kokkos-stream`.
 # Inputs: KOKKOS_PATH (required by the include below), COMPILER (GNU or
 # INTEL), TARGET (CPU or GPU) and EXTRA_FLAGS.
 # `default` is declared before the include, presumably so that no rule from
 # Makefile.kokkos becomes the default goal.
 default: kokkos-stream
 include $(KOKKOS_PATH)/Makefile.kokkos
 # Print the help text and fall back to GNU when COMPILER is not given.
 ifndef COMPILER
 define compiler_help
 Set COMPILER to change flags (defaulting to GNU).
 Available compilers are:
  GNU INTEL
 endef
 $(info $(compiler_help))
 COMPILER=GNU
 endif
 # Compiler command per COMPILER value; CXX is looked up by name.
 COMPILER_GNU = g++
 COMPILER_INTEL = icpc -qopt-streaming-stores=always
 CXX = $(COMPILER_$(COMPILER))
 # Print the help text and fall back to CPU when TARGET is not given.
 ifndef TARGET
 define target_help
 Set TARGET to change to offload device. Defaulting to CPU.
 Available targets are:
  CPU (default)
  GPU
 endef
 $(info $(target_help))
 TARGET=CPU
 endif
 # CPU builds define KOKKOS_TARGET_CPU; GPU builds replace the compiler with
 # $(NVCC_WRAPPER) and add no define. Any other TARGET value matches neither
 # branch and leaves CXX as selected above.
 ifeq ($(TARGET), CPU)
 TARGET_DEF = -DKOKKOS_TARGET_CPU
 else ifeq ($(TARGET), GPU)
 CXX = $(NVCC_WRAPPER)
 TARGET_DEF =
 endif
 # Compile and link in one step using the flag and library variables that
 # appear in the recipe (KOKKOS_CPPFLAGS, KOKKOS_CXXFLAGS, KOKKOS_LDFLAGS,
 # KOKKOS_LIBS); presumably these come from Makefile.kokkos.
 kokkos-stream: main.cpp KOKKOSStream.cpp $(KOKKOS_CPP_DEPENDS)
 	$(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(KOKKOS_LDFLAGS) main.cpp KOKKOSStream.cpp $(KOKKOS_LIBS) -o $@ -DKOKKOS $(TARGET_DEF) -O3 $(EXTRA_FLAGS)
 .PHONY: clean
 clean:
 	rm -f kokkos-stream
--- a/10
+++ b/10
@ -1,10 +0,0 @@
 default: gpu-stream-kokkos
 include $(KOKKOS_PATH)/Makefile.kokkos
 gpu-stream-kokkos: main.o KOKKOSStream.o
 	$(CXX) $(KOKKOS_LDFLAGS) $^ $(KOKKOS_LIBS) -o $@ -DKOKKOS -DKOKKOS_TARGET_CPU -O3
 %.o:%.cpp $(KOKKOS_CPP_DEPENDS)
 	$(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) -c $< -DKOKKOS -DKOKKOS_TARGET_CPU -O3
--- a/11
+++ b/11
@ -1,11 +0,0 @@
 default: gpu-stream-kokkos
 include $(KOKKOS_PATH)/Makefile.kokkos
 gpu-stream-kokkos: main.o KOKKOSStream.o
 	$(CXX) $(KOKKOS_LDFLAGS) $^ $(KOKKOS_LIBS) -o $@ -DKOKKOS -O3
 %.o:%.cpp $(KOKKOS_CPP_DEPENDS)
 	$(NVCC_WRAPPER) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) -c $< -DKOKKOS -O3
--- a/14
+++ b/14
@ -12,22 +12,22 @@
 *  3. You are free to publish results obtained from running this
 *     program, or from works that you derive from this program,
 *     with the following limitations:
-*     3a. In order to be referred to as "GPU-STREAM benchmark results",
+*     3a. In order to be referred to as "BabelStream benchmark results",
-*         published results must be in conformance to the GPU-STREAM
+*         published results must be in conformance to the BabelStream
 *         Run Rules published at
-*         http://github.com/UoB-HPC/GPU-STREAM/wiki/Run-Rules
+*         http://github.com/UoB-HPC/BabelStream/wiki/Run-Rules
 *         and incorporated herein by reference.
 *         The copyright holders retain the
 *         right to determine conformity with the Run Rules.
 *     3b. Results based on modified source code or on runs not in
-*         accordance with the GPU-STREAM Run Rules must be clearly
+*         accordance with the BabelStream Run Rules must be clearly
 *         labelled whenever they are published.  Examples of
 *         proper labelling include:
-*         "tuned GPU-STREAM benchmark results"
+*         "tuned BabelStream benchmark results"
-*         "based on a variant of the GPU-STREAM benchmark code"
+*         "based on a variant of the BabelStream benchmark code"
 *         Other comparable, clear and reasonable labelling is
 *         acceptable.
-*     3c. Submission of results to the GPU-STREAM benchmark web site
+*     3c. Submission of results to the BabelStream benchmark web site
 *         is encouraged, but not required.
 *  4. Use of this program or creation of derived works based on this
 *     program constitutes acceptance of these licensing restrictions.
--- a/OMPStream.cpp
+++ b/OMPStream.cpp
@ -7,6 +7,10 @@
 #include "OMPStream.h"
 #ifndef ALIGNMENT
 #define ALIGNMENT (2*1024*1024) // 2MB
 #endif
 template <class T>
 OMPStream<T>::OMPStream(const unsigned int ARRAY_SIZE, T *a, T *b, T *c, int device)
 {
@ -22,9 +26,9 @@ OMPStream<T>::OMPStream(const unsigned int ARRAY_SIZE, T *a, T *b, T *c, int dev
  {}
 #else
  // Allocate on the host
-  this->a = (T*)malloc(sizeof(T)*array_size);
+  this->a = (T*)aligned_alloc(ALIGNMENT, sizeof(T)*array_size);
-  this->b = (T*)malloc(sizeof(T)*array_size);
+  this->b = (T*)aligned_alloc(ALIGNMENT, sizeof(T)*array_size);
-  this->c = (T*)malloc(sizeof(T)*array_size);
+  this->c = (T*)aligned_alloc(ALIGNMENT, sizeof(T)*array_size);
 #endif
 }
--- a/OpenACC.make
+++ b/OpenACC.make
@ -0,0 +1,52 @@
 # Builds the OpenACC implementation as `acc-stream`.
 # Inputs: COMPILER (PGI or CRAY), TARGET (required for PGI) and EXTRA_FLAGS.
 # When COMPILER is not given only the help text is printed; no default is
 # assigned, so the empty-suffix variables COMPILER_ and FLAGS_ are used.
 ifndef COMPILER
 define compiler_help
 Set COMPILER to ensure correct flags are set.
 Available compilers are:
  PGI CRAY
 endef
 $(info $(compiler_help))
 endif
 # Compiler command and base flags, looked up by the value of COMPILER.
 COMPILER_ = $(CXX)
 COMPILER_PGI = pgc++
 COMPILER_CRAY = CC
 FLAGS_ = -O3 -std=c++11
 FLAGS_PGI = -std=c++11 -O3 -acc
 # PGI needs an explicit offload target: stop with the help text when TARGET
 # is missing or is not one of the names listed below.
 ifeq ($(COMPILER), PGI)
 define target_help
 Set a TARGET to ensure PGI targets the correct offload device.
 Available targets are:
  SNB, IVB, HSW
  KEPLER, MAXWELL, PASCAL
  HAWAII
 endef
 ifndef TARGET
 $(error $(target_help))
 endif
 # Per-target PGI flags, looked up by the value of TARGET.
 TARGET_FLAGS_SNB     = -ta=multicore -tp=sandybridge
 TARGET_FLAGS_IVB     = -ta=multicore -tp=ivybridge
 TARGET_FLAGS_HSW     = -ta=multicore -tp=haswell
 TARGET_FLAGS_KEPLER  = -ta=nvidia:cc35
 TARGET_FLAGS_MAXWELL = -ta=nvidia:cc50
 TARGET_FLAGS_PASCAL  = -ta=nvidia:cc60
 TARGET_FLAGS_HAWAII  = -ta=radeon:hawaii
 # An unknown TARGET expands to an empty flag set and is rejected here.
 ifeq ($(TARGET_FLAGS_$(TARGET)),)
 $(error $(target_help))
 endif
 FLAGS_PGI += $(TARGET_FLAGS_$(TARGET))
 endif
 FLAGS_CRAY = -hstd=c++11
 CXXFLAGS = $(FLAGS_$(COMPILER))
 # Compile and link both sources in a single compiler invocation.
 acc-stream: main.cpp ACCStream.cpp
 	$(COMPILER_$(COMPILER)) $(CXXFLAGS) -DACC $^ $(EXTRA_FLAGS) -o $@
 .PHONY: clean
 clean:
 	rm -f acc-stream main.o ACCStream.o
--- a/OpenCL.make
+++ b/OpenCL.make
@ -0,0 +1,39 @@
 # Builds the OpenCL implementation as `ocl-stream`.
 # Inputs: COMPILER (GNU, CLANG, INTEL or CRAY) and EXTRA_FLAGS.
 # Print the help text and fall back to GNU when COMPILER is not given.
 ifndef COMPILER
 define compiler_help
 Set COMPILER to change flags (defaulting to GNU).
 Available compilers are:
  GNU CLANG INTEL CRAY
 endef
 $(info $(compiler_help))
 COMPILER=GNU
 endif
 # Compiler command per COMPILER value; CXX is looked up by name.
 COMPILER_GNU = g++
 COMPILER_CLANG = clang++
 COMPILER_INTEL = icpc
 COMPILER_CRAY = CC
 CXX = $(COMPILER_$(COMPILER))
 # Compile flags per COMPILER value; FLAGS_ covers an empty COMPILER.
 FLAGS_ = -O3 -std=c++11
 FLAGS_GNU = -O3 -std=c++11
 FLAGS_CLANG = -O3 -std=c++11
 FLAGS_INTEL = -O3 -std=c++11
 FLAGS_CRAY = -O3 -hstd=c++11
 CXXFLAGS=$(FLAGS_$(COMPILER))
 # On Darwin (as reported by `uname -s`) OpenCL is linked as a framework;
 # everywhere else as -lOpenCL.
 PLATFORM = $(shell uname -s)
 ifeq ($(PLATFORM), Darwin)
   LIBS = -framework OpenCL
 else
   LIBS = -lOpenCL
 endif
 # Compile and link both sources in a single compiler invocation.
 ocl-stream: main.cpp OCLStream.cpp
 	$(CXX) $(CXXFLAGS) -DOCL $^ $(EXTRA_FLAGS) $(LIBS) -o $@
 .PHONY: clean
 clean:
 	rm -f ocl-stream
--- a/OpenMP.make
+++ b/OpenMP.make
@ -0,0 +1,60 @@
 # Builds the OpenMP implementation as `omp-stream`.
 # Inputs: COMPILER (CLANG, CRAY, GNU, INTEL or XL), TARGET (CPU or NVIDIA)
 # and EXTRA_FLAGS.
 # Print the help text and fall back to GNU when COMPILER is not given.
 ifndef COMPILER
 define compiler_help
 Set COMPILER to change flags (defaulting to GNU).
 Available compilers are:
  CLANG CRAY GNU INTEL XL
 endef
 $(info $(compiler_help))
 COMPILER=GNU
 endif
 # Print the help text and fall back to CPU when TARGET is not given.
 ifndef TARGET
 define target_help
 Set TARGET to change device (defaulting to CPU).
 Available targets are:
  CPU NVIDIA
 endef
 $(info $(target_help))
 TARGET=CPU
 endif
 # Compiler command per COMPILER value; CXX is looked up by name.
 COMPILER_GNU = g++
 COMPILER_INTEL = icpc
 COMPILER_CRAY = CC
 COMPILER_CLANG = clang++
 COMPILER_XL = xlc++
 CXX = $(COMPILER_$(COMPILER))
 # Optimisation and language-standard flags per COMPILER value.
 FLAGS_GNU = -O3 -std=c++11
 FLAGS_INTEL = -O3 -std=c++11 -xHOST -qopt-streaming-stores=always
 FLAGS_CRAY = -O3 -hstd=c++11
 FLAGS_CLANG = -O3 -std=c++11
 FLAGS_XL = -O5 -qarch=pwr8 -qtune=pwr8 -std=c++11
 CXXFLAGS = $(FLAGS_$(COMPILER))
 # OpenMP flags for CPUs
 OMP_GNU_CPU   = -fopenmp
 OMP_INTEL_CPU = -qopenmp
 OMP_CRAY_CPU  = -homp
 OMP_CLANG_CPU = -fopenmp=libomp
 OMP_XL_CPU = -qsmp=omp -qthreaded
 # OpenMP flags for NVIDIA
 OMP_CRAY_NVIDIA  = -DOMP_TARGET_GPU
 OMP_CLANG_NVIDIA = -DOMP_TARGET_GPU -fopenmp=libomp -fopenmp-targets=nvptx64-nvidia-cuda
 # Only compiler/target pairs with an OMP_<COMPILER>_<TARGET> entry above are
 # supported; any other combination stops the build with an error.
 ifndef OMP_$(COMPILER)_$(TARGET)
 $(error Targeting $(TARGET) with $(COMPILER) not supported)
 endif
 OMP = $(OMP_$(COMPILER)_$(TARGET))
 # Compile and link both sources in a single compiler invocation.
 omp-stream: main.cpp OMPStream.cpp
 	$(CXX) $(CXXFLAGS) -DOMP $^ $(OMP) $(EXTRA_FLAGS) -o $@
 .PHONY: clean
 clean:
 	rm -f omp-stream
--- a/RAJA.make
+++ b/RAJA.make
@ -0,0 +1,58 @@
 # Builds the RAJA implementation as `raja-stream`.
 # Inputs: RAJA_PATH (used for the include and library paths in the recipe),
 # TARGET (CPU or GPU), COMPILER (CPU builds), ARCH (GPU builds) and
 # EXTRA_FLAGS.
 # Print the help text and fall back to CPU when TARGET is not given.
 ifndef TARGET
 define target_help
 Set TARGET to change to offload device. Defaulting to CPU.
 Available targets are:
  CPU (default)
  GPU
 endef
 $(info $(target_help))
 TARGET=CPU
 endif
 # CPU builds: pick the host compiler and its flags from COMPILER.
 ifeq ($(TARGET), CPU)
 ifndef COMPILER
 define compiler_help
 Set COMPILER to change flags (defaulting to GNU).
 Available compilers are:
  INTEL GNU CRAY XL
 endef
 $(info $(compiler_help))
 COMPILER=GNU
 endif
 CXX_INTEL = icpc
 CXX_GNU   = g++
 CXX_CRAY  = CC
 CXX_XL    = xlc++
 CXXFLAGS_INTEL = -O3 -std=c++11 -qopenmp -xHost -qopt-streaming-stores=always
 CXXFLAGS_GNU   = -O3 -std=c++11 -fopenmp
 CXXFLAGS_CRAY  = -O3 -hstd=c++11
 CXXFLAGS_XL    = -O5 -std=c++11 -qarch=pwr8 -qtune=pwr8 -qsmp=omp -qthreaded
 CXX = $(CXX_$(COMPILER))
 CXXFLAGS = -DRAJA_TARGET_CPU $(CXXFLAGS_$(COMPILER))
 # GPU builds: always nvcc; ARCH must be supplied or the build stops with the
 # help text. Any TARGET other than CPU or GPU matches neither branch.
 else ifeq ($(TARGET), GPU)
 CXX = nvcc
 ifndef ARCH
 define arch_help
 Set ARCH to ensure correct GPU architecture.
 Example:
  ARCH=sm_35
 endef
 $(error $(arch_help))
 endif
 CXXFLAGS = --expt-extended-lambda -O3 -std=c++11 -x cu -Xcompiler -fopenmp -arch $(ARCH)
 endif
 # Compile and link both sources in a single compiler invocation.
 raja-stream: main.cpp RAJAStream.cpp
 	$(CXX) $(CXXFLAGS) -DUSE_RAJA -I$(RAJA_PATH)/include $^ $(EXTRA_FLAGS) -L$(RAJA_PATH)/lib -lRAJA -o $@
 .PHONY: clean
 clean:
 	rm -f raja-stream
--- a/RAJAStream.cpp
+++ b/RAJAStream.cpp
@ -10,6 +10,10 @@
 using RAJA::forall;
 using RAJA::RangeSegment;
 #ifndef ALIGNMENT
 #define ALIGNMENT (2*1024*1024) // 2MB
 #endif
 template <class T>
 RAJAStream<T>::RAJAStream(const unsigned int ARRAY_SIZE, const int device_index)
    : array_size(ARRAY_SIZE)
@ -18,9 +22,9 @@ RAJAStream<T>::RAJAStream(const unsigned int ARRAY_SIZE, const int device_index)
  index_set.push_back(seg);
 #ifdef RAJA_TARGET_CPU
-  d_a = new T[ARRAY_SIZE];
+  d_a = (T*)aligned_alloc(ALIGNMENT, sizeof(T)*array_size);
-  d_b = new T[ARRAY_SIZE];
+  d_b = (T*)aligned_alloc(ALIGNMENT, sizeof(T)*array_size);
-  d_c = new T[ARRAY_SIZE];
+  d_c = (T*)aligned_alloc(ALIGNMENT, sizeof(T)*array_size);
 #else
  cudaMallocManaged((void**)&d_a, sizeof(T)*ARRAY_SIZE, cudaMemAttachGlobal);
  cudaMallocManaged((void**)&d_b, sizeof(T)*ARRAY_SIZE, cudaMemAttachGlobal);
@ -33,9 +37,9 @@ template <class T>
 RAJAStream<T>::~RAJAStream()
 {
 #ifdef RAJA_TARGET_CPU
-  delete[] d_a;
+  free(d_a);
-  delete[] d_b;
+  free(d_b);
-  delete[] d_c;
+  free(d_c);
 #else
  cudaFree(d_a);
  cudaFree(d_b);
@ -46,10 +50,10 @@ RAJAStream<T>::~RAJAStream()
 template <class T>
 void RAJAStream<T>::init_arrays(T initA, T initB, T initC)
 {
-  T* a = d_a;
+  T* RAJA_RESTRICT a = d_a;
-  T* b = d_b;
+  T* RAJA_RESTRICT b = d_b;
-  T* c = d_c;
+  T* RAJA_RESTRICT c = d_c;
-  forall<policy>(index_set, [=] RAJA_DEVICE (int index)
+  forall<policy>(index_set, [=] RAJA_DEVICE (RAJA::Index_type index)
  {
    a[index] = initA;
    b[index] = initB;
@ -69,9 +73,9 @@ void RAJAStream<T>::read_arrays(
 template <class T>
 void RAJAStream<T>::copy()
 {
-  T* a = d_a;
+  T* RAJA_RESTRICT a = d_a;
-  T* c = d_c;
+  T* RAJA_RESTRICT c = d_c;
-  forall<policy>(index_set, [=] RAJA_DEVICE (int index)
+  forall<policy>(index_set, [=] RAJA_DEVICE (RAJA::Index_type index)
  {
    c[index] = a[index];
  });
@ -80,10 +84,10 @@ void RAJAStream<T>::copy()
 template <class T>
 void RAJAStream<T>::mul()
 {
-  T* b = d_b;
+  T* RAJA_RESTRICT b = d_b;
-  T* c = d_c;
+  T* RAJA_RESTRICT c = d_c;
  const T scalar = startScalar;
-  forall<policy>(index_set, [=] RAJA_DEVICE (int index)
+  forall<policy>(index_set, [=] RAJA_DEVICE (RAJA::Index_type index)
  {
    b[index] = scalar*c[index];
  });
@ -92,10 +96,10 @@ void RAJAStream<T>::mul()
 template <class T>
 void RAJAStream<T>::add()
 {
-  T* a = d_a;
+  T* RAJA_RESTRICT a = d_a;
-  T* b = d_b;
+  T* RAJA_RESTRICT b = d_b;
-  T* c = d_c;
+  T* RAJA_RESTRICT c = d_c;
-  forall<policy>(index_set, [=] RAJA_DEVICE (int index)
+  forall<policy>(index_set, [=] RAJA_DEVICE (RAJA::Index_type index)
  {
    c[index] = a[index] + b[index];
  });
@ -104,11 +108,11 @@ void RAJAStream<T>::add()
 template <class T>
 void RAJAStream<T>::triad()
 {
-  T* a = d_a;
+  T* RAJA_RESTRICT a = d_a;
-  T* b = d_b;
+  T* RAJA_RESTRICT b = d_b;
-  T* c = d_c;
+  T* RAJA_RESTRICT c = d_c;
  const T scalar = startScalar;
-  forall<policy>(index_set, [=] RAJA_DEVICE (int index)
+  forall<policy>(index_set, [=] RAJA_DEVICE (RAJA::Index_type index)
  {
    a[index] = b[index] + scalar*c[index];
  });
@ -117,12 +121,12 @@ void RAJAStream<T>::triad()
 template <class T>
 T RAJAStream<T>::dot()
 {
-  T* a = d_a;
+  T* RAJA_RESTRICT a = d_a;
-  T* b = d_b;
+  T* RAJA_RESTRICT b = d_b;
  RAJA::ReduceSum<reduce_policy, T> sum(0.0);
-  forall<policy>(index_set, [=] RAJA_DEVICE (int index)
+  forall<policy>(index_set, [=] RAJA_DEVICE (RAJA::Index_type index)
  {
    sum += a[index] * b[index];
  });
--- a/README.android
+++ b/README.android
@ -0,0 +1,36 @@
 Android (outdated instructions)
 ------------------
 Assuming you have a recent Android NDK available, you can use the
 toolchain that it provides to build GPU-STREAM. You should first
 use the NDK to generate a standalone toolchain:
    # Select a directory to install the toolchain to
    ANDROID_NATIVE_TOOLCHAIN=/path/to/toolchain
    ${NDK}/build/tools/make-standalone-toolchain.sh \
      --platform=android-14 \
      --toolchain=arm-linux-androideabi-4.8 \
      --install-dir=${ANDROID_NATIVE_TOOLCHAIN}
 Make sure that the OpenCL headers and library (libOpenCL.so) are
 available in `${ANDROID_NATIVE_TOOLCHAIN}/sysroot/usr/`.
 You should then be able to build GPU-STREAM:
    make CXX=${ANDROID_NATIVE_TOOLCHAIN}/bin/arm-linux-androideabi-g++
 Copy the executable and OpenCL kernels to the device:
    adb push gpu-stream-ocl /data/local/tmp
    adb push ocl-stream-kernels.cl /data/local/tmp
 Run GPU-STREAM from an adb shell:
    adb shell
    cd /data/local/tmp
    # Use float if device doesn't support double, and reduce array size
    ./gpu-stream-ocl --float -n 6 -s 10000000
--- a/README.md
+++ b/README.md
@ -1,4 +1,4 @@
-GPU-STREAM
+BabelStream
 ==========
 Measure memory transfer rates to/from global device memory on GPUs.
@ -16,53 +16,51 @@ Currently implemented are:
  - RAJA
  - SYCL
 This code was previously called GPU-STREAM.
 Website
 -------
-[uob-hpc.github.io/GPU-STREAM/](uob-hpc.github.io/GPU-STREAM/)
+[uob-hpc.github.io/BabelStream/](https://uob-hpc.github.io/BabelStream/)
 Usage
 -----
-CMake 3.2 or above is required.
+Drivers, compiler and software applicable to whichever implementation you would like to build against are required.
 Drivers, compiler and software applicable to whichever implementation you would like to build against. Our build system is designed to only build implementations in programming models that your system supports.
-Generate the Makefile with `cmake .`
+We have supplied a series of Makefiles, one for each programming model, to assist with building.
 The Makefiles contain common build options, and should be simple to customise for your needs too.
-Android (outdated instructions)
+General usage is `make -f <Model>.make`
------------------
+Common compiler flags and names can be set by passing a `COMPILER` option to Make, e.g. `make COMPILER=GNU`.
 Some models allow specifying a CPU or GPU style target, and this can be set by passing a `TARGET` option to Make, e.g. `make TARGET=GPU`.
-Assuming you have a recent Android NDK available, you can use the
+Pass in extra flags via the `EXTRA_FLAGS` option.
 toolchain that it provides to build GPU-STREAM. You should first
 use the NDK to generate a standalone toolchain:
-    # Select a directory to install the toolchain to
+The binaries are named in the form `<model>-stream`.
    ANDROID_NATIVE_TOOLCHAIN=/path/to/toolchain
-    ${NDK}/build/tools/make-standalone-toolchain.sh \
+Building Kokkos
-      --platform=android-14 \
+---------------
      --toolchain=arm-linux-androideabi-4.8 \
      --install-dir=${ANDROID_NATIVE_TOOLCHAIN}
-Make sure that the OpenCL headers and library (libOpenCL.so) are
+We use the following command to build Kokkos using the Intel Compiler, specifying the `arch` appropriately, e.g. `KNL`.
-available in `${ANDROID_NATIVE_TOOLCHAIN}/sysroot/usr/`.
+```
 ../generate_makefile.bash --prefix=<prefix> --with-openmp --with-pthread --arch=<arch> --compiler=icpc --cxxflags=-DKOKKOS_MEMORY_ALIGNMENT=2097152
 ```
 For building with CUDA support, we use the following command, specifying the `arch` appropriately, e.g. `Kepler35`.
 ```
 ../generate_makefile.bash --prefix=<prefix> --with-cuda --with-openmp --with-pthread --arch=<arch> --with-cuda-options=enable_lambda
 ```
-You should then be able to build GPU-STREAM:
+Building RAJA
-
+-------------
    make CXX=${ANDROID_NATIVE_TOOLCHAIN}/bin/arm-linux-androideabi-g++
 Copy the executable and OpenCL kernels to the device:
    adb push gpu-stream-ocl /data/local/tmp
    adb push ocl-stream-kernels.cl /data/local/tmp
 Run GPU-STREAM from an adb shell:
    adb shell
    cd /data/local/tmp
    # Use float if device doesn't support double, and reduce array size
    ./gpu-stream-ocl --float -n 6 -s 10000000
 We use the following command to build RAJA using the Intel Compiler.
 ```
 cmake .. -DCMAKE_INSTALL_PREFIX=<prefix> -DCMAKE_C_COMPILER=icc -DCMAKE_CXX_COMPILER=icpc -DRAJA_PTR="RAJA_USE_RESTRICT_ALIGNED_PTR" -DCMAKE_BUILD_TYPE=ICCBuild -DRAJA_ENABLE_TESTS=Off
 ```
 For building with CUDA support, we use the following command.
 ```
 cmake .. -DCMAKE_INSTALL_PREFIX=<prefix> -DRAJA_PTR="RAJA_USE_RESTRICT_ALIGNED_PTR" -DRAJA_ENABLE_CUDA=1 -DRAJA_ENABLE_TESTS=Off
 ```
 Results
 -------
@ -72,13 +70,17 @@ Sample results can be found in the `results` subdirectory. If you would like to
 Citing
 ------
-You can view the [Poster and Extended Abstract](http://sc15.supercomputing.org/sites/all/themes/SC15images/tech_poster/tech_poster_pages/post150.html) on GPU-STREAM presented at SC'15. Please cite GPU-STREAM via this reference:
+Please cite BabelStream via this reference:
 > Deakin T, Price J, Martineau M, McIntosh-Smith S. GPU-STREAM v2.0: Benchmarking the achievable memory bandwidth of many-core processors across diverse parallel programming models. 2016. Paper presented at P^3MA Workshop at ISC High Performance, Frankfurt, Germany.
-**Other GPU-STREAM publications:**
+**Other BabelStream publications:**
 > Deakin T, McIntosh-Smith S. GPU-STREAM: Benchmarking the achievable memory bandwidth of Graphics Processing Units. 2015. Poster session presented at IEEE/ACM SuperComputing, Austin, United States.
 You can view the [Poster and Extended Abstract](http://sc15.supercomputing.org/sites/all/themes/SC15images/tech_poster/tech_poster_pages/post150.html).
 > Deakin T, Price J, Martineau M, McIntosh-Smith S. GPU-STREAM: Now in 2D!. 2016. Poster session presented at IEEE/ACM SuperComputing, Salt Lake City, United States.
 You can view the [Poster and Extended Abstract](http://sc16.supercomputing.org/sc-archive/tech_poster/tech_poster_pages/post139.html).
--- a/SYCL.make
+++ b/SYCL.make
@ -0,0 +1,12 @@
 # Device-compiler flags reported by the computecpp_info tool. This is a
 # recursively expanded (=) variable, so the shell command runs on each use.
 COMPUTECPP_FLAGS = $(shell computecpp_info --dump-device-compiler-flags)
 # Host build: compile both sources with -DSYCL, force-include SYCLStream.sycl,
 # and link against the ComputeCpp and OpenCL libraries.
 sycl-stream: main.cpp SYCLStream.cpp SYCLStream.sycl
 	$(CXX) -O3 -std=c++11 -DSYCL main.cpp SYCLStream.cpp -include SYCLStream.sycl $(EXTRA_FLAGS) -lComputeCpp -lOpenCL -o $@
 # Device build: run compute++ over SYCLStream.cpp with the flags above.
 # NOTE(review): no -o is passed, so this relies on compute++ naming its output
 # SYCLStream.sycl by default -- confirm against the ComputeCpp documentation.
 SYCLStream.sycl: SYCLStream.cpp
 	compute++ SYCLStream.cpp $(COMPUTECPP_FLAGS) -c
 .PHONY: clean
 # Remove the executable together with the .sycl and .bc files.
 clean:
 	rm -f sycl-stream SYCLStream.sycl SYCLStream.bc
--- a/cmake/Modules/FindComputeCpp.cmake
+++ b/cmake/Modules/FindComputeCpp.cmake
@ -1,248 +0,0 @@
 #.rst:
 # FindComputeCpp
 #---------------
 #
 #   Copyright 2016 Codeplay Software Ltd.
 #
 #   Licensed under the Apache License, Version 2.0 (the "License");
 #   you may not use these files except in compliance with the License.
 #   You may obtain a copy of the License at
 #
 #       http://www.apache.org/licenses/LICENSE-2.0
 #
 #
 #   Unless required by applicable law or agreed to in writing, software
 #   distributed under the License is distributed on an "AS IS" BASIS,
 #   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 #   See the License for the specific language governing permissions and
 #   limitations under the License.
 #########################
 #  FindComputeCpp.cmake  
 #########################
 #
 #  Tools for finding and building with ComputeCpp.
 #
 #  User must define COMPUTECPP_PACKAGE_ROOT_DIR pointing to the ComputeCpp 
 #   installation.
 #  
 #  Latest version of this file can be found at:
 #    https://github.com/codeplaysoftware/computecpp-sdk
 # Require CMake version 3.2.2 or higher
 cmake_minimum_required(VERSION 3.2.2)
 # Check that a supported host compiler can be found.
 # GCC below 4.8 and Clang below 3.6 abort configuration; any other compiler
 # only produces a warning. For GCC 5.1 and newer, COMPUTECPP_DISABLE_GCC_DUAL_ABI
 # is set so that targets are later built with the pre-5.1 libstdc++ ABI.
 if(CMAKE_COMPILER_IS_GNUCXX)
    # Require at least gcc 4.8
    if (CMAKE_CXX_COMPILER_VERSION VERSION_LESS 4.8)
      message(FATAL_ERROR
        "host compiler - Not found! (gcc version must be at least 4.8)")
    # Require the GCC dual ABI to be disabled for 5.1 or higher.
    # "NOT ... VERSION_LESS" is used because VERSION_GREATER would miss 5.1.0
    # itself (it compares equal to 5.1) and VERSION_GREATER_EQUAL needs a newer
    # CMake than the 3.2.2 minimum required above.
    elseif (NOT CMAKE_CXX_COMPILER_VERSION VERSION_LESS 5.1)
      set(COMPUTECPP_DISABLE_GCC_DUAL_ABI "True")
      message(STATUS
        "host compiler - gcc ${CMAKE_CXX_COMPILER_VERSION} (note pre 5.1 gcc ABI enabled)")
    else()
      message(STATUS "host compiler - gcc ${CMAKE_CXX_COMPILER_VERSION}")
    endif()
 elseif ("${CMAKE_CXX_COMPILER_ID}" STREQUAL "Clang")
    # Require at least clang 3.6. The variable is named rather than expanded so
    # that an empty version string cannot produce a malformed if() expression.
    if (CMAKE_CXX_COMPILER_VERSION VERSION_LESS 3.6)
      message(FATAL_ERROR
        "host compiler - Not found! (clang version must be at least 3.6)")
    else()
      message(STATUS "host compiler - clang ${CMAKE_CXX_COMPILER_VERSION}")
    endif()
 else()
  message(WARNING
    "host compiler - Not found! (ComputeCpp supports GCC and Clang, see readme)")
 endif()
 # Device code is compiled in 64 bit mode unless the user switches it off
 set(COMPUTECPP_64_BIT_DEFAULT ON)
 option(COMPUTECPP_64_BIT_CODE
        "Compile device code in 64 bit mode"
        ${COMPUTECPP_64_BIT_DEFAULT})
 # Keep the switch out of the default (non-advanced) cache view
 mark_as_advanced(COMPUTECPP_64_BIT_CODE)
 # Find OpenCL package
 find_package(OpenCL REQUIRED)
 # Find ComputeCpp package
 # COMPUTECPP_PACKAGE_ROOT_DIR holds a filesystem path, so it is declared as a
 # PATH cache entry (option() only models booleans). The declaration is skipped
 # when the caller already defined the variable, and it comes before the check
 # so that the entry is documented even when configuration stops below.
 if(NOT DEFINED COMPUTECPP_PACKAGE_ROOT_DIR)
  set(COMPUTECPP_PACKAGE_ROOT_DIR "" CACHE PATH "Path to the ComputeCpp Package")
 endif()
 if(NOT COMPUTECPP_PACKAGE_ROOT_DIR)
  message(FATAL_ERROR
    "ComputeCpp package - Not found! (please set COMPUTECPP_PACKAGE_ROOT_DIR)")
 else()
  message(STATUS "ComputeCpp package - Found")
 endif()
 # Locate the compute++ device compiler; the ComputeCpp package root (and its
 # bin/ subdirectory) is added to the locations searched.
 find_program(COMPUTECPP_DEVICE_COMPILER
  NAMES compute++
  PATHS ${COMPUTECPP_PACKAGE_ROOT_DIR}
  PATH_SUFFIXES bin)
 # Stop configuring when the resolved path does not exist on disk
 if (NOT EXISTS ${COMPUTECPP_DEVICE_COMPILER})
  message(FATAL_ERROR "compute++ - Not found! (${COMPUTECPP_DEVICE_COMPILER})")
 endif()
 mark_as_advanced(COMPUTECPP_DEVICE_COMPILER)
 message(STATUS "compute++ - Found")
 # Obtain the path to computecpp_info; the ComputeCpp package root (and its
 # bin/ subdirectory) is added to the locations searched.
 find_program(COMPUTECPP_INFO_TOOL computecpp_info PATHS
  ${COMPUTECPP_PACKAGE_ROOT_DIR} PATH_SUFFIXES bin)
 if (EXISTS ${COMPUTECPP_INFO_TOOL})
  # mark_as_advanced() takes the cache variable's name, not its value
  mark_as_advanced(COMPUTECPP_INFO_TOOL)
  message(STATUS "computecpp_info - Found")
 else()
  message(FATAL_ERROR "computecpp_info - Not found! (${COMPUTECPP_INFO_TOOL})")
 endif()
 # Locate the ComputeCpp runtime library. NO_DEFAULT_PATH restricts the search
 # to the package root and its lib/ directory.
 find_library(COMPUTECPP_RUNTIME_LIBRARY
  NAMES ComputeCpp
  HINTS ${COMPUTECPP_PACKAGE_ROOT_DIR}/lib
  PATHS ${COMPUTECPP_PACKAGE_ROOT_DIR}
  PATH_SUFFIXES lib
  DOC "ComputeCpp Runtime Library"
  NO_DEFAULT_PATH)
 # Stop configuring when the resolved library does not exist on disk
 if (NOT EXISTS ${COMPUTECPP_RUNTIME_LIBRARY})
  message(FATAL_ERROR "libComputeCpp.so - Not found!")
 endif()
 mark_as_advanced(COMPUTECPP_RUNTIME_LIBRARY)
 message(STATUS "libComputeCpp.so - Found")
 # The ComputeCpp headers are expected in include/ below the package root
 set(COMPUTECPP_INCLUDE_DIRECTORY ${COMPUTECPP_PACKAGE_ROOT_DIR}/include/)
 if (EXISTS ${COMPUTECPP_INCLUDE_DIRECTORY})
  message(STATUS "ComputeCpp includes - Found")
 else()
  message(FATAL_ERROR "ComputeCpp includes - Not found!")
 endif()
 # Ask computecpp_info for the version of the installed package
 execute_process(
  COMMAND ${COMPUTECPP_INFO_TOOL} "--dump-version"
  RESULT_VARIABLE COMPUTECPP_INFO_TOOL_RESULT
  OUTPUT_VARIABLE COMPUTECPP_PACKAGE_VERSION
  OUTPUT_STRIP_TRAILING_WHITESPACE)
 # A zero exit status means the tool ran successfully
 if(COMPUTECPP_INFO_TOOL_RESULT EQUAL "0")
  mark_as_advanced(COMPUTECPP_PACKAGE_VERSION)
  message(STATUS "Package version - ${COMPUTECPP_PACKAGE_VERSION}")
 else()
  message(FATAL_ERROR "Package version - Error obtaining version!")
 endif()
 # Obtain the device compiler flags from computecpp_info. They are stored as a
 # single string in COMPUTECPP_DEVICE_COMPILER_FLAGS.
 execute_process(COMMAND ${COMPUTECPP_INFO_TOOL} "--dump-device-compiler-flags"
  OUTPUT_VARIABLE COMPUTECPP_DEVICE_COMPILER_FLAGS
  RESULT_VARIABLE COMPUTECPP_INFO_TOOL_RESULT OUTPUT_STRIP_TRAILING_WHITESPACE)
 if(NOT COMPUTECPP_INFO_TOOL_RESULT EQUAL "0")
  message(FATAL_ERROR "compute++ flags - Error obtaining compute++ flags!")
 else()
  # Refer to the variable actually written above (the name was previously
  # misspelled as COMPUTECPP_COMPILER_FLAGS, which nothing else uses).
  mark_as_advanced(COMPUTECPP_DEVICE_COMPILER_FLAGS)
  message(STATUS "compute++ flags - ${COMPUTECPP_DEVICE_COMPILER_FLAGS}")
 endif()
 # Ask computecpp_info whether this platform is supported. An unsupported
 # platform is only reported; it does not stop configuration.
 execute_process(
  COMMAND ${COMPUTECPP_INFO_TOOL} "--dump-is-supported"
  RESULT_VARIABLE COMPUTECPP_INFO_TOOL_RESULT
  OUTPUT_VARIABLE COMPUTECPP_PLATFORM_IS_SUPPORTED
  OUTPUT_STRIP_TRAILING_WHITESPACE)
 if(COMPUTECPP_INFO_TOOL_RESULT EQUAL "0")
  mark_as_advanced(COMPUTECPP_PLATFORM_IS_SUPPORTED)
  if (NOT COMPUTECPP_PLATFORM_IS_SUPPORTED)
    message(STATUS "platform - your system CANNOT support ComputeCpp")
  else()
    message(STATUS "platform - your system can support ComputeCpp")
  endif()
 else()
  message(FATAL_ERROR "platform - Error checking platform support!")
 endif()
 ####################
 #   __build_spir
 ####################
 #
 #  Adds a custom command that runs compute++ on a source file to produce the
 #  integration header <binaryDir>/<source file name>.sycl, wraps it in a
 #  custom target named <targetName>_integration_header, and makes the
 #  existing target depend on and force-include that header.
 #
 #  targetName : Name of an existing target to attach the header to.
 #  sourceFile : Source file to be compiled.
 #  binaryDir : Intermediate directory to output the integration header.
 #
 #  Side effects on targetName: PUBLIC compile options "-include <header>" and
 #  "-Wall", CXX_STANDARD 11, and (when COMPUTECPP_DISABLE_GCC_DUAL_ABI is set)
 #  the definition _GLIBCXX_USE_CXX11_ABI=0.
 #
 function(__build_spir targetName sourceFile binaryDir)
  # Retrieve source file name (directory part stripped).
  get_filename_component(sourceFileName ${sourceFile} NAME)
  # Set the path to the Sycl file.
  set(outputSyclFile ${binaryDir}/${sourceFileName}.sycl)
  # Add any user-defined include to the device compiler: every entry of the
  # current source directory's INCLUDE_DIRECTORIES property becomes a -I flag.
  get_property(includeDirectories DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR} PROPERTY
    INCLUDE_DIRECTORIES)
  set(device_compiler_includes "")
  # Each flag is prepended, so the final order is the reverse of the property.
  foreach(directory ${includeDirectories})
    set(device_compiler_includes "-I${directory}" ${device_compiler_includes})
  endforeach()
  # Directories from CMAKE_INCLUDE_PATH are prepended the same way and thus
  # end up ahead of the directory-property includes.
  if (CMAKE_INCLUDE_PATH)
    foreach(directory ${CMAKE_INCLUDE_PATH})
      set(device_compiler_includes "-I${directory}"
        ${device_compiler_includes})
    endforeach()
  endif()
  # Convert argument list format: split the space-separated flag string into a
  # CMake list so each flag is passed as a separate argument. Only this
  # function's copy of the variable is modified.
  separate_arguments(COMPUTECPP_DEVICE_COMPILER_FLAGS)
  # Add custom command for running compute++
  # NOTE(review): COMPUTECPP_PLATFORM_SPECIFIC_ARGS is not set anywhere in the
  # visible part of this module; it presumably expands to nothing unless the
  # including project defines it -- confirm.
  add_custom_command(
    OUTPUT ${outputSyclFile}
    COMMAND ${COMPUTECPP_DEVICE_COMPILER}
            ${COMPUTECPP_DEVICE_COMPILER_FLAGS}
            -isystem ${COMPUTECPP_INCLUDE_DIRECTORY}
            ${COMPUTECPP_PLATFORM_SPECIFIC_ARGS}
            ${device_compiler_includes}
            -o ${outputSyclFile}
            -c ${sourceFile}
    DEPENDS ${sourceFile}
    WORKING_DIRECTORY ${binaryDir}
  COMMENT "Building ComputeCpp integration header file ${outputSyclFile}")
  # Add a custom target for the generated integration header
  add_custom_target(${targetName}_integration_header DEPENDS ${outputSyclFile})
  # Add a dependency on the integration header so it is built before the target
  add_dependencies(${targetName} ${targetName}_integration_header)
  # Force inclusion of the integration header for the host compiler. PUBLIC
  # means the options also apply to targets that link against this one.
  set(compileFlags -include ${outputSyclFile} "-Wall")
  target_compile_options(${targetName} PUBLIC ${compileFlags})
  # Set the host compiler C++ standard to C++11
  set_property(TARGET ${targetName} PROPERTY CXX_STANDARD 11)
  # Disable GCC dual ABI on GCC 5.1 and higher (flag set by the host compiler
  # check at the top of this module)
  if(COMPUTECPP_DISABLE_GCC_DUAL_ABI)
    set_property(TARGET ${targetName} APPEND PROPERTY COMPILE_DEFINITIONS
      "_GLIBCXX_USE_CXX11_ABI=0")
  endif()
 endfunction()
 #######################
 #  add_sycl_to_target
 #######################
 #
 #  Attaches SYCL device compilation to an existing target: registers the
 #  compute++ step for the given source file, makes the target depend on its
 #  output, and links the target against the ComputeCpp runtime and OpenCL.
 #
 #  targetName : Name of the target to add SYCL support to.
 #  sourceFile : Source file to be compiled for SYCL.
 #  binaryDir : Intermediate directory to output the integration header.
 #
 function(add_sycl_to_target targetName sourceFile binaryDir)
  # Register the compute++ step that generates the integration header
  __build_spir(${targetName} ${sourceFile} ${binaryDir})
  # Both libraries are linked PUBLIC, so they propagate to the target's consumers
  target_link_libraries(${targetName}
    PUBLIC ${COMPUTECPP_RUNTIME_LIBRARY} ${OpenCL_LIBRARIES})
 endfunction()
--- a/main.cpp
+++ b/main.cpp
@ -15,7 +15,7 @@
 #include <iomanip>
 #include <cstring>
-#define VERSION_STRING "devel"
+#define VERSION_STRING "3.2"
 #include "Stream.h"
@ -54,7 +54,7 @@ void parseArguments(int argc, char *argv[]);
 int main(int argc, char *argv[])
 {
  std::cout
-    << "GPU-STREAM" << std::endl
+    << "BabelStream" << std::endl
    << "Version: " << VERSION_STRING << std::endl
    << "Implementation: " << IMPLEMENTATION_STRING << std::endl;