Merge branch 'master' into cuda-memory

2017-05-11 11:28:33 +01:00 · 2017-05-11 11:28:33 +01:00 · 3bd65a0716
commit 3bd65a0716
parent 8d66a27131 dafc63030f
23 changed files with 485 additions and 562 deletions
--- a/.gitignore
+++ b/.gitignore
@ -1,18 +1,19 @@

-gpu-stream-cuda
-gpu-stream-ocl
-gpu-stream-acc
-gpu-stream-omp3
-gpu-stream-omp45
-gpu-stream-sycl
+cuda-stream
+ocl-stream
+omp-stream
+acc-stream
+raja-stream
+kokkos-stream
+sycl-stream
+hip-stream

 *.o
+*.bc
+*.sycl
 *.tar
 *.gz

 .DS_Store

-CMakeCache.txt
-CMakeFiles/
-cmake_install.cmake
 Makefile
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@ -1,198 +0,0 @@
-
-cmake_minimum_required(VERSION 3.2)
-
-if(NOT DEFINED HIP_PATH)
-    if(NOT DEFINED ENV{HIP_PATH})
-        set(HIP_PATH "/opt/rocm/hip" CACHE PATH "Path to which HIP has been installed")
-    else()
-        set(HIP_PATH $ENV{HIP_PATH} CACHE PATH "Path to which HIP has been installed")
-    endif()
-endif()
-set(CMAKE_MODULE_PATH "${HIP_PATH}/cmake" ${CMAKE_MODULE_PATH})
-
-
-set(CMAKE_CXX_STANDARD 11)
-set(CMAKE_CXX_STANDARD_REQUIRED ON)
-
-project(gpu-stream)
-
-include(CheckIncludeFileCXX)
-include(CheckCXXCompilerFlag)
-
-set(gpu-stream_VERSION_MAJOR 2)
-set(gpu-stream_VERSION_MINOR 2)
-
-configure_file(common.h.in common.h)
-include_directories(${CMAKE_BINARY_DIR})
-
-# Use 'Release' if no build type specified
-if (NOT CMAKE_BUILD_TYPE)
-   message("No CMAKE_BUILD_TYPE specified, defaulting to 'Release'")
-   set(CMAKE_BUILD_TYPE "Release")
-endif()
-
-# If using the Cray compiler, manually add the C++11 flag because setting the
-# standard through CMake as above doesn't set this flag with Cray
-if ("${CMAKE_CXX_COMPILER_ID}" MATCHES "Cray")
-  list(APPEND CMAKE_CXX_FLAGS -hstd=c++11)
-endif ()
-
-#-------------------------------------------------------------------------------
-# HIP
-#-------------------------------------------------------------------------------
-find_package(HIP QUIET)
-if(${HIP_FOUND})
-  list(APPEND HIP_HIPCC_FLAGS --std=c++11)
-  hip_add_executable(gpu-stream-hip main.cpp HIPStream.cu)
-  target_compile_definitions(gpu-stream-hip PUBLIC HIP)
-else()
-  message("Skipping HIP...")
-endif()
-
-#-------------------------------------------------------------------------------
-# CUDA
-#-------------------------------------------------------------------------------
-find_package(CUDA 7.0 QUIET)
-set(FLAG True)
-if ("${CMAKE_SYSTEM_NAME}" MATCHES "Darwin")
-  execute_process(COMMAND xcodebuild -version COMMAND head -n 1 OUTPUT_VARIABLE XCODE_VERSION)
-  if ("${XCODE_VERSION}" MATCHES "Xcode 7.3.1")
-    message("Xcode version not supported by CUDA")
-    set(FLAG False)
-  endif ()
-endif ()
-if (${FLAG} AND ${CUDA_FOUND})
-  list(APPEND CUDA_NVCC_FLAGS --std=c++11)
-  cuda_add_executable(gpu-stream-cuda main.cpp CUDAStream.cu)
-  target_compile_definitions(gpu-stream-cuda PUBLIC CUDA)
-else ()
-  message("Skipping CUDA...")
-endif ()
-
-#-------------------------------------------------------------------------------
-# OpenCL
-#-------------------------------------------------------------------------------
-find_package(OpenCL QUIET)
-if (${OpenCL_FOUND})
-  add_executable(gpu-stream-ocl main.cpp OCLStream.cpp)
-  target_compile_definitions(gpu-stream-ocl PUBLIC OCL)
-  target_link_libraries(gpu-stream-ocl ${OpenCL_LIBRARY})
-else ()
-  message("Skipping OpenCL...")
-endif ()
-
-#-------------------------------------------------------------------------------
-# OpenACC
-#-------------------------------------------------------------------------------
-# Check compiler supports an OpenACC flag
-include(CheckCXXCompilerFlag)
-if ("${CMAKE_CXX_COMPILER_ID}" MATCHES "GNU")
-  CHECK_CXX_COMPILER_FLAG(-fopenacc OPENACC)
-  if (OPENACC)
-    list (APPEND CMAKE_EXE_LINKER_FLAGS -fopenacc)
-  endif ()
-elseif ("${CMAKE_CXX_COMPILER_ID}" MATCHES "PGI")
-  CHECK_CXX_COMPILER_FLAG(-acc OPENACC)
-elseif ("${CMAKE_CXX_COMPILER_ID}" MATCHES "Cray")
-  CHECK_CXX_COMPILER_FLAG(-hacc=openacc OPENACC)
-endif ()
-
-if (OPENACC)
-  add_executable(gpu-stream-acc main.cpp ACCStream.cpp)
-  target_compile_definitions(gpu-stream-acc PUBLIC ACC)
-else ()
-  message("Skipping OpenACC...")
-endif ()
-
-#-------------------------------------------------------------------------------
-# OpenMP 3.0
-#-------------------------------------------------------------------------------
-find_package(OpenMP QUIET)
-if (${OpenMP_FOUND})
-  add_executable(gpu-stream-omp3 main.cpp OMP3Stream.cpp)
-  target_compile_definitions(gpu-stream-omp3 PUBLIC OMP3)
-else ()
-  message("Skipping OpenMP 3...")
-endif ()
-
-#-------------------------------------------------------------------------------
-# OpenMP 4.5
-#-------------------------------------------------------------------------------
-if ("${CMAKE_CXX_COMPILER_ID}" MATCHES "Cray")
-  if (CMAKE_CXX_COMPILER_VERSION VERSION_GREATER 4.5)
-    add_executable(gpu-stream-omp45 main.cpp OMP45Stream.cpp)
-    target_compile_definitions(gpu-stream-omp45 PUBLIC OMP45)
-  endif ()
-endif ()
-
-#-------------------------------------------------------------------------------
-# RAJA
-#-------------------------------------------------------------------------------
-
-if (RAJA_PATH)
-  find_package(OpenMP)
-  find_package(CUDA 7.5)
-  list(APPEND CUDA_NVCC_FLAGS "-arch compute_35")
-  list(APPEND CUDA_NVCC_FLAGS --expt-extended-lambda)
-  list(APPEND CUDA_NVCC_FLAGS -Xcompiler ${OpenMP_CXX_FLAGS})
-  list(APPEND CUDA_NVCC_FLAGS -DUSE_RAJA)
-  cuda_include_directories(${RAJA_PATH}/include)
-  set_source_files_properties(RAJAStream.cpp PROPERTIES CUDA_SOURCE_PROPERTY_FORMAT OBJ)
-  cuda_add_executable(gpu-stream-raja main.cpp RAJAStream.cpp)
-  target_compile_definitions(gpu-stream-raja PUBLIC USE_RAJA)
-  target_link_libraries(gpu-stream-raja "-L${RAJA_PATH}/lib -lRAJA")
-else()
-  message("Skipping RAJA... (use -DRAJA_PATH=/path/to/raja to opt in)")
-endif()
-
-#-------------------------------------------------------------------------------
-# Kokkos
-#-------------------------------------------------------------------------------
-if (KOKKOS_PATH)
-  if ("${CMAKE_SYSTEM_NAME}" MATCHES "Linux")
-    add_custom_target(gpu-stream-kokkos COMMAND make -f KokkosMakefile KOKKOS_PATH=${KOKKOS_PATH})
-  else()
-    message("Skipping Kokkos (requires Linux)")
-  endif()
-else()
-  message("Skipping Kokkos... (use -DKOKKOS_PATH=/path/to/kokkos to opt in)")
-endif()
-
-#-------------------------------------------------------------------------------
-# SYCL
-#-------------------------------------------------------------------------------
-if ("${CMAKE_CXX_COMPILER_ID}" MATCHES "Clang" OR
-    "${CMAKE_CXX_COMPILER_ID}" MATCHES "GNU")
-  # Use C++14 if available, otherwise drop back to C++11
-  check_cxx_compiler_flag("-std=c++14" CXX14)
-  if (CXX14)
-    set(CMAKE_REQUIRED_FLAGS "-std=c++14")
-  else()
-    set(CMAKE_REQUIRED_FLAGS "-std=c++11")
-  endif()
-endif()
-
-check_include_file_cxx("CL/sycl.hpp" HAS_SYCL)
-if (HAS_SYCL)
-  add_executable(gpu-stream-sycl main.cpp SYCLStream.cpp)
-  target_compile_definitions(gpu-stream-sycl PUBLIC SYCL)
-
-  # The user must define this in order to use FindComputeCpp
-  if (COMPUTECPP_PACKAGE_ROOT_DIR)
-    message(STATUS "Using ComputeCpp for SYCL compilation")
-    include(FindComputeCpp)
-
-    include_directories(${COMPUTECPP_INCLUDE_DIRECTORY})
-
-    set(SOURCE_NAME "SYCLStream")
-
-    target_compile_options(gpu-stream-sycl PUBLIC ${HOST_COMPILER_OPTIONS})
-    add_sycl_to_target(gpu-stream-sycl ${CMAKE_CURRENT_SOURCE_DIR}/${SOURCE_NAME}.cpp 
-                      ${CMAKE_CURRENT_BINARY_DIR})
-  else()
-    message(STATUS "Using header-only SYCL implementation")
-    set_property(TARGET gpu-stream-sycl PROPERTY CXX_STANDARD 14)
-  endif()
-else ()
-  message("Skipping SYCL...")
-endif (HAS_SYCL)
--- a/CUDA.make
+++ b/CUDA.make
@ -0,0 +1,10 @@
+CXXFLAGS=-O3
+CUDA_CXX=nvcc
+
+cuda-stream: main.cpp CUDAStream.cu
+	$(CUDA_CXX) -std=c++11 $(CXXFLAGS) -DCUDA $^ $(EXTRA_FLAGS) -o $@
+
+.PHONY: clean
+clean:
+	rm -f cuda-stream
+
--- a/CUDAStream.cu
+++ b/CUDAStream.cu
@ -215,9 +215,7 @@ void CUDAStream<T>::triad()
 template <class T>
 __global__ void dot_kernel(const T * a, const T * b, T * sum, unsigned int array_size)
 {
-
-  extern __shared__ __align__(sizeof(T)) unsigned char smem[];
-  T *tb_sum = reinterpret_cast<T*>(smem);
+  __shared__ T tb_sum[TBSIZE];

  int i = blockDim.x * blockIdx.x + threadIdx.x;
  const size_t local_i = threadIdx.x;
@ -242,7 +240,7 @@ __global__ void dot_kernel(const T * a, const T * b, T * sum, unsigned int array
 template <class T>
 T CUDAStream<T>::dot()
 {
-  dot_kernel<<<DOT_NUM_BLOCKS, TBSIZE, sizeof(T)*TBSIZE>>>(d_a, d_b, d_sum, array_size);
+  dot_kernel<<<DOT_NUM_BLOCKS, TBSIZE>>>(d_a, d_b, d_sum, array_size);
  check_error();

 #if defined(MANAGED) || defined(PAGEFAULT)
--- a/HIP.make
+++ b/HIP.make
@ -0,0 +1,12 @@
+
+# TODO: HIP with HCC
+
+HIPCC = hipcc
+
+hip-stream: main.cpp HIPStream.cpp
+	$(HIPCC) $(CXXFLAGS) -std=c++11 -DHIP $^ $(EXTRA_FLAGS) -o $@
+
+.PHONY: clean
+clean:
+	rm -f hip-stream
+
--- a/HIPStream.cpp
+++ b/HIPStream.cpp
@ -9,6 +9,7 @@
 #include "hip/hip_runtime.h"

 #define TBSIZE 1024
+#define DOT_NUM_BLOCKS 256

 void check_error(void)
 {
@ -47,6 +48,9 @@ HIPStream<T>::HIPStream(const unsigned int ARRAY_SIZE, const int device_index)

  array_size = ARRAY_SIZE;

+  // Allocate the host array for partial sums for dot kernels
+  sums = (T*)malloc(sizeof(T) * DOT_NUM_BLOCKS);
+
  // Check buffers fit on the device
  hipDeviceProp_t props;
  hipGetDeviceProperties(&props, 0);
@ -60,20 +64,27 @@ HIPStream<T>::HIPStream(const unsigned int ARRAY_SIZE, const int device_index)
  check_error();
  hipMalloc(&d_c, ARRAY_SIZE*sizeof(T));
  check_error();
+  hipMalloc(&d_sum, DOT_NUM_BLOCKS*sizeof(T));
+  check_error();
 }


 template <class T>
 HIPStream<T>::~HIPStream()
 {
+  free(sums);
+
  hipFree(d_a);
  check_error();
  hipFree(d_b);
  check_error();
  hipFree(d_c);
  check_error();
+  hipFree(d_sum);
+  check_error();
 }

+
 template <typename T>
 __global__ void init_kernel(hipLaunchParm lp, T * a, T * b, T * c, T initA, T initB, T initC)
 {
@ -171,6 +182,46 @@ void HIPStream<T>::triad()
  check_error();
 }

+template <class T>
+__global__ void dot_kernel(hipLaunchParm lp, const T * a, const T * b, T * sum, unsigned int array_size)
+{
+  __shared__ T tb_sum[TBSIZE];
+
+  int i = hipBlockDim_x * hipBlockIdx_x + hipThreadIdx_x;
+  const size_t local_i = hipThreadIdx_x;
+
+  tb_sum[local_i] = 0.0;
+  for (; i < array_size; i += hipBlockDim_x*hipGridDim_x)
+    tb_sum[local_i] += a[i] * b[i];
+
+  for (int offset = hipBlockDim_x / 2; offset > 0; offset /= 2)
+  {
+    __syncthreads();
+    if (local_i < offset)
+    {
+      tb_sum[local_i] += tb_sum[local_i+offset];
+    }
+  }
+
+  if (local_i == 0)
+    sum[hipBlockIdx_x] = tb_sum[local_i];
+}
+
+template <class T>
+T HIPStream<T>::dot()
+{
+  hipLaunchKernel(HIP_KERNEL_NAME(dot_kernel), dim3(DOT_NUM_BLOCKS), dim3(TBSIZE), 0, 0, d_a, d_b, d_sum, array_size);
+  check_error();
+
+  hipMemcpy(sums, d_sum, DOT_NUM_BLOCKS*sizeof(T), hipMemcpyDeviceToHost);
+  check_error();
+
+  T sum = 0.0;
+  for (int i = 0; i < DOT_NUM_BLOCKS; i++)
+    sum += sums[i];
+
+  return sum;
+}

 void listDevices(void)
 {
--- a/HIPStream.h
+++ b/HIPStream.h
@ -21,10 +21,15 @@ class HIPStream : public Stream<T>
  protected:
    // Size of arrays
    unsigned int array_size;
+
+    // Host array for partial sums for dot kernel
+    T *sums;
+
    // Device side pointers to arrays
    T *d_a;
    T *d_b;
    T *d_c;
+    T *d_sum;


  public:
@ -36,6 +41,7 @@ class HIPStream : public Stream<T>
    virtual void add() override;
    virtual void mul() override;
    virtual void triad() override;
+    virtual T dot() override;

    virtual void init_arrays(T initA, T initB, T initC) override;
    virtual void read_arrays(std::vector<T>& a, std::vector<T>& b, std::vector<T>& c) override;
--- a/KOKKOSStream.cpp
+++ b/KOKKOSStream.cpp
@ -39,10 +39,10 @@ void KOKKOSStream<T>::init_arrays(T initA, T initB, T initC)
  View<double*, DEVICE> a(*d_a);
  View<double*, DEVICE> b(*d_b);
  View<double*, DEVICE> c(*d_c);
-  parallel_for(array_size, KOKKOS_LAMBDA (const int index)
+  parallel_for(array_size, KOKKOS_LAMBDA (const long index)
  {
    a[index] = initA;
-    b[index] - initB;
+    b[index] = initB;
    c[index] = initC;
  });
  Kokkos::fence();
@ -70,7 +70,7 @@ void KOKKOSStream<T>::copy()
  View<double*, DEVICE> b(*d_b);
  View<double*, DEVICE> c(*d_c);

-  parallel_for(array_size, KOKKOS_LAMBDA (const int index)
+  parallel_for(array_size, KOKKOS_LAMBDA (const long index)
  {
    c[index] = a[index];
  });
@ -85,7 +85,7 @@ void KOKKOSStream<T>::mul()
  View<double*, DEVICE> c(*d_c);

  const T scalar = startScalar;
-  parallel_for(array_size, KOKKOS_LAMBDA (const int index)
+  parallel_for(array_size, KOKKOS_LAMBDA (const long index)
  {
    b[index] = scalar*c[index];
  });
@ -99,7 +99,7 @@ void KOKKOSStream<T>::add()
  View<double*, DEVICE> b(*d_b);
  View<double*, DEVICE> c(*d_c);

-  parallel_for(array_size, KOKKOS_LAMBDA (const int index)
+  parallel_for(array_size, KOKKOS_LAMBDA (const long index)
  {
    c[index] = a[index] + b[index];
  });
@ -114,7 +114,7 @@ void KOKKOSStream<T>::triad()
  View<double*, DEVICE> c(*d_c);

  const T scalar = startScalar;
-  parallel_for(array_size, KOKKOS_LAMBDA (const int index)
+  parallel_for(array_size, KOKKOS_LAMBDA (const long index)
  {
    a[index] = b[index] + scalar*c[index];
  });
@ -129,7 +129,7 @@ T KOKKOSStream<T>::dot()

  T sum = 0.0;

-  parallel_reduce(array_size, KOKKOS_LAMBDA (const int index, double &tmp)
+  parallel_reduce(array_size, KOKKOS_LAMBDA (const long index, double &tmp)
  {
    tmp += a[index] * b[index];
  }, sum);
--- a/Kokkos.make
+++ b/Kokkos.make
@ -0,0 +1,45 @@
+
+default: kokkos-stream
+
+include $(KOKKOS_PATH)/Makefile.kokkos
+
+ifndef COMPILER
+define compiler_help
+Set COMPILER to change flags (defaulting to GNU).
+Available compilers are:
+  GNU INTEL
+
+endef
+$(info $(compiler_help))
+COMPILER=GNU
+endif
+
+COMPILER_GNU = g++
+COMPILER_INTEL = icpc -qopt-streaming-stores=always
+CXX = $(COMPILER_$(COMPILER))
+
+ifndef TARGET
+define target_help
+Set TARGET to change to offload device. Defaulting to CPU.
+Available targets are:
+  CPU (default)
+  GPU
+endef
+$(info $(target_help))
+TARGET=CPU
+endif
+
+ifeq ($(TARGET), CPU)
+TARGET_DEF = -DKOKKOS_TARGET_CPU
+else ifeq ($(TARGET), GPU)
+CXX = $(NVCC_WRAPPER)
+TARGET_DEF =
+endif
+
+kokkos-stream: main.cpp KOKKOSStream.cpp $(KOKKOS_CPP_DEPENDS)
+	$(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(KOKKOS_LDFLAGS) main.cpp KOKKOSStream.cpp $(KOKKOS_LIBS) -o $@ -DKOKKOS $(TARGET_DEF) -O3 $(EXTRA_FLAGS)
+
+.PHONY: clean
+clean:
+	rm -f kokkos-stream
+
--- a/10
+++ b/10
@ -1,10 +0,0 @@
-
-default: gpu-stream-kokkos
-
-include $(KOKKOS_PATH)/Makefile.kokkos
-
-gpu-stream-kokkos: main.o KOKKOSStream.o
-	$(CXX) $(KOKKOS_LDFLAGS) $^ $(KOKKOS_LIBS) -o $@ -DKOKKOS -DKOKKOS_TARGET_CPU -O3
-
-%.o:%.cpp $(KOKKOS_CPP_DEPENDS)
-	$(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) -c $< -DKOKKOS -DKOKKOS_TARGET_CPU -O3
--- a/11
+++ b/11
@ -1,11 +0,0 @@
-
-default: gpu-stream-kokkos
-
-include $(KOKKOS_PATH)/Makefile.kokkos
-
-gpu-stream-kokkos: main.o KOKKOSStream.o
-	$(CXX) $(KOKKOS_LDFLAGS) $^ $(KOKKOS_LIBS) -o $@ -DKOKKOS -O3
-
-%.o:%.cpp $(KOKKOS_CPP_DEPENDS)
-	$(NVCC_WRAPPER) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) -c $< -DKOKKOS -O3
-
--- a/14
+++ b/14
@ -12,22 +12,22 @@
 *  3. You are free to publish results obtained from running this
 *     program, or from works that you derive from this program,
 *     with the following limitations:
-*     3a. In order to be referred to as "GPU-STREAM benchmark results",
-*         published results must be in conformance to the GPU-STREAM
+*     3a. In order to be referred to as "BabelStream benchmark results",
+*         published results must be in conformance to the BabelStream
 *         Run Rules published at
-*         http://github.com/UoB-HPC/GPU-STREAM/wiki/Run-Rules
+*         http://github.com/UoB-HPC/BabelStream/wiki/Run-Rules
 *         and incorporated herein by reference.
 *         The copyright holders retain the
 *         right to determine conformity with the Run Rules.
 *     3b. Results based on modified source code or on runs not in
-*         accordance with the GPU-STREAM Run Rules must be clearly
+*         accordance with the BabelStream Run Rules must be clearly
 *         labelled whenever they are published.  Examples of
 *         proper labelling include:
-*         "tuned GPU-STREAM benchmark results"
-*         "based on a variant of the GPU-STREAM benchmark code"
+*         "tuned BabelStream benchmark results"
+*         "based on a variant of the BabelStream benchmark code"
 *         Other comparable, clear and reasonable labelling is
 *         acceptable.
-*     3c. Submission of results to the GPU-STREAM benchmark web site
+*     3c. Submission of results to the BabelStream benchmark web site
 *         is encouraged, but not required.
 *  4. Use of this program or creation of derived works based on this
 *     program constitutes acceptance of these licensing restrictions.
--- a/OMPStream.cpp
+++ b/OMPStream.cpp
@ -7,6 +7,10 @@

 #include "OMPStream.h"

+#ifndef ALIGNMENT
+#define ALIGNMENT (2*1024*1024) // 2MB
+#endif
+
 template <class T>
 OMPStream<T>::OMPStream(const unsigned int ARRAY_SIZE, T *a, T *b, T *c, int device)
 {
@ -22,9 +26,9 @@ OMPStream<T>::OMPStream(const unsigned int ARRAY_SIZE, T *a, T *b, T *c, int dev
  {}
 #else
  // Allocate on the host
-  this->a = (T*)malloc(sizeof(T)*array_size);
-  this->b = (T*)malloc(sizeof(T)*array_size);
-  this->c = (T*)malloc(sizeof(T)*array_size);
+  this->a = (T*)aligned_alloc(ALIGNMENT, sizeof(T)*array_size);
+  this->b = (T*)aligned_alloc(ALIGNMENT, sizeof(T)*array_size);
+  this->c = (T*)aligned_alloc(ALIGNMENT, sizeof(T)*array_size);
 #endif
 }

--- a/OpenACC.make
+++ b/OpenACC.make
@ -0,0 +1,52 @@
+
+ifndef COMPILER
+define compiler_help
+Set COMPILER to ensure correct flags are set.
+Available compilers are:
+  PGI CRAY
+endef
+$(info $(compiler_help))
+endif
+
+COMPILER_ = $(CXX)
+COMPILER_PGI = pgc++
+COMPILER_CRAY = CC
+
+FLAGS_ = -O3 -std=c++11
+
+FLAGS_PGI = -std=c++11 -O3 -acc
+ifeq ($(COMPILER), PGI)
+define target_help
+Set a TARGET to ensure PGI targets the correct offload device.
+Available targets are:
+  SNB, IVB, HSW
+  KEPLER, MAXWELL, PASCAL
+  HAWAII
+endef
+ifndef TARGET
+$(error $(target_help))
+endif
+TARGET_FLAGS_SNB     = -ta=multicore -tp=sandybridge
+TARGET_FLAGS_IVB     = -ta=multicore -tp=ivybridge
+TARGET_FLAGS_HSW     = -ta=multicore -tp=haswell
+TARGET_FLAGS_KEPLER  = -ta=nvidia:cc35
+TARGET_FLAGS_MAXWELL = -ta=nvidia:cc50
+TARGET_FLAGS_PASCAL  = -ta=nvidia:cc60
+TARGET_FLAGS_HAWAII  = -ta=radeon:hawaii
+ifeq ($(TARGET_FLAGS_$(TARGET)),)
+$(error $(target_help))
+endif
+
+FLAGS_PGI += $(TARGET_FLAGS_$(TARGET))
+
+endif
+
+FLAGS_CRAY = -hstd=c++11
+CXXFLAGS = $(FLAGS_$(COMPILER))
+
+acc-stream: main.cpp ACCStream.cpp
+	$(COMPILER_$(COMPILER)) $(CXXFLAGS) -DACC $^ $(EXTRA_FLAGS) -o $@
+
+.PHONY: clean
+clean:
+	rm -f acc-stream main.o ACCStream.o
--- a/OpenCL.make
+++ b/OpenCL.make
@ -0,0 +1,39 @@
+
+ifndef COMPILER
+define compiler_help
+Set COMPILER to change flags (defaulting to GNU).
+Available compilers are:
+  GNU CLANG INTEL CRAY
+
+endef
+$(info $(compiler_help))
+COMPILER=GNU
+endif
+
+COMPILER_GNU = g++
+COMPILER_CLANG = clang++
+COMPILER_INTEL = icpc
+COMPILER_CRAY = CC
+CXX = $(COMPILER_$(COMPILER))
+
+FLAGS_ = -O3 -std=c++11
+FLAGS_GNU = -O3 -std=c++11
+FLAGS_CLANG = -O3 -std=c++11
+FLAGS_INTEL = -O3 -std=c++11
+FLAGS_CRAY = -O3 -hstd=c++11
+CXXFLAGS=$(FLAGS_$(COMPILER))
+
+PLATFORM = $(shell uname -s)
+ifeq ($(PLATFORM), Darwin)
+  LIBS = -framework OpenCL
+else
+  LIBS = -lOpenCL
+endif
+
+ocl-stream: main.cpp OCLStream.cpp
+	$(CXX) $(CXXFLAGS) -DOCL $^ $(EXTRA_FLAGS) $(LIBS) -o $@
+
+.PHONY: clean
+clean:
+	rm -f ocl-stream
+
--- a/OpenMP.make
+++ b/OpenMP.make
@ -0,0 +1,60 @@
+
+ifndef COMPILER
+define compiler_help
+Set COMPILER to change flags (defaulting to GNU).
+Available compilers are:
+  CLANG CRAY GNU INTEL XL
+
+endef
+$(info $(compiler_help))
+COMPILER=GNU
+endif
+
+ifndef TARGET
+define target_help
+Set TARGET to change device (defaulting to CPU).
+Available targets are:
+  CPU NVIDIA
+
+endef
+$(info $(target_help))
+TARGET=CPU
+endif
+
+COMPILER_GNU = g++
+COMPILER_INTEL = icpc
+COMPILER_CRAY = CC
+COMPILER_CLANG = clang++
+COMPILER_XL = xlc++
+CXX = $(COMPILER_$(COMPILER))
+
+FLAGS_GNU = -O3 -std=c++11
+FLAGS_INTEL = -O3 -std=c++11 -xHOST -qopt-streaming-stores=always
+FLAGS_CRAY = -O3 -hstd=c++11
+FLAGS_CLANG = -O3 -std=c++11
+FLAGS_XL = -O5 -qarch=pwr8 -qtune=pwr8 -std=c++11
+CXXFLAGS = $(FLAGS_$(COMPILER))
+
+# OpenMP flags for CPUs
+OMP_GNU_CPU   = -fopenmp
+OMP_INTEL_CPU = -qopenmp
+OMP_CRAY_CPU  = -homp
+OMP_CLANG_CPU = -fopenmp=libomp
+OMP_XL_CPU = -qsmp=omp -qthreaded
+
+# OpenMP flags for NVIDIA
+OMP_CRAY_NVIDIA  = -DOMP_TARGET_GPU
+OMP_CLANG_NVIDIA = -DOMP_TARGET_GPU -fopenmp=libomp -fopenmp-targets=nvptx64-nvidia-cuda
+
+ifndef OMP_$(COMPILER)_$(TARGET)
+$(error Targeting $(TARGET) with $(COMPILER) not supported)
+endif
+
+OMP = $(OMP_$(COMPILER)_$(TARGET))
+
+omp-stream: main.cpp OMPStream.cpp
+	$(CXX) $(CXXFLAGS) -DOMP $^ $(OMP) $(EXTRA_FLAGS) -o $@
+
+.PHONY: clean
+clean:
+	rm -f omp-stream
--- a/RAJA.make
+++ b/RAJA.make
@ -0,0 +1,58 @@
+
+ifndef TARGET
+define target_help
+Set TARGET to change to offload device. Defaulting to CPU.
+Available targets are:
+  CPU (default)
+  GPU
+endef
+$(info $(target_help))
+TARGET=CPU
+endif
+
+ifeq ($(TARGET), CPU)
+
+ifndef COMPILER
+define compiler_help
+Set COMPILER to change flags (defaulting to GNU).
+Available compilers are:
+  INTEL GNU CRAY XL
+endef
+$(info $(compiler_help))
+COMPILER=GNU
+endif
+
+CXX_INTEL = icpc
+CXX_GNU   = g++
+CXX_CRAY  = CC
+CXX_XL    = xlc++
+
+CXXFLAGS_INTEL = -O3 -std=c++11 -qopenmp -xHost -qopt-streaming-stores=always
+CXXFLAGS_GNU   = -O3 -std=c++11 -fopenmp
+CXXFLAGS_CRAY  = -O3 -hstd=c++11
+CXXFLAGS_XL    = -O5 -std=c++11 -qarch=pwr8 -qtune=pwr8 -qsmp=omp -qthreaded
+
+CXX = $(CXX_$(COMPILER))
+CXXFLAGS = -DRAJA_TARGET_CPU $(CXXFLAGS_$(COMPILER))
+
+else ifeq ($(TARGET), GPU)
+CXX = nvcc
+
+ifndef ARCH
+define arch_help
+Set ARCH to ensure correct GPU architecture.
+Example:
+  ARCH=sm_35
+endef
+$(error $(arch_help))
+endif
+CXXFLAGS = --expt-extended-lambda -O3 -std=c++11 -x cu -Xcompiler -fopenmp -arch $(ARCH)
+endif
+
+raja-stream: main.cpp RAJAStream.cpp
+	$(CXX) $(CXXFLAGS) -DUSE_RAJA -I$(RAJA_PATH)/include $^ $(EXTRA_FLAGS) -L$(RAJA_PATH)/lib -lRAJA -o $@
+
+.PHONY: clean
+clean:
+	rm -f raja-stream
+
--- a/RAJAStream.cpp
+++ b/RAJAStream.cpp
@ -10,6 +10,10 @@
 using RAJA::forall;
 using RAJA::RangeSegment;

+#ifndef ALIGNMENT
+#define ALIGNMENT (2*1024*1024) // 2MB
+#endif
+
 template <class T>
 RAJAStream<T>::RAJAStream(const unsigned int ARRAY_SIZE, const int device_index)
    : array_size(ARRAY_SIZE)
@ -18,9 +22,9 @@ RAJAStream<T>::RAJAStream(const unsigned int ARRAY_SIZE, const int device_index)
  index_set.push_back(seg);

 #ifdef RAJA_TARGET_CPU
-  d_a = new T[ARRAY_SIZE];
-  d_b = new T[ARRAY_SIZE];
-  d_c = new T[ARRAY_SIZE];
+  d_a = (T*)aligned_alloc(ALIGNMENT, sizeof(T)*array_size);
+  d_b = (T*)aligned_alloc(ALIGNMENT, sizeof(T)*array_size);
+  d_c = (T*)aligned_alloc(ALIGNMENT, sizeof(T)*array_size);
 #else
  cudaMallocManaged((void**)&d_a, sizeof(T)*ARRAY_SIZE, cudaMemAttachGlobal);
  cudaMallocManaged((void**)&d_b, sizeof(T)*ARRAY_SIZE, cudaMemAttachGlobal);
@ -33,9 +37,9 @@ template <class T>
 RAJAStream<T>::~RAJAStream()
 {
 #ifdef RAJA_TARGET_CPU
-  delete[] d_a;
-  delete[] d_b;
-  delete[] d_c;
+  free(d_a);
+  free(d_b);
+  free(d_c);
 #else
  cudaFree(d_a);
  cudaFree(d_b);
@ -46,10 +50,10 @@ RAJAStream<T>::~RAJAStream()
 template <class T>
 void RAJAStream<T>::init_arrays(T initA, T initB, T initC)
 {
-  T* a = d_a;
-  T* b = d_b;
-  T* c = d_c;
-  forall<policy>(index_set, [=] RAJA_DEVICE (int index)
+  T* RAJA_RESTRICT a = d_a;
+  T* RAJA_RESTRICT b = d_b;
+  T* RAJA_RESTRICT c = d_c;
+  forall<policy>(index_set, [=] RAJA_DEVICE (RAJA::Index_type index)
  {
    a[index] = initA;
    b[index] = initB;
@ -69,9 +73,9 @@ void RAJAStream<T>::read_arrays(
 template <class T>
 void RAJAStream<T>::copy()
 {
-  T* a = d_a;
-  T* c = d_c;
-  forall<policy>(index_set, [=] RAJA_DEVICE (int index)
+  T* RAJA_RESTRICT a = d_a;
+  T* RAJA_RESTRICT c = d_c;
+  forall<policy>(index_set, [=] RAJA_DEVICE (RAJA::Index_type index)
  {
    c[index] = a[index];
  });
@ -80,10 +84,10 @@ void RAJAStream<T>::copy()
 template <class T>
 void RAJAStream<T>::mul()
 {
-  T* b = d_b;
-  T* c = d_c;
+  T* RAJA_RESTRICT b = d_b;
+  T* RAJA_RESTRICT c = d_c;
  const T scalar = startScalar;
-  forall<policy>(index_set, [=] RAJA_DEVICE (int index)
+  forall<policy>(index_set, [=] RAJA_DEVICE (RAJA::Index_type index)
  {
    b[index] = scalar*c[index];
  });
@ -92,10 +96,10 @@ void RAJAStream<T>::mul()
 template <class T>
 void RAJAStream<T>::add()
 {
-  T* a = d_a;
-  T* b = d_b;
-  T* c = d_c;
-  forall<policy>(index_set, [=] RAJA_DEVICE (int index)
+  T* RAJA_RESTRICT a = d_a;
+  T* RAJA_RESTRICT b = d_b;
+  T* RAJA_RESTRICT c = d_c;
+  forall<policy>(index_set, [=] RAJA_DEVICE (RAJA::Index_type index)
  {
    c[index] = a[index] + b[index];
  });
@ -104,11 +108,11 @@ void RAJAStream<T>::add()
 template <class T>
 void RAJAStream<T>::triad()
 {
-  T* a = d_a;
-  T* b = d_b;
-  T* c = d_c;
+  T* RAJA_RESTRICT a = d_a;
+  T* RAJA_RESTRICT b = d_b;
+  T* RAJA_RESTRICT c = d_c;
  const T scalar = startScalar;
-  forall<policy>(index_set, [=] RAJA_DEVICE (int index)
+  forall<policy>(index_set, [=] RAJA_DEVICE (RAJA::Index_type index)
  {
    a[index] = b[index] + scalar*c[index];
  });
@ -117,12 +121,12 @@ void RAJAStream<T>::triad()
 template <class T>
 T RAJAStream<T>::dot()
 {
-  T* a = d_a;
-  T* b = d_b;
+  T* RAJA_RESTRICT a = d_a;
+  T* RAJA_RESTRICT b = d_b;

  RAJA::ReduceSum<reduce_policy, T> sum(0.0);

-  forall<policy>(index_set, [=] RAJA_DEVICE (int index)
+  forall<policy>(index_set, [=] RAJA_DEVICE (RAJA::Index_type index)
  {
    sum += a[index] * b[index];
  });
--- a/README.android
+++ b/README.android
@ -0,0 +1,36 @@
+Android (outdated instructions)
+------------------
+
+Assuming you have a recent Android NDK available, you can use the
+toolchain that it provides to build GPU-STREAM. You should first
+use the NDK to generate a standalone toolchain:
+
+    # Select a directory to install the toolchain to
+    ANDROID_NATIVE_TOOLCHAIN=/path/to/toolchain
+
+    ${NDK}/build/tools/make-standalone-toolchain.sh \
+      --platform=android-14 \
+      --toolchain=arm-linux-androideabi-4.8 \
+      --install-dir=${ANDROID_NATIVE_TOOLCHAIN}
+
+Make sure that the OpenCL headers and library (libOpenCL.so) are
+available in `${ANDROID_NATIVE_TOOLCHAIN}/sysroot/usr/`.
+
+You should then be able to build GPU-STREAM:
+
+    make CXX=${ANDROID_NATIVE_TOOLCHAIN}/bin/arm-linux-androideabi-g++
+
+Copy the executable and OpenCL kernels to the device:
+
+    adb push gpu-stream-ocl /data/local/tmp
+    adb push ocl-stream-kernels.cl /data/local/tmp
+
+Run GPU-STREAM from an adb shell:
+
+    adb shell
+    cd /data/local/tmp
+
+    # Use float if device doesn't support double, and reduce array size
+    ./gpu-stream-ocl --float -n 6 -s 10000000
+
+
--- a/README.md
+++ b/README.md
@ -1,4 +1,4 @@
-GPU-STREAM
+BabelStream
 ==========

 Measure memory transfer rates to/from global device memory on GPUs.
@ -16,53 +16,51 @@ Currently implemented are:
  - RAJA
  - SYCL

+This code was previously called GPU-STREAM.
+
 Website
 -------
-[uob-hpc.github.io/GPU-STREAM/](uob-hpc.github.io/GPU-STREAM/)
+[uob-hpc.github.io/BabelStream/](https://uob-hpc.github.io/BabelStream/)

 Usage
 -----

-CMake 3.2 or above is required.
-Drivers, compiler and software applicable to whichever implementation you would like to build against. Our build system is designed to only build implementations in programming models that your system supports.
+Drivers, compiler and software applicable to whichever implementation you would like to build against is required.

-Generate the Makefile with `cmake .`
+We have supplied a series of Makefiles, one for each programming model, to assist with building.
+The Makefiles contain common build options, and should be simple to customise for your needs too.

-Android (outdated instructions)
------------------
+General usage is `make -f <Model>.make`
+Common compiler flags and names can be set by passing a `COMPILER` option to Make, e.g. `make COMPILER=GNU`.
+Some models allow specifying a CPU or GPU style target, and this can be set by passing a `TARGET` option to Make, e.g. `make TARGET=GPU`.

-Assuming you have a recent Android NDK available, you can use the
-toolchain that it provides to build GPU-STREAM. You should first
-use the NDK to generate a standalone toolchain:
+Pass in extra flags via the `EXTRA_FLAGS` option.

-    # Select a directory to install the toolchain to
-    ANDROID_NATIVE_TOOLCHAIN=/path/to/toolchain
+The binaries are named in the form `<model>-stream`.

-    ${NDK}/build/tools/make-standalone-toolchain.sh \
-      --platform=android-14 \
-      --toolchain=arm-linux-androideabi-4.8 \
-      --install-dir=${ANDROID_NATIVE_TOOLCHAIN}
+Building Kokkos
+---------------

-Make sure that the OpenCL headers and library (libOpenCL.so) are
-available in `${ANDROID_NATIVE_TOOLCHAIN}/sysroot/usr/`.
+We use the following command to build Kokkos using the Intel Compiler, specifying the `arch` appropriately, e.g. `KNL`.
+```
+../generate_makefile.bash --prefix=<prefix> --with-openmp --with-pthread --arch=<arch> --compiler=icpc --cxxflags=-DKOKKOS_MEMORY_ALIGNMENT=2097152
+```
+For building with CUDA support, we use the following command, specifying the `arch` appropriately, e.g. `Kepler35`.
+```
+../generate_makefile.bash --prefix=<prefix> --with-cuda --with-openmp --with-pthread --arch=<arch> --with-cuda-options=enable_lambda
+```

-You should then be able to build GPU-STREAM:
-
-    make CXX=${ANDROID_NATIVE_TOOLCHAIN}/bin/arm-linux-androideabi-g++
-
-Copy the executable and OpenCL kernels to the device:
-
-    adb push gpu-stream-ocl /data/local/tmp
-    adb push ocl-stream-kernels.cl /data/local/tmp
-
-Run GPU-STREAM from an adb shell:
-
-    adb shell
-    cd /data/local/tmp
-
-    # Use float if device doesn't support double, and reduce array size
-    ./gpu-stream-ocl --float -n 6 -s 10000000
+Building RAJA
+-------------

+We use the following command to build RAJA using the Intel Compiler.
+```
+cmake .. -DCMAKE_INSTALL_PREFIX=<prefix> -DCMAKE_C_COMPILER=icc -DCMAKE_CXX_COMPILER=icpc -DRAJA_PTR="RAJA_USE_RESTRICT_ALIGNED_PTR" -DCMAKE_BUILD_TYPE=ICCBuild -DRAJA_ENABLE_TESTS=Off
+```
+For building with CUDA support, we use the following command.
+```
+cmake .. -DCMAKE_INSTALL_PREFIX=<prefix> -DRAJA_PTR="RAJA_USE_RESTRICT_ALIGNED_PTR" -DRAJA_ENABLE_CUDA=1 -DRAJA_ENABLE_TESTS=Off
+```

 Results
 -------
@ -72,13 +70,17 @@ Sample results can be found in the `results` subdirectory. If you would like to
 Citing
 ------

-You can view the [Poster and Extended Abstract](http://sc15.supercomputing.org/sites/all/themes/SC15images/tech_poster/tech_poster_pages/post150.html) on GPU-STREAM presented at SC'15. Please cite GPU-STREAM via this reference:
+Please cite BabelStream via this reference:

 > Deakin T, Price J, Martineau M, McIntosh-Smith S. GPU-STREAM v2.0: Benchmarking the achievable memory bandwidth of many-core processors across diverse parallel programming models. 2016. Paper presented at P^3MA Workshop at ISC High Performance, Frankfurt, Germany.

-**Other GPU-STREAM publications:**
+**Other BabelStream publications:**

 > Deakin T, McIntosh-Smith S. GPU-STREAM: Benchmarking the achievable memory bandwidth of Graphics Processing Units. 2015. Poster session presented at IEEE/ACM SuperComputing, Austin, United States.
+You can view the [Poster and Extended Abstract](http://sc15.supercomputing.org/sites/all/themes/SC15images/tech_poster/tech_poster_pages/post150.html).
+
+> Deakin T, Price J, Martineau M, McIntosh-Smith S. GPU-STREAM: Now in 2D!. 2016. Poster session presented at IEEE/ACM SuperComputing, Salt Lake City, United States.
+You can view the [Poster and Extended Abstract](http://sc16.supercomputing.org/sc-archive/tech_poster/tech_poster_pages/post139.html).



--- a/SYCL.make
+++ b/SYCL.make
@ -0,0 +1,12 @@
+
+COMPUTECPP_FLAGS = $(shell computecpp_info --dump-device-compiler-flags)
+
+sycl-stream: main.cpp SYCLStream.cpp SYCLStream.sycl
+	$(CXX) -O3 -std=c++11 -DSYCL main.cpp SYCLStream.cpp -include SYCLStream.sycl $(EXTRA_FLAGS) -lComputeCpp -lOpenCL -o $@
+
+SYCLStream.sycl: SYCLStream.cpp
+	compute++ SYCLStream.cpp $(COMPUTECPP_FLAGS) -c
+
+.PHONY: clean
+clean:
+	rm -f sycl-stream SYCLStream.sycl SYCLStream.bc
--- a/cmake/Modules/FindComputeCpp.cmake
+++ b/cmake/Modules/FindComputeCpp.cmake
@ -1,248 +0,0 @@
-#.rst:
-# FindComputeCpp
-#---------------
-#
-#   Copyright 2016 Codeplay Software Ltd.
-#
-#   Licensed under the Apache License, Version 2.0 (the "License");
-#   you may not use these files except in compliance with the License.
-#   You may obtain a copy of the License at
-#
-#       http://www.apache.org/licenses/LICENSE-2.0
-#
-#
-#   Unless required by applicable law or agreed to in writing, software
-#   distributed under the License is distributed on an "AS IS" BASIS,
-#   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-#   See the License for the specific language governing permissions and
-#   limitations under the License.
-
-#########################
-#  FindComputeCpp.cmake  
-#########################
-#
-#  Tools for finding and building with ComputeCpp.
-#
-#  User must define COMPUTECPP_PACKAGE_ROOT_DIR pointing to the ComputeCpp 
-#   installation.
-#  
-#  Latest version of this file can be found at:
-#    https://github.com/codeplaysoftware/computecpp-sdk
-
-# Require CMake version 3.2.2 or higher
-cmake_minimum_required(VERSION 3.2.2)
-
-# Check that a supported host compiler can be found
-if(CMAKE_COMPILER_IS_GNUCXX)
-    # Require at least gcc 4.8
-    if (CMAKE_CXX_COMPILER_VERSION VERSION_LESS 4.8)
-      message(FATAL_ERROR
-        "host compiler - Not found! (gcc version must be at least 4.8)")
-    # Require the GCC dual ABI to be disabled for 5.1 or higher
-    elseif (CMAKE_CXX_COMPILER_VERSION VERSION_GREATER 5.1)
-      set(COMPUTECPP_DISABLE_GCC_DUAL_ABI "True")
-      message(STATUS
-        "host compiler - gcc ${CMAKE_CXX_COMPILER_VERSION} (note pre 5.1 gcc ABI enabled)")
-    else()
-      message(STATUS "host compiler - gcc ${CMAKE_CXX_COMPILER_VERSION}")
-    endif()
-elseif ("${CMAKE_CXX_COMPILER_ID}" STREQUAL "Clang")
-    # Require at least clang 3.6
-    if (${CMAKE_CXX_COMPILER_VERSION} VERSION_LESS 3.6)
-      message(FATAL_ERROR
-        "host compiler - Not found! (clang version must be at least 3.6)")
-    else()
-      message(STATUS "host compiler - clang ${CMAKE_CXX_COMPILER_VERSION}")
-    endif()
-else()
-  message(WARNING
-    "host compiler - Not found! (ComputeCpp supports GCC and Clang, see readme)")
-endif()
-
-set(COMPUTECPP_64_BIT_DEFAULT ON)
-option(COMPUTECPP_64_BIT_CODE "Compile device code in 64 bit mode"
-        ${COMPUTECPP_64_BIT_DEFAULT})
-mark_as_advanced(COMPUTECPP_64_BIT_CODE)
-
-# Find OpenCL package
-find_package(OpenCL REQUIRED)
-
-# Find ComputeCpp packagee
-if(NOT COMPUTECPP_PACKAGE_ROOT_DIR)
-  message(FATAL_ERROR
-    "ComputeCpp package - Not found! (please set COMPUTECPP_PACKAGE_ROOT_DIR")
-else()
-  message(STATUS "ComputeCpp package - Found")
-endif()
-option(COMPUTECPP_PACKAGE_ROOT_DIR "Path to the ComputeCpp Package")
-
-# Obtain the path to compute++
-find_program(COMPUTECPP_DEVICE_COMPILER compute++ PATHS
-  ${COMPUTECPP_PACKAGE_ROOT_DIR} PATH_SUFFIXES bin)
-if (EXISTS ${COMPUTECPP_DEVICE_COMPILER})
-  mark_as_advanced(COMPUTECPP_DEVICE_COMPILER)
-  message(STATUS "compute++ - Found")
-else()
-  message(FATAL_ERROR "compute++ - Not found! (${COMPUTECPP_DEVICE_COMPILER})")
-endif()
-
-# Obtain the path to computecpp_info
-find_program(COMPUTECPP_INFO_TOOL computecpp_info PATHS
-  ${COMPUTECPP_PACKAGE_ROOT_DIR} PATH_SUFFIXES bin)
-if (EXISTS ${COMPUTECPP_INFO_TOOL})
-  mark_as_advanced(${COMPUTECPP_INFO_TOOL})
-  message(STATUS "computecpp_info - Found")
-else()
-  message(FATAL_ERROR "computecpp_info - Not found! (${COMPUTECPP_INFO_TOOL})")
-endif()
-
-# Obtain the path to the ComputeCpp runtime library
-find_library(COMPUTECPP_RUNTIME_LIBRARY ComputeCpp PATHS ${COMPUTECPP_PACKAGE_ROOT_DIR}
-  HINTS ${COMPUTECPP_PACKAGE_ROOT_DIR}/lib PATH_SUFFIXES lib
-  DOC "ComputeCpp Runtime Library" NO_DEFAULT_PATH)
-
-if (EXISTS ${COMPUTECPP_RUNTIME_LIBRARY})
-  mark_as_advanced(COMPUTECPP_RUNTIME_LIBRARY)
-  message(STATUS "libComputeCpp.so - Found")
-else()
-  message(FATAL_ERROR "libComputeCpp.so - Not found!")
-endif()
-
-# Obtain the ComputeCpp include directory
-set(COMPUTECPP_INCLUDE_DIRECTORY ${COMPUTECPP_PACKAGE_ROOT_DIR}/include/)
-if (NOT EXISTS ${COMPUTECPP_INCLUDE_DIRECTORY})
-  message(FATAL_ERROR "ComputeCpp includes - Not found!")
-else()
-  message(STATUS "ComputeCpp includes - Found")
-endif()
-
-# Obtain the package version
-execute_process(COMMAND ${COMPUTECPP_INFO_TOOL} "--dump-version"
-  OUTPUT_VARIABLE COMPUTECPP_PACKAGE_VERSION
-  RESULT_VARIABLE COMPUTECPP_INFO_TOOL_RESULT OUTPUT_STRIP_TRAILING_WHITESPACE)
-if(NOT COMPUTECPP_INFO_TOOL_RESULT EQUAL "0")
-  message(FATAL_ERROR "Package version - Error obtaining version!")
-else()
-  mark_as_advanced(COMPUTECPP_PACKAGE_VERSION)
-  message(STATUS "Package version - ${COMPUTECPP_PACKAGE_VERSION}")
-endif()
-
-# Obtain the device compiler flags
-execute_process(COMMAND ${COMPUTECPP_INFO_TOOL} "--dump-device-compiler-flags"
-  OUTPUT_VARIABLE COMPUTECPP_DEVICE_COMPILER_FLAGS
-  RESULT_VARIABLE COMPUTECPP_INFO_TOOL_RESULT OUTPUT_STRIP_TRAILING_WHITESPACE)
-if(NOT COMPUTECPP_INFO_TOOL_RESULT EQUAL "0")
-  message(FATAL_ERROR "compute++ flags - Error obtaining compute++ flags!")
-else()
-  mark_as_advanced(COMPUTECPP_COMPILER_FLAGS)
-  message(STATUS "compute++ flags - ${COMPUTECPP_DEVICE_COMPILER_FLAGS}")
-endif()
-
-# Check if the platform is supported
-execute_process(COMMAND ${COMPUTECPP_INFO_TOOL} "--dump-is-supported"
-  OUTPUT_VARIABLE COMPUTECPP_PLATFORM_IS_SUPPORTED
-  RESULT_VARIABLE COMPUTECPP_INFO_TOOL_RESULT OUTPUT_STRIP_TRAILING_WHITESPACE)
-if(NOT COMPUTECPP_INFO_TOOL_RESULT EQUAL "0")
-  message(FATAL_ERROR "platform - Error checking platform support!")
-else()
-  mark_as_advanced(COMPUTECPP_PLATFORM_IS_SUPPORTED)
-  if (COMPUTECPP_PLATFORM_IS_SUPPORTED)
-    message(STATUS "platform - your system can support ComputeCpp")
-  else()
-    message(STATUS "platform - your system CANNOT support ComputeCpp")
-  endif()
-endif()
-
-####################
-#   __build_sycl   
-####################
-#
-#  Adds a custom target for running compute++ and adding a dependency for the
-#  resulting integration header.
-#
-#  targetName : Name of the target.
-#  sourceFile : Source file to be compiled.
-#  binaryDir : Intermediate directory to output the integration header.
-#
-function(__build_spir targetName sourceFile binaryDir)
-  
-  # Retrieve source file name.
-  get_filename_component(sourceFileName ${sourceFile} NAME)
-
-  # Set the path to the Sycl file.
-  set(outputSyclFile ${binaryDir}/${sourceFileName}.sycl)
-
-  # Add any user-defined include to the device compiler
-  get_property(includeDirectories DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR} PROPERTY
-    INCLUDE_DIRECTORIES)
-  set(device_compiler_includes "")
-  foreach(directory ${includeDirectories})
-    set(device_compiler_includes "-I${directory}" ${device_compiler_includes})
-  endforeach()
-  if (CMAKE_INCLUDE_PATH)
-    foreach(directory ${CMAKE_INCLUDE_PATH})
-      set(device_compiler_includes "-I${directory}"
-        ${device_compiler_includes})
-    endforeach()
-  endif()
-
-  # Convert argument list format
-  separate_arguments(COMPUTECPP_DEVICE_COMPILER_FLAGS)
-
-  # Add custom command for running compute++
-  add_custom_command(
-    OUTPUT ${outputSyclFile}
-    COMMAND ${COMPUTECPP_DEVICE_COMPILER}
-            ${COMPUTECPP_DEVICE_COMPILER_FLAGS}
-            -isystem ${COMPUTECPP_INCLUDE_DIRECTORY}
-            ${COMPUTECPP_PLATFORM_SPECIFIC_ARGS}
-            ${device_compiler_includes}
-            -o ${outputSyclFile}
-            -c ${sourceFile}
-    DEPENDS ${sourceFile}
-    WORKING_DIRECTORY ${binaryDir}
-  COMMENT "Building ComputeCpp integration header file ${outputSyclFile}")
-
-  # Add a custom target for the generated integration header
-  add_custom_target(${targetName}_integration_header DEPENDS ${outputSyclFile})
-
-  # Add a dependency on the integration header
-  add_dependencies(${targetName} ${targetName}_integration_header)
-
-  # Force inclusion of the integration header for the host compiler
-  set(compileFlags -include ${outputSyclFile} "-Wall")
-  target_compile_options(${targetName} PUBLIC ${compileFlags})
-  
-  # Set the host compiler C++ standard to C++11
-  set_property(TARGET ${targetName} PROPERTY CXX_STANDARD 11)
-
-  # Disable GCC dual ABI on GCC 5.1 and higher
-  if(COMPUTECPP_DISABLE_GCC_DUAL_ABI)
-    set_property(TARGET ${targetName} APPEND PROPERTY COMPILE_DEFINITIONS
-      "_GLIBCXX_USE_CXX11_ABI=0")
-  endif()
-
-endfunction()
-
-#######################
-#  add_sycl_to_target
-#######################
-#
-#  Adds a SYCL compilation custom command associated with an existing
-#  target and sets a dependancy on that new command.
-#
-#  targetName : Name of the target to add a SYCL to.
-#  sourceFile : Source file to be compiled for SYCL.
-#  binaryDir : Intermediate directory to output the integration header.
-#
-function(add_sycl_to_target targetName sourceFile binaryDir)
-
-  # Add custom target to run compute++ and generate the integration header
-  __build_spir(${targetName} ${sourceFile} ${binaryDir})
-
-  # Link with the ComputeCpp runtime library
-  target_link_libraries(${targetName} PUBLIC ${COMPUTECPP_RUNTIME_LIBRARY}
-                        PUBLIC ${OpenCL_LIBRARIES})
-
-endfunction(add_sycl_to_target)
-
--- a/main.cpp
+++ b/main.cpp
@ -15,7 +15,7 @@
 #include <iomanip>
 #include <cstring>

-#define VERSION_STRING "devel"
+#define VERSION_STRING "3.2"

 #include "Stream.h"

@ -54,7 +54,7 @@ void parseArguments(int argc, char *argv[]);
 int main(int argc, char *argv[])
 {
  std::cout
-    << "GPU-STREAM" << std::endl
+    << "BabelStream" << std::endl
    << "Version: " << VERSION_STRING << std::endl
    << "Implementation: " << IMPLEMENTATION_STRING << std::endl;