From b3cf9992bb7784fc7219e2edf289ffeb991a234d Mon Sep 17 00:00:00 2001
From: Simon McIntosh-Smith <simonm@cs.bris.ac.uk>
Date: Mon, 7 Nov 2016 23:35:00 +0000
Subject: [PATCH 01/76] Fixed broken link to new GPU-STREAM webpage

---
 README.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/README.md b/README.md
index b024915..0d181d5 100644
--- a/README.md
+++ b/README.md
@@ -18,7 +18,7 @@ Currently implemented are:
 
 Website
 -------
-[uob-hpc.github.io/GPU-STREAM/](uob-hpc.github.io/GPU-STREAM/)
+[uob-hpc.github.io/GPU-STREAM/](https://uob-hpc.github.io/GPU-STREAM/)
 
 Usage
 -----

From edd65dacb1c29749637c5ba1436bf84cc978c4d2 Mon Sep 17 00:00:00 2001
From: Tom Deakin <tom.deakin@bristol.ac.uk>
Date: Tue, 22 Nov 2016 20:06:54 +0000
Subject: [PATCH 02/76] Add Kokkos Makefile for CPU

---
 KokkosCPUMakefile | 10 ++++++++++
 1 file changed, 10 insertions(+)
 create mode 100644 KokkosCPUMakefile

diff --git a/KokkosCPUMakefile b/KokkosCPUMakefile
new file mode 100644
index 0000000..caa8b77
--- /dev/null
+++ b/KokkosCPUMakefile
@@ -0,0 +1,10 @@
+
+default: gpu-stream-kokkos
+
+include $(KOKKOS_PATH)/Makefile.kokkos
+
+gpu-stream-kokkos: main.o KOKKOSStream.o
+	$(CXX) $(KOKKOS_LDFLAGS) $^ $(KOKKOS_LIBS) -o $@ -DKOKKOS -DKOKKOS_TARGET_CPU -O3
+
+%.o:%.cpp $(KOKKOS_CPP_DEPENDS)
+	$(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) -c $< -DKOKKOS -DKOKKOS_TARGET_CPU -O3

From ec2bf50e750c6f574d42e761129bdfd3f972ea99 Mon Sep 17 00:00:00 2001
From: Tom Deakin <tom.deakin@bristol.ac.uk>
Date: Mon, 30 Jan 2017 13:52:45 +0000
Subject: [PATCH 03/76] Version bump

---
 main.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/main.cpp b/main.cpp
index c1ca69f..2b6e459 100644
--- a/main.cpp
+++ b/main.cpp
@@ -15,7 +15,7 @@
 #include <iomanip>
 #include <cstring>
 
-#define VERSION_STRING "devel"
+#define VERSION_STRING "3.0"
 
 #include "Stream.h"
 

From caf367f8fbb58fa4f1bc6185269bc95f96980ec8 Mon Sep 17 00:00:00 2001
From: James Price <j.price@bristol.ac.uk>
Date: Mon, 30 Jan 2017 16:12:09 +0000
Subject: [PATCH 04/76] Remove broken common.h generation from CMake build

---
 CMakeLists.txt | 1 -
 1 file changed, 1 deletion(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 6f3439e..5f31cf1 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -22,7 +22,6 @@ include(CheckCXXCompilerFlag)
 set(gpu-stream_VERSION_MAJOR 2)
 set(gpu-stream_VERSION_MINOR 2)
 
-configure_file(common.h.in common.h)
 include_directories(${CMAKE_BINARY_DIR})
 
 # Use 'Release' if no build type specified

From c320014d2808fb68ee69a8de929f885eec729194 Mon Sep 17 00:00:00 2001
From: Tom Deakin <tom.deakin@bristol.ac.uk>
Date: Thu, 23 Feb 2017 14:58:05 +0000
Subject: [PATCH 05/76] Add CUDA Makefile

---
 CUDA.make | 4 ++++
 1 file changed, 4 insertions(+)
 create mode 100644 CUDA.make

diff --git a/CUDA.make b/CUDA.make
new file mode 100644
index 0000000..2434754
--- /dev/null
+++ b/CUDA.make
@@ -0,0 +1,4 @@
+
+cuda-stream: main.cpp CUDAStream.cu
+	nvcc -std=c++11 -O3 -DCUDA $^ -o $@
+

From c904719f2b6479f4fd35bf50c811a2f6fc421ccf Mon Sep 17 00:00:00 2001
From: Tom Deakin <tom.deakin@bristol.ac.uk>
Date: Thu, 23 Feb 2017 14:58:24 +0000
Subject: [PATCH 06/76] Add OpenCL Makefile

---
 OpenCL.make | 4 ++++
 1 file changed, 4 insertions(+)
 create mode 100644 OpenCL.make

diff --git a/OpenCL.make b/OpenCL.make
new file mode 100644
index 0000000..447deca
--- /dev/null
+++ b/OpenCL.make
@@ -0,0 +1,4 @@
+
+ocl-stream: main.cpp OCLStream.cpp
+	$(CXX) -O3 -std=c++11 -DOCL $^ -lOpenCL -o $@
+

From 1d4b809b446612e0fa1c25054451e1040c3f3ff6 Mon Sep 17 00:00:00 2001
From: Tom Deakin <tom.deakin@bristol.ac.uk>
Date: Thu, 23 Feb 2017 14:58:34 +0000
Subject: [PATCH 07/76] Remove CMake from project

CMake is unable to use multiple compilers within a single build.
We require building multiple binaries, one for each model, and as
such they often require different compilers for each. Therefore
we feel it is simpler to provide a simple, sample Makefile for each
model. Some common configurations will be specalised in due course.
---
 CMakeLists.txt | 197 -------------------------------------------------
 1 file changed, 197 deletions(-)
 delete mode 100644 CMakeLists.txt

diff --git a/CMakeLists.txt b/CMakeLists.txt
deleted file mode 100644
index 5f31cf1..0000000
--- a/CMakeLists.txt
+++ /dev/null
@@ -1,197 +0,0 @@
-
-cmake_minimum_required(VERSION 3.2)
-
-if(NOT DEFINED HIP_PATH)
-    if(NOT DEFINED ENV{HIP_PATH})
-        set(HIP_PATH "/opt/rocm/hip" CACHE PATH "Path to which HIP has been installed")
-    else()
-        set(HIP_PATH $ENV{HIP_PATH} CACHE PATH "Path to which HIP has been installed")
-    endif()
-endif()
-set(CMAKE_MODULE_PATH "${HIP_PATH}/cmake" ${CMAKE_MODULE_PATH})
-
-
-set(CMAKE_CXX_STANDARD 11)
-set(CMAKE_CXX_STANDARD_REQUIRED ON)
-
-project(gpu-stream)
-
-include(CheckIncludeFileCXX)
-include(CheckCXXCompilerFlag)
-
-set(gpu-stream_VERSION_MAJOR 2)
-set(gpu-stream_VERSION_MINOR 2)
-
-include_directories(${CMAKE_BINARY_DIR})
-
-# Use 'Release' if no build type specified
-if (NOT CMAKE_BUILD_TYPE)
-   message("No CMAKE_BUILD_TYPE specified, defaulting to 'Release'")
-   set(CMAKE_BUILD_TYPE "Release")
-endif()
-
-# If using the Cray compiler, manually add the C++11 flag because setting the
-# standard through CMake as above doesn't set this flag with Cray
-if ("${CMAKE_CXX_COMPILER_ID}" MATCHES "Cray")
-  list(APPEND CMAKE_CXX_FLAGS -hstd=c++11)
-endif ()
-
-#-------------------------------------------------------------------------------
-# HIP
-#-------------------------------------------------------------------------------
-find_package(HIP QUIET)
-if(${HIP_FOUND})
-  list(APPEND HIP_HIPCC_FLAGS --std=c++11)
-  hip_add_executable(gpu-stream-hip main.cpp HIPStream.cu)
-  target_compile_definitions(gpu-stream-hip PUBLIC HIP)
-else()
-  message("Skipping HIP...")
-endif()
-
-#-------------------------------------------------------------------------------
-# CUDA
-#-------------------------------------------------------------------------------
-find_package(CUDA 7.0 QUIET)
-set(FLAG True)
-if ("${CMAKE_SYSTEM_NAME}" MATCHES "Darwin")
-  execute_process(COMMAND xcodebuild -version COMMAND head -n 1 OUTPUT_VARIABLE XCODE_VERSION)
-  if ("${XCODE_VERSION}" MATCHES "Xcode 7.3.1")
-    message("Xcode version not supported by CUDA")
-    set(FLAG False)
-  endif ()
-endif ()
-if (${FLAG} AND ${CUDA_FOUND})
-  list(APPEND CUDA_NVCC_FLAGS --std=c++11)
-  cuda_add_executable(gpu-stream-cuda main.cpp CUDAStream.cu)
-  target_compile_definitions(gpu-stream-cuda PUBLIC CUDA)
-else ()
-  message("Skipping CUDA...")
-endif ()
-
-#-------------------------------------------------------------------------------
-# OpenCL
-#-------------------------------------------------------------------------------
-find_package(OpenCL QUIET)
-if (${OpenCL_FOUND})
-  add_executable(gpu-stream-ocl main.cpp OCLStream.cpp)
-  target_compile_definitions(gpu-stream-ocl PUBLIC OCL)
-  target_link_libraries(gpu-stream-ocl ${OpenCL_LIBRARY})
-else ()
-  message("Skipping OpenCL...")
-endif ()
-
-#-------------------------------------------------------------------------------
-# OpenACC
-#-------------------------------------------------------------------------------
-# Check compiler supports an OpenACC flag
-include(CheckCXXCompilerFlag)
-if ("${CMAKE_CXX_COMPILER_ID}" MATCHES "GNU")
-  CHECK_CXX_COMPILER_FLAG(-fopenacc OPENACC)
-  if (OPENACC)
-    list (APPEND CMAKE_EXE_LINKER_FLAGS -fopenacc)
-  endif ()
-elseif ("${CMAKE_CXX_COMPILER_ID}" MATCHES "PGI")
-  CHECK_CXX_COMPILER_FLAG(-acc OPENACC)
-elseif ("${CMAKE_CXX_COMPILER_ID}" MATCHES "Cray")
-  CHECK_CXX_COMPILER_FLAG(-hacc=openacc OPENACC)
-endif ()
-
-if (OPENACC)
-  add_executable(gpu-stream-acc main.cpp ACCStream.cpp)
-  target_compile_definitions(gpu-stream-acc PUBLIC ACC)
-else ()
-  message("Skipping OpenACC...")
-endif ()
-
-#-------------------------------------------------------------------------------
-# OpenMP 3.0
-#-------------------------------------------------------------------------------
-find_package(OpenMP QUIET)
-if (${OpenMP_FOUND})
-  add_executable(gpu-stream-omp3 main.cpp OMP3Stream.cpp)
-  target_compile_definitions(gpu-stream-omp3 PUBLIC OMP3)
-else ()
-  message("Skipping OpenMP 3...")
-endif ()
-
-#-------------------------------------------------------------------------------
-# OpenMP 4.5
-#-------------------------------------------------------------------------------
-if ("${CMAKE_CXX_COMPILER_ID}" MATCHES "Cray")
-  if (CMAKE_CXX_COMPILER_VERSION VERSION_GREATER 4.5)
-    add_executable(gpu-stream-omp45 main.cpp OMP45Stream.cpp)
-    target_compile_definitions(gpu-stream-omp45 PUBLIC OMP45)
-  endif ()
-endif ()
-
-#-------------------------------------------------------------------------------
-# RAJA
-#-------------------------------------------------------------------------------
-
-if (RAJA_PATH)
-  find_package(OpenMP)
-  find_package(CUDA 7.5)
-  list(APPEND CUDA_NVCC_FLAGS "-arch compute_35")
-  list(APPEND CUDA_NVCC_FLAGS --expt-extended-lambda)
-  list(APPEND CUDA_NVCC_FLAGS -Xcompiler ${OpenMP_CXX_FLAGS})
-  list(APPEND CUDA_NVCC_FLAGS -DUSE_RAJA)
-  cuda_include_directories(${RAJA_PATH}/include)
-  set_source_files_properties(RAJAStream.cpp PROPERTIES CUDA_SOURCE_PROPERTY_FORMAT OBJ)
-  cuda_add_executable(gpu-stream-raja main.cpp RAJAStream.cpp)
-  target_compile_definitions(gpu-stream-raja PUBLIC USE_RAJA)
-  target_link_libraries(gpu-stream-raja "-L${RAJA_PATH}/lib -lRAJA")
-else()
-  message("Skipping RAJA... (use -DRAJA_PATH=/path/to/raja to opt in)")
-endif()
-
-#-------------------------------------------------------------------------------
-# Kokkos
-#-------------------------------------------------------------------------------
-if (KOKKOS_PATH)
-  if ("${CMAKE_SYSTEM_NAME}" MATCHES "Linux")
-    add_custom_target(gpu-stream-kokkos COMMAND make -f KokkosMakefile KOKKOS_PATH=${KOKKOS_PATH})
-  else()
-    message("Skipping Kokkos (requires Linux)")
-  endif()
-else()
-  message("Skipping Kokkos... (use -DKOKKOS_PATH=/path/to/kokkos to opt in)")
-endif()
-
-#-------------------------------------------------------------------------------
-# SYCL
-#-------------------------------------------------------------------------------
-if ("${CMAKE_CXX_COMPILER_ID}" MATCHES "Clang" OR
-    "${CMAKE_CXX_COMPILER_ID}" MATCHES "GNU")
-  # Use C++14 if available, otherwise drop back to C++11
-  check_cxx_compiler_flag("-std=c++14" CXX14)
-  if (CXX14)
-    set(CMAKE_REQUIRED_FLAGS "-std=c++14")
-  else()
-    set(CMAKE_REQUIRED_FLAGS "-std=c++11")
-  endif()
-endif()
-
-check_include_file_cxx("CL/sycl.hpp" HAS_SYCL)
-if (HAS_SYCL)
-  add_executable(gpu-stream-sycl main.cpp SYCLStream.cpp)
-  target_compile_definitions(gpu-stream-sycl PUBLIC SYCL)
-
-  # The user must define this in order to use FindComputeCpp
-  if (COMPUTECPP_PACKAGE_ROOT_DIR)
-    message(STATUS "Using ComputeCpp for SYCL compilation")
-    include(FindComputeCpp)
-
-    include_directories(${COMPUTECPP_INCLUDE_DIRECTORY})
-
-    set(SOURCE_NAME "SYCLStream")
-
-    target_compile_options(gpu-stream-sycl PUBLIC ${HOST_COMPILER_OPTIONS})
-    add_sycl_to_target(gpu-stream-sycl ${CMAKE_CURRENT_SOURCE_DIR}/${SOURCE_NAME}.cpp 
-                      ${CMAKE_CURRENT_BINARY_DIR})
-  else()
-    message(STATUS "Using header-only SYCL implementation")
-    set_property(TARGET gpu-stream-sycl PROPERTY CXX_STANDARD 14)
-  endif()
-else ()
-  message("Skipping SYCL...")
-endif (HAS_SYCL)

From 436c3899c777c7712aff16efa47e8cd2ff40561b Mon Sep 17 00:00:00 2001
From: Tom Deakin <tom.deakin@bristol.ac.uk>
Date: Thu, 23 Feb 2017 15:10:23 +0000
Subject: [PATCH 08/76] Add OpenMP makefile that targets CPUs and GPUs, with
 common defaults.

---
 OpenMP.make | 29 +++++++++++++++++++++++++++++
 1 file changed, 29 insertions(+)
 create mode 100644 OpenMP.make

diff --git a/OpenMP.make b/OpenMP.make
new file mode 100644
index 0000000..115c678
--- /dev/null
+++ b/OpenMP.make
@@ -0,0 +1,29 @@
+
+ifndef COMPILER
+$(info Define a compiler to set common defaults, i.e make COMPILER=GNU)
+endif
+
+COMPILER_ = $(CXX)
+COMPILER_GNU = g++
+COMPILER_INTEL = icpc
+COMPILER_CRAY = CC
+CC = $(COMPILER_$(COMPILER))
+
+FLAGS_ = -O3
+FLAGS_GNU = -O3 -std=c++11
+FLAGS_INTEL = -O3 -std=c++11
+FLAGS_CRAY = -O3 -hstd=c++11
+CFLAGS = $(FLAGS_$(COMPILER))
+
+OMP_ = 
+OMP_GNU   = -fopenmp
+OMP_INTEL = -qopenmp
+OMP_CRAY  =
+OMP = $(OMP_$(COMPILER))
+
+omp-stream: main.cpp OMPStream.cpp
+	$(CC) -O3 -std=c++11 -DOMP $^ $(OMP) -o $@
+
+omp-target-stream: main.cpp OMPStream.cpp
+	$(CC) -O3 -std=c++11 -DOMP -DOMP_TARGET_GPU $^ $(OMP) -o $@
+

From 527a1edd29e7188cef8b23aa3a5f9c71449aaf9f Mon Sep 17 00:00:00 2001
From: Tom Deakin <tom.deakin@bristol.ac.uk>
Date: Thu, 23 Feb 2017 15:14:59 +0000
Subject: [PATCH 09/76] Rename Kokkos Makefiles

---
 KokkosCPUMakefile => KokkosCPU.make | 0
 KokkosMakefile => KokkosGPU.make    | 0
 2 files changed, 0 insertions(+), 0 deletions(-)
 rename KokkosCPUMakefile => KokkosCPU.make (100%)
 rename KokkosMakefile => KokkosGPU.make (100%)

diff --git a/KokkosCPUMakefile b/KokkosCPU.make
similarity index 100%
rename from KokkosCPUMakefile
rename to KokkosCPU.make
diff --git a/KokkosMakefile b/KokkosGPU.make
similarity index 100%
rename from KokkosMakefile
rename to KokkosGPU.make

From 584e1b208bf85980d41be25b588b002653bd3862 Mon Sep 17 00:00:00 2001
From: Tom Deakin <tom.deakin@bristol.ac.uk>
Date: Thu, 23 Feb 2017 15:22:14 +0000
Subject: [PATCH 10/76] Change binary name for Kokkos builds

---
 KokkosCPU.make | 9 +++++++--
 KokkosGPU.make | 8 ++++++--
 2 files changed, 13 insertions(+), 4 deletions(-)

diff --git a/KokkosCPU.make b/KokkosCPU.make
index caa8b77..a15e2b9 100644
--- a/KokkosCPU.make
+++ b/KokkosCPU.make
@@ -1,10 +1,15 @@
 
-default: gpu-stream-kokkos
+default: kokkos-cpu-stream
 
 include $(KOKKOS_PATH)/Makefile.kokkos
 
-gpu-stream-kokkos: main.o KOKKOSStream.o
+kokkos-cpu-stream: main.o KOKKOSStream.o
 	$(CXX) $(KOKKOS_LDFLAGS) $^ $(KOKKOS_LIBS) -o $@ -DKOKKOS -DKOKKOS_TARGET_CPU -O3
 
 %.o:%.cpp $(KOKKOS_CPP_DEPENDS)
 	$(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) -c $< -DKOKKOS -DKOKKOS_TARGET_CPU -O3
+
+.PHONY: clean
+clean:
+	rm -f main.o KOKKOSStream.o
+
diff --git a/KokkosGPU.make b/KokkosGPU.make
index 1104cf4..041c007 100644
--- a/KokkosGPU.make
+++ b/KokkosGPU.make
@@ -1,11 +1,15 @@
 
-default: gpu-stream-kokkos
+default: kokkos-gpu-stream
 
 include $(KOKKOS_PATH)/Makefile.kokkos
 
-gpu-stream-kokkos: main.o KOKKOSStream.o
+kokkos-gpu-stream: main.o KOKKOSStream.o
 	$(CXX) $(KOKKOS_LDFLAGS) $^ $(KOKKOS_LIBS) -o $@ -DKOKKOS -O3
 
 %.o:%.cpp $(KOKKOS_CPP_DEPENDS)
 	$(NVCC_WRAPPER) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) -c $< -DKOKKOS -O3
 
+.PHONY: clean
+clean:
+	rm -f main.o KOKKOSStream.o
+

From 887ae9bf1b93c4525e1d8ebfe4fe40c7e54948e4 Mon Sep 17 00:00:00 2001
From: James Price <j.price@bristol.ac.uk>
Date: Thu, 23 Feb 2017 15:30:35 +0000
Subject: [PATCH 11/76] Add SYCL Makefile

---
 SYCL.make | 11 +++++++++++
 1 file changed, 11 insertions(+)
 create mode 100644 SYCL.make

diff --git a/SYCL.make b/SYCL.make
new file mode 100644
index 0000000..5dde5c8
--- /dev/null
+++ b/SYCL.make
@@ -0,0 +1,11 @@
+
+sycl-stream: main.cpp SYCLStream.cpp SYCLStream.sycl
+	$(CXX) -O3 -std=c++11 -DSYCL main.cpp SYCLStream.cpp -include SYCLStream.sycl -lComputeCpp -lOpenCL -o $@
+
+
+SYCLStream.sycl: SYCLStream.cpp
+	compute++ SYCLStream.cpp -sycl -no-serial-memop -O2 -emit-llvm -c
+
+.PHONY: clean
+clean:
+	rm -f SYCLStream.sycl

From ce5fde973bbb7f606713499c1476130c10eb5ddc Mon Sep 17 00:00:00 2001
From: Tom Deakin <tom.deakin@bristol.ac.uk>
Date: Thu, 23 Feb 2017 15:31:38 +0000
Subject: [PATCH 12/76] Combine Kokkos CPU and GPU builds

---
 Kokkos.make    | 28 ++++++++++++++++++++++++++++
 KokkosCPU.make | 15 ---------------
 KokkosGPU.make | 15 ---------------
 3 files changed, 28 insertions(+), 30 deletions(-)
 create mode 100644 Kokkos.make
 delete mode 100644 KokkosCPU.make
 delete mode 100644 KokkosGPU.make

diff --git a/Kokkos.make b/Kokkos.make
new file mode 100644
index 0000000..e30826c
--- /dev/null
+++ b/Kokkos.make
@@ -0,0 +1,28 @@
+
+default: kokkos-stream
+
+include $(KOKKOS_PATH)/Makefile.kokkos
+
+ifndef TARGET
+$(info No target defined. Specify CPU or GPU. Defaulting to CPU)
+TARGET=CPU
+endif
+
+ifeq ($(TARGET), CPU)
+COMPILER = $(CXX)
+TARGET_DEF = -DKOKKOS_TARGET_CPU
+else ifeq ($(TARGET), GPU)
+COMPILER = $(NVCC_WRAPPER)
+TARGET_DEF =
+endif
+
+kokkos-stream: main.o KOKKOSStream.o
+	$(CXX) $(KOKKOS_LDFLAGS) $^ $(KOKKOS_LIBS) -o $@ -DKOKKOS $(TARGET_DEF) -O3
+
+%.o:%.cpp $(KOKKOS_CPP_DEPENDS)
+	$(COMPILER) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) -c $< -DKOKKOS $(TARGET_DEF) -O3
+
+.PHONY: clean
+clean:
+	rm -f main.o KOKKOSStream.o
+
diff --git a/KokkosCPU.make b/KokkosCPU.make
deleted file mode 100644
index a15e2b9..0000000
--- a/KokkosCPU.make
+++ /dev/null
@@ -1,15 +0,0 @@
-
-default: kokkos-cpu-stream
-
-include $(KOKKOS_PATH)/Makefile.kokkos
-
-kokkos-cpu-stream: main.o KOKKOSStream.o
-	$(CXX) $(KOKKOS_LDFLAGS) $^ $(KOKKOS_LIBS) -o $@ -DKOKKOS -DKOKKOS_TARGET_CPU -O3
-
-%.o:%.cpp $(KOKKOS_CPP_DEPENDS)
-	$(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) -c $< -DKOKKOS -DKOKKOS_TARGET_CPU -O3
-
-.PHONY: clean
-clean:
-	rm -f main.o KOKKOSStream.o
-
diff --git a/KokkosGPU.make b/KokkosGPU.make
deleted file mode 100644
index 041c007..0000000
--- a/KokkosGPU.make
+++ /dev/null
@@ -1,15 +0,0 @@
-
-default: kokkos-gpu-stream
-
-include $(KOKKOS_PATH)/Makefile.kokkos
-
-kokkos-gpu-stream: main.o KOKKOSStream.o
-	$(CXX) $(KOKKOS_LDFLAGS) $^ $(KOKKOS_LIBS) -o $@ -DKOKKOS -O3
-
-%.o:%.cpp $(KOKKOS_CPP_DEPENDS)
-	$(NVCC_WRAPPER) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) -c $< -DKOKKOS -O3
-
-.PHONY: clean
-clean:
-	rm -f main.o KOKKOSStream.o
-

From 07f91e605edf5f86de31b3e4c980b7c1ea33d7d0 Mon Sep 17 00:00:00 2001
From: Tom Deakin <tom.deakin@bristol.ac.uk>
Date: Thu, 23 Feb 2017 15:52:43 +0000
Subject: [PATCH 13/76] Add Makefile for RAJA

---
 RAJA.make | 32 ++++++++++++++++++++++++++++++++
 1 file changed, 32 insertions(+)
 create mode 100644 RAJA.make

diff --git a/RAJA.make b/RAJA.make
new file mode 100644
index 0000000..72cc5a6
--- /dev/null
+++ b/RAJA.make
@@ -0,0 +1,32 @@
+
+ifndef TARGET
+$(info No target defined. Specify CPU or GPU. Defaulting to CPU)
+TARGET=CPU
+endif
+
+ifeq ($(TARGET), CPU)
+COMP=$(CXX)
+CFLAGS = -O3 -std=c++11 -DRAJA_TARGET_CPU 
+
+ifndef COMPILER
+$(error No COMPILER defined. Specify COMPILER for correct OpenMP flag.)
+endif
+ifeq ($(COMPILER), INTEL)
+COMP = icpc
+CFLAGS += -qopenmp
+else ifeq ($(COMPILER), GNU)
+COMP = g++
+CFLAGS += -fopenmp
+else ifeq ($(COMPILER), CRAY)
+COMP = CC
+CFLAGS +=
+endif
+
+else ifeq ($(TARGET), GPU)
+COMP = nvcc
+CFLAGS = --expt-extended-lambda -O3 -std=c++11 -x cu -Xcompiler -fopenmp
+endif
+
+raja-stream: main.cpp RAJAStream.cpp
+	$(COMP) $(CFLAGS) -DUSE_RAJA -I$(RAJA_PATH)/include $^ -L$(RAJA_PATH)/lib -lRAJA -o $@
+

From 1b573dbac31e162680a9d25a4755fe4bf372840b Mon Sep 17 00:00:00 2001
From: Tom Deakin <tom.deakin@bristol.ac.uk>
Date: Thu, 23 Feb 2017 15:55:45 +0000
Subject: [PATCH 14/76] Add EXTRA_FLAGS option to build line for custom extra
 flags

---
 CUDA.make   | 2 +-
 Kokkos.make | 4 ++--
 OpenCL.make | 2 +-
 RAJA.make   | 2 +-
 4 files changed, 5 insertions(+), 5 deletions(-)

diff --git a/CUDA.make b/CUDA.make
index 2434754..9a7ec88 100644
--- a/CUDA.make
+++ b/CUDA.make
@@ -1,4 +1,4 @@
 
 cuda-stream: main.cpp CUDAStream.cu
-	nvcc -std=c++11 -O3 -DCUDA $^ -o $@
+	nvcc -std=c++11 -O3 -DCUDA $^ $(EXTRA_FLAGS) -o $@
 
diff --git a/Kokkos.make b/Kokkos.make
index e30826c..78e2eaa 100644
--- a/Kokkos.make
+++ b/Kokkos.make
@@ -17,10 +17,10 @@ TARGET_DEF =
 endif
 
 kokkos-stream: main.o KOKKOSStream.o
-	$(CXX) $(KOKKOS_LDFLAGS) $^ $(KOKKOS_LIBS) -o $@ -DKOKKOS $(TARGET_DEF) -O3
+	$(CXX) $(KOKKOS_LDFLAGS) $^ $(KOKKOS_LIBS) -o $@ -DKOKKOS $(TARGET_DEF) -O3 $(EXTRA_FLAGS)
 
 %.o:%.cpp $(KOKKOS_CPP_DEPENDS)
-	$(COMPILER) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) -c $< -DKOKKOS $(TARGET_DEF) -O3
+	$(COMPILER) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) -c $< -DKOKKOS $(TARGET_DEF) -O3 $(EXTRA_FLAGS)
 
 .PHONY: clean
 clean:
diff --git a/OpenCL.make b/OpenCL.make
index 447deca..f48be82 100644
--- a/OpenCL.make
+++ b/OpenCL.make
@@ -1,4 +1,4 @@
 
 ocl-stream: main.cpp OCLStream.cpp
-	$(CXX) -O3 -std=c++11 -DOCL $^ -lOpenCL -o $@
+	$(CXX) -O3 -std=c++11 -DOCL $^ $(EXTRA_FLAGS) -lOpenCL -o $@
 
diff --git a/RAJA.make b/RAJA.make
index 72cc5a6..0f9cd61 100644
--- a/RAJA.make
+++ b/RAJA.make
@@ -28,5 +28,5 @@ CFLAGS = --expt-extended-lambda -O3 -std=c++11 -x cu -Xcompiler -fopenmp
 endif
 
 raja-stream: main.cpp RAJAStream.cpp
-	$(COMP) $(CFLAGS) -DUSE_RAJA -I$(RAJA_PATH)/include $^ -L$(RAJA_PATH)/lib -lRAJA -o $@
+	$(COMP) $(CFLAGS) -DUSE_RAJA -I$(RAJA_PATH)/include $^ $(EXTRA_FLAGS) -L$(RAJA_PATH)/lib -lRAJA -o $@
 

From d3fa52748bc3d44065a76c494ee0e1a4ceb0ec09 Mon Sep 17 00:00:00 2001
From: James Price <j.price@bristol.ac.uk>
Date: Thu, 23 Feb 2017 15:52:28 +0000
Subject: [PATCH 15/76] Add Clang support to OpenMP.make

---
 OpenMP.make | 13 ++++++++++---
 1 file changed, 10 insertions(+), 3 deletions(-)

diff --git a/OpenMP.make b/OpenMP.make
index 115c678..f111847 100644
--- a/OpenMP.make
+++ b/OpenMP.make
@@ -7,23 +7,30 @@ COMPILER_ = $(CXX)
 COMPILER_GNU = g++
 COMPILER_INTEL = icpc
 COMPILER_CRAY = CC
+COMPILER_CLANG = clang++
 CC = $(COMPILER_$(COMPILER))
 
 FLAGS_ = -O3
 FLAGS_GNU = -O3 -std=c++11
 FLAGS_INTEL = -O3 -std=c++11
 FLAGS_CRAY = -O3 -hstd=c++11
+FLAGS_CLANG = -O3 -std=c++11
 CFLAGS = $(FLAGS_$(COMPILER))
 
-OMP_ = 
+OMP_ =
 OMP_GNU   = -fopenmp
 OMP_INTEL = -qopenmp
 OMP_CRAY  =
+OMP_CLANG = -fopenmp=libomp
 OMP = $(OMP_$(COMPILER))
 
+OMP_TARGET_ =
+OMP_TARGET_CLANG = -fopenmp=libomp -fopenmp-targets=nvptx64-nvidia-cuda
+OMP_TARGET = $(OMP_TARGET_$(COMPILER))
+
 omp-stream: main.cpp OMPStream.cpp
-	$(CC) -O3 -std=c++11 -DOMP $^ $(OMP) -o $@
+	$(CC) -O3 -std=c++11 -DOMP $^ $(OMP) $(EXTRA_FLAGS) -o $@
 
 omp-target-stream: main.cpp OMPStream.cpp
-	$(CC) -O3 -std=c++11 -DOMP -DOMP_TARGET_GPU $^ $(OMP) -o $@
+	$(CC) -O3 -std=c++11 -DOMP -DOMP_TARGET_GPU $^ $(OMP_TARGET) $(EXTRA_FLAGS) -o $@
 

From 6b26e33dcdc5883295f273eb1f1c5a64fe4f0f2c Mon Sep 17 00:00:00 2001
From: James Price <j.price@bristol.ac.uk>
Date: Thu, 23 Feb 2017 15:53:12 +0000
Subject: [PATCH 16/76] Use $(CFLAGS) in OpenMP build rules

---
 OpenMP.make | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/OpenMP.make b/OpenMP.make
index f111847..9cb506a 100644
--- a/OpenMP.make
+++ b/OpenMP.make
@@ -29,8 +29,8 @@ OMP_TARGET_CLANG = -fopenmp=libomp -fopenmp-targets=nvptx64-nvidia-cuda
 OMP_TARGET = $(OMP_TARGET_$(COMPILER))
 
 omp-stream: main.cpp OMPStream.cpp
-	$(CC) -O3 -std=c++11 -DOMP $^ $(OMP) $(EXTRA_FLAGS) -o $@
+	$(CC) $(CFLAGS) -DOMP $^ $(OMP) $(EXTRA_FLAGS) -o $@
 
 omp-target-stream: main.cpp OMPStream.cpp
-	$(CC) -O3 -std=c++11 -DOMP -DOMP_TARGET_GPU $^ $(OMP_TARGET) $(EXTRA_FLAGS) -o $@
+	$(CC) $(CFLAGS) -DOMP -DOMP_TARGET_GPU $^ $(OMP_TARGET) $(EXTRA_FLAGS) -o $@
 

From d19cf1a5afb8410cd874a84fb081c3dd43994074 Mon Sep 17 00:00:00 2001
From: James Price <j.price@bristol.ac.uk>
Date: Thu, 23 Feb 2017 15:54:36 +0000
Subject: [PATCH 17/76] Rename CC and CFLAGS to CXX and CXXFLAGS in OpenMP

---
 OpenMP.make | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/OpenMP.make b/OpenMP.make
index 9cb506a..06d3d00 100644
--- a/OpenMP.make
+++ b/OpenMP.make
@@ -8,14 +8,14 @@ COMPILER_GNU = g++
 COMPILER_INTEL = icpc
 COMPILER_CRAY = CC
 COMPILER_CLANG = clang++
-CC = $(COMPILER_$(COMPILER))
+CXX = $(COMPILER_$(COMPILER))
 
 FLAGS_ = -O3
 FLAGS_GNU = -O3 -std=c++11
 FLAGS_INTEL = -O3 -std=c++11
 FLAGS_CRAY = -O3 -hstd=c++11
 FLAGS_CLANG = -O3 -std=c++11
-CFLAGS = $(FLAGS_$(COMPILER))
+CXXFLAGS = $(FLAGS_$(COMPILER))
 
 OMP_ =
 OMP_GNU   = -fopenmp
@@ -29,8 +29,8 @@ OMP_TARGET_CLANG = -fopenmp=libomp -fopenmp-targets=nvptx64-nvidia-cuda
 OMP_TARGET = $(OMP_TARGET_$(COMPILER))
 
 omp-stream: main.cpp OMPStream.cpp
-	$(CC) $(CFLAGS) -DOMP $^ $(OMP) $(EXTRA_FLAGS) -o $@
+	$(CXX) $(CXXFLAGS) -DOMP $^ $(OMP) $(EXTRA_FLAGS) -o $@
 
 omp-target-stream: main.cpp OMPStream.cpp
-	$(CC) $(CFLAGS) -DOMP -DOMP_TARGET_GPU $^ $(OMP_TARGET) $(EXTRA_FLAGS) -o $@
+	$(CXX) $(CXXFLAGS) -DOMP -DOMP_TARGET_GPU $^ $(OMP_TARGET) $(EXTRA_FLAGS) -o $@
 

From 2d21f69eb74ff4277a2b271e894ed22fd58e69cb Mon Sep 17 00:00:00 2001
From: James Price <j.price@bristol.ac.uk>
Date: Thu, 23 Feb 2017 15:56:39 +0000
Subject: [PATCH 18/76] Add OMP_TARGET_ definitions for other compilers

---
 OpenMP.make | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/OpenMP.make b/OpenMP.make
index 06d3d00..bc4e955 100644
--- a/OpenMP.make
+++ b/OpenMP.make
@@ -25,6 +25,9 @@ OMP_CLANG = -fopenmp=libomp
 OMP = $(OMP_$(COMPILER))
 
 OMP_TARGET_ =
+OMP_TARGET_GNU   = -fopenmp
+OMP_TARGET_INTEL =
+OMP_TARGET_CRAY  =
 OMP_TARGET_CLANG = -fopenmp=libomp -fopenmp-targets=nvptx64-nvidia-cuda
 OMP_TARGET = $(OMP_TARGET_$(COMPILER))
 

From f3e34a56073711c70d57e4cbdc9b8f1e9fd871ea Mon Sep 17 00:00:00 2001
From: Tom Deakin <tom.deakin@bristol.ac.uk>
Date: Thu, 23 Feb 2017 16:04:56 +0000
Subject: [PATCH 19/76] Add OpenACC Makefile

---
 OpenACC.make | 30 ++++++++++++++++++++++++++++++
 1 file changed, 30 insertions(+)
 create mode 100644 OpenACC.make

diff --git a/OpenACC.make b/OpenACC.make
new file mode 100644
index 0000000..061a9e0
--- /dev/null
+++ b/OpenACC.make
@@ -0,0 +1,30 @@
+
+ifndef COMPILER
+$(info Define a compiler to set common defaults, i.e make COMPILER=GNU)
+endif
+
+COMPILER_ = $(CXX)
+COMPILER_PGI = pgc++
+COMPILER_CRAY = CC
+CXX = $(COMPILER_$(COMPILER))
+
+FLAGS_ = -O3
+
+FLAGS_PGI = -std=c++11 -O3 -acc
+ifeq ($(COMPILER), PGI)
+ifndef TARGET
+$(info Set a TARGET to ensure PGI targets the correct offload device. i.e. TARGET=GPU or CPU)
+endif
+endif
+ifeq ($(TARGET), GPU)
+FLAGS_PGI += -ta=nvidia
+else ifeq ($(TARGET), CPU)
+FLAGS_PGI += -ta=multicore
+endif
+
+FLAGS_CRAY = -hstd=c++11
+CFLAGS = $(FLAGS_$(COMPILER))
+
+acc-stream: main.cpp ACCStream.cpp
+	$(CXX) $(CFLAGS) -DACC $^ $(EXTRA_FLAGS) -o $@
+

From 488db0eae613d67ee3c9e514c0bded587d5729a5 Mon Sep 17 00:00:00 2001
From: Tom Deakin <tom.deakin@bristol.ac.uk>
Date: Thu, 23 Feb 2017 16:19:04 +0000
Subject: [PATCH 20/76] Remove CMake module

---
 cmake/Modules/FindComputeCpp.cmake | 248 -----------------------------
 1 file changed, 248 deletions(-)
 delete mode 100644 cmake/Modules/FindComputeCpp.cmake

diff --git a/cmake/Modules/FindComputeCpp.cmake b/cmake/Modules/FindComputeCpp.cmake
deleted file mode 100644
index 5e08902..0000000
--- a/cmake/Modules/FindComputeCpp.cmake
+++ /dev/null
@@ -1,248 +0,0 @@
-#.rst:
-# FindComputeCpp
-#---------------
-#
-#   Copyright 2016 Codeplay Software Ltd.
-#
-#   Licensed under the Apache License, Version 2.0 (the "License");
-#   you may not use these files except in compliance with the License.
-#   You may obtain a copy of the License at
-#
-#       http://www.apache.org/licenses/LICENSE-2.0
-#
-#
-#   Unless required by applicable law or agreed to in writing, software
-#   distributed under the License is distributed on an "AS IS" BASIS,
-#   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-#   See the License for the specific language governing permissions and
-#   limitations under the License.
-
-#########################
-#  FindComputeCpp.cmake  
-#########################
-#
-#  Tools for finding and building with ComputeCpp.
-#
-#  User must define COMPUTECPP_PACKAGE_ROOT_DIR pointing to the ComputeCpp 
-#   installation.
-#  
-#  Latest version of this file can be found at:
-#    https://github.com/codeplaysoftware/computecpp-sdk
-
-# Require CMake version 3.2.2 or higher
-cmake_minimum_required(VERSION 3.2.2)
-
-# Check that a supported host compiler can be found
-if(CMAKE_COMPILER_IS_GNUCXX)
-    # Require at least gcc 4.8
-    if (CMAKE_CXX_COMPILER_VERSION VERSION_LESS 4.8)
-      message(FATAL_ERROR
-        "host compiler - Not found! (gcc version must be at least 4.8)")
-    # Require the GCC dual ABI to be disabled for 5.1 or higher
-    elseif (CMAKE_CXX_COMPILER_VERSION VERSION_GREATER 5.1)
-      set(COMPUTECPP_DISABLE_GCC_DUAL_ABI "True")
-      message(STATUS
-        "host compiler - gcc ${CMAKE_CXX_COMPILER_VERSION} (note pre 5.1 gcc ABI enabled)")
-    else()
-      message(STATUS "host compiler - gcc ${CMAKE_CXX_COMPILER_VERSION}")
-    endif()
-elseif ("${CMAKE_CXX_COMPILER_ID}" STREQUAL "Clang")
-    # Require at least clang 3.6
-    if (${CMAKE_CXX_COMPILER_VERSION} VERSION_LESS 3.6)
-      message(FATAL_ERROR
-        "host compiler - Not found! (clang version must be at least 3.6)")
-    else()
-      message(STATUS "host compiler - clang ${CMAKE_CXX_COMPILER_VERSION}")
-    endif()
-else()
-  message(WARNING
-    "host compiler - Not found! (ComputeCpp supports GCC and Clang, see readme)")
-endif()
-
-set(COMPUTECPP_64_BIT_DEFAULT ON)
-option(COMPUTECPP_64_BIT_CODE "Compile device code in 64 bit mode"
-        ${COMPUTECPP_64_BIT_DEFAULT})
-mark_as_advanced(COMPUTECPP_64_BIT_CODE)
-
-# Find OpenCL package
-find_package(OpenCL REQUIRED)
-
-# Find ComputeCpp packagee
-if(NOT COMPUTECPP_PACKAGE_ROOT_DIR)
-  message(FATAL_ERROR
-    "ComputeCpp package - Not found! (please set COMPUTECPP_PACKAGE_ROOT_DIR")
-else()
-  message(STATUS "ComputeCpp package - Found")
-endif()
-option(COMPUTECPP_PACKAGE_ROOT_DIR "Path to the ComputeCpp Package")
-
-# Obtain the path to compute++
-find_program(COMPUTECPP_DEVICE_COMPILER compute++ PATHS
-  ${COMPUTECPP_PACKAGE_ROOT_DIR} PATH_SUFFIXES bin)
-if (EXISTS ${COMPUTECPP_DEVICE_COMPILER})
-  mark_as_advanced(COMPUTECPP_DEVICE_COMPILER)
-  message(STATUS "compute++ - Found")
-else()
-  message(FATAL_ERROR "compute++ - Not found! (${COMPUTECPP_DEVICE_COMPILER})")
-endif()
-
-# Obtain the path to computecpp_info
-find_program(COMPUTECPP_INFO_TOOL computecpp_info PATHS
-  ${COMPUTECPP_PACKAGE_ROOT_DIR} PATH_SUFFIXES bin)
-if (EXISTS ${COMPUTECPP_INFO_TOOL})
-  mark_as_advanced(${COMPUTECPP_INFO_TOOL})
-  message(STATUS "computecpp_info - Found")
-else()
-  message(FATAL_ERROR "computecpp_info - Not found! (${COMPUTECPP_INFO_TOOL})")
-endif()
-
-# Obtain the path to the ComputeCpp runtime library
-find_library(COMPUTECPP_RUNTIME_LIBRARY ComputeCpp PATHS ${COMPUTECPP_PACKAGE_ROOT_DIR}
-  HINTS ${COMPUTECPP_PACKAGE_ROOT_DIR}/lib PATH_SUFFIXES lib
-  DOC "ComputeCpp Runtime Library" NO_DEFAULT_PATH)
-
-if (EXISTS ${COMPUTECPP_RUNTIME_LIBRARY})
-  mark_as_advanced(COMPUTECPP_RUNTIME_LIBRARY)
-  message(STATUS "libComputeCpp.so - Found")
-else()
-  message(FATAL_ERROR "libComputeCpp.so - Not found!")
-endif()
-
-# Obtain the ComputeCpp include directory
-set(COMPUTECPP_INCLUDE_DIRECTORY ${COMPUTECPP_PACKAGE_ROOT_DIR}/include/)
-if (NOT EXISTS ${COMPUTECPP_INCLUDE_DIRECTORY})
-  message(FATAL_ERROR "ComputeCpp includes - Not found!")
-else()
-  message(STATUS "ComputeCpp includes - Found")
-endif()
-
-# Obtain the package version
-execute_process(COMMAND ${COMPUTECPP_INFO_TOOL} "--dump-version"
-  OUTPUT_VARIABLE COMPUTECPP_PACKAGE_VERSION
-  RESULT_VARIABLE COMPUTECPP_INFO_TOOL_RESULT OUTPUT_STRIP_TRAILING_WHITESPACE)
-if(NOT COMPUTECPP_INFO_TOOL_RESULT EQUAL "0")
-  message(FATAL_ERROR "Package version - Error obtaining version!")
-else()
-  mark_as_advanced(COMPUTECPP_PACKAGE_VERSION)
-  message(STATUS "Package version - ${COMPUTECPP_PACKAGE_VERSION}")
-endif()
-
-# Obtain the device compiler flags
-execute_process(COMMAND ${COMPUTECPP_INFO_TOOL} "--dump-device-compiler-flags"
-  OUTPUT_VARIABLE COMPUTECPP_DEVICE_COMPILER_FLAGS
-  RESULT_VARIABLE COMPUTECPP_INFO_TOOL_RESULT OUTPUT_STRIP_TRAILING_WHITESPACE)
-if(NOT COMPUTECPP_INFO_TOOL_RESULT EQUAL "0")
-  message(FATAL_ERROR "compute++ flags - Error obtaining compute++ flags!")
-else()
-  mark_as_advanced(COMPUTECPP_COMPILER_FLAGS)
-  message(STATUS "compute++ flags - ${COMPUTECPP_DEVICE_COMPILER_FLAGS}")
-endif()
-
-# Check if the platform is supported
-execute_process(COMMAND ${COMPUTECPP_INFO_TOOL} "--dump-is-supported"
-  OUTPUT_VARIABLE COMPUTECPP_PLATFORM_IS_SUPPORTED
-  RESULT_VARIABLE COMPUTECPP_INFO_TOOL_RESULT OUTPUT_STRIP_TRAILING_WHITESPACE)
-if(NOT COMPUTECPP_INFO_TOOL_RESULT EQUAL "0")
-  message(FATAL_ERROR "platform - Error checking platform support!")
-else()
-  mark_as_advanced(COMPUTECPP_PLATFORM_IS_SUPPORTED)
-  if (COMPUTECPP_PLATFORM_IS_SUPPORTED)
-    message(STATUS "platform - your system can support ComputeCpp")
-  else()
-    message(STATUS "platform - your system CANNOT support ComputeCpp")
-  endif()
-endif()
-
-####################
-#   __build_sycl   
-####################
-#
-#  Adds a custom target for running compute++ and adding a dependency for the
-#  resulting integration header.
-#
-#  targetName : Name of the target.
-#  sourceFile : Source file to be compiled.
-#  binaryDir : Intermediate directory to output the integration header.
-#
-function(__build_spir targetName sourceFile binaryDir)
-  
-  # Retrieve source file name.
-  get_filename_component(sourceFileName ${sourceFile} NAME)
-
-  # Set the path to the Sycl file.
-  set(outputSyclFile ${binaryDir}/${sourceFileName}.sycl)
-
-  # Add any user-defined include to the device compiler
-  get_property(includeDirectories DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR} PROPERTY
-    INCLUDE_DIRECTORIES)
-  set(device_compiler_includes "")
-  foreach(directory ${includeDirectories})
-    set(device_compiler_includes "-I${directory}" ${device_compiler_includes})
-  endforeach()
-  if (CMAKE_INCLUDE_PATH)
-    foreach(directory ${CMAKE_INCLUDE_PATH})
-      set(device_compiler_includes "-I${directory}"
-        ${device_compiler_includes})
-    endforeach()
-  endif()
-
-  # Convert argument list format
-  separate_arguments(COMPUTECPP_DEVICE_COMPILER_FLAGS)
-
-  # Add custom command for running compute++
-  add_custom_command(
-    OUTPUT ${outputSyclFile}
-    COMMAND ${COMPUTECPP_DEVICE_COMPILER}
-            ${COMPUTECPP_DEVICE_COMPILER_FLAGS}
-            -isystem ${COMPUTECPP_INCLUDE_DIRECTORY}
-            ${COMPUTECPP_PLATFORM_SPECIFIC_ARGS}
-            ${device_compiler_includes}
-            -o ${outputSyclFile}
-            -c ${sourceFile}
-    DEPENDS ${sourceFile}
-    WORKING_DIRECTORY ${binaryDir}
-  COMMENT "Building ComputeCpp integration header file ${outputSyclFile}")
-
-  # Add a custom target for the generated integration header
-  add_custom_target(${targetName}_integration_header DEPENDS ${outputSyclFile})
-
-  # Add a dependency on the integration header
-  add_dependencies(${targetName} ${targetName}_integration_header)
-
-  # Force inclusion of the integration header for the host compiler
-  set(compileFlags -include ${outputSyclFile} "-Wall")
-  target_compile_options(${targetName} PUBLIC ${compileFlags})
-  
-  # Set the host compiler C++ standard to C++11
-  set_property(TARGET ${targetName} PROPERTY CXX_STANDARD 11)
-
-  # Disable GCC dual ABI on GCC 5.1 and higher
-  if(COMPUTECPP_DISABLE_GCC_DUAL_ABI)
-    set_property(TARGET ${targetName} APPEND PROPERTY COMPILE_DEFINITIONS
-      "_GLIBCXX_USE_CXX11_ABI=0")
-  endif()
-
-endfunction()
-
-#######################
-#  add_sycl_to_target
-#######################
-#
-#  Adds a SYCL compilation custom command associated with an existing
-#  target and sets a dependancy on that new command.
-#
-#  targetName : Name of the target to add a SYCL to.
-#  sourceFile : Source file to be compiled for SYCL.
-#  binaryDir : Intermediate directory to output the integration header.
-#
-function(add_sycl_to_target targetName sourceFile binaryDir)
-
-  # Add custom target to run compute++ and generate the integration header
-  __build_spir(${targetName} ${sourceFile} ${binaryDir})
-
-  # Link with the ComputeCpp runtime library
-  target_link_libraries(${targetName} PUBLIC ${COMPUTECPP_RUNTIME_LIBRARY}
-                        PUBLIC ${OpenCL_LIBRARIES})
-
-endfunction(add_sycl_to_target)
-

From e926cfd3bb107b83b4b6d271bda9c0c96eeb39e8 Mon Sep 17 00:00:00 2001
From: Tom Deakin <tom.deakin@bristol.ac.uk>
Date: Thu, 23 Feb 2017 16:20:55 +0000
Subject: [PATCH 21/76] Remove CMake from gitignore

---
 .gitignore | 3 ---
 1 file changed, 3 deletions(-)

diff --git a/.gitignore b/.gitignore
index 6ef20a5..b73a688 100644
--- a/.gitignore
+++ b/.gitignore
@@ -12,7 +12,4 @@ gpu-stream-sycl
 
 .DS_Store
 
-CMakeCache.txt
-CMakeFiles/
-cmake_install.cmake
 Makefile

From 472434d2fce01ffbffeb08438a2dc6c438519750 Mon Sep 17 00:00:00 2001
From: Tom Deakin <tom.deakin@bristol.ac.uk>
Date: Thu, 23 Feb 2017 16:21:50 +0000
Subject: [PATCH 22/76] Update gitignore binary names

---
 .gitignore | 13 +++++++------
 1 file changed, 7 insertions(+), 6 deletions(-)

diff --git a/.gitignore b/.gitignore
index b73a688..b239bb2 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,10 +1,11 @@
 
-gpu-stream-cuda
-gpu-stream-ocl
-gpu-stream-acc
-gpu-stream-omp3
-gpu-stream-omp45
-gpu-stream-sycl
+cuda-stream
+ocl-stream
+omp-stream
+acc-stream
+raja-stream
+kokkos-stream
+sycl-stream
 
 *.o
 *.tar

From 0cbf8aa649bf9601a58b2054b93b0a410d79c042 Mon Sep 17 00:00:00 2001
From: Tom Deakin <tom.deakin@bristol.ac.uk>
Date: Thu, 23 Feb 2017 16:32:17 +0000
Subject: [PATCH 23/76] Add COMPILER option to OpenCL Makefile

---
 OpenCL.make | 15 ++++++++++++++-
 1 file changed, 14 insertions(+), 1 deletion(-)

diff --git a/OpenCL.make b/OpenCL.make
index f48be82..4c73b16 100644
--- a/OpenCL.make
+++ b/OpenCL.make
@@ -1,4 +1,17 @@
 
+ifndef COMPILER
+$(info Define a compiler to set common defaults, i.e make COMPILER=GNU)
+endif
+
+COMPILER_ = $(CXX)
+COMPILER_GNU = g++
+COMPILER_CRAY = CC
+
+FLAGS_ = -O3
+FLAGS_GNU = -O3 -std=c++11
+FLAGS_CRAY = -O3 -hstd=c++11
+CFLAGS=$(FLAGS_$(COMPILER))
+
 ocl-stream: main.cpp OCLStream.cpp
-	$(CXX) -O3 -std=c++11 -DOCL $^ $(EXTRA_FLAGS) -lOpenCL -o $@
+	$(COMPILER_$(COMPILER)) $(CFLAGS) -DOCL $^ $(EXTRA_FLAGS) -lOpenCL -o $@
 

From 1c31eedd8b29f87c968d407dc918fa4315147bf7 Mon Sep 17 00:00:00 2001
From: Tom Deakin <tom.deakin@bristol.ac.uk>
Date: Thu, 23 Feb 2017 16:41:44 +0000
Subject: [PATCH 24/76] Update README

---
 README.md | 12 +++++++++---
 1 file changed, 9 insertions(+), 3 deletions(-)

diff --git a/README.md b/README.md
index 0d181d5..c55bcd3 100644
--- a/README.md
+++ b/README.md
@@ -23,10 +23,16 @@ Website
 Usage
 -----
 
-CMake 3.2 or above is required.
-Drivers, compiler and software applicable to whichever implementation you would like to build against. Our build system is designed to only build implementations in programming models that your system supports.
+Drivers, compiler and software applicable to whichever implementation you would like to build against is required.
 
-Generate the Makefile with `cmake .`
+We have supplied a series of Makefiles, one for each programming model, to assist with building.
+The Makefiles contain common build options, and should be simple to customise for your needs too.
+
+General usage is `make -f <Model>.make
+Common compiler flags and names can be set by passing a `COMPILER` option to Make, e.g. `make COMPILER=GNU`.
+Some models allow specifying a CPU or GPU style target, and this can be set by passing a `TARGET` option to Make, e.g. `make TARGET=GPU`.
+
+Pass in extra flags via the `EXTRA_FLAGS` option.
 
 Android (outdated instructions)
 ------------------

From 67411686543e24f2b5d36ab9a4071c09733e47f6 Mon Sep 17 00:00:00 2001
From: Tom Deakin <tom.deakin@bristol.ac.uk>
Date: Thu, 23 Feb 2017 16:42:51 +0000
Subject: [PATCH 25/76] Fix README typo

---
 README.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/README.md b/README.md
index c55bcd3..c11f694 100644
--- a/README.md
+++ b/README.md
@@ -28,7 +28,7 @@ Drivers, compiler and software applicable to whichever implementation you would
 We have supplied a series of Makefiles, one for each programming model, to assist with building.
 The Makefiles contain common build options, and should be simple to customise for your needs too.
 
-General usage is `make -f <Model>.make
+General usage is `make -f <Model>.make`
 Common compiler flags and names can be set by passing a `COMPILER` option to Make, e.g. `make COMPILER=GNU`.
 Some models allow specifying a CPU or GPU style target, and this can be set by passing a `TARGET` option to Make, e.g. `make TARGET=GPU`.
 

From 2989419cb57881186cbfb448bf1cf4eff495da20 Mon Sep 17 00:00:00 2001
From: Tom Deakin <tom.deakin@bristol.ac.uk>
Date: Thu, 23 Feb 2017 16:43:56 +0000
Subject: [PATCH 26/76] Update README

---
 README.md | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/README.md b/README.md
index c11f694..552051e 100644
--- a/README.md
+++ b/README.md
@@ -34,6 +34,9 @@ Some models allow specifying a CPU or GPU style target, and this can be set by p
 
 Pass in extra flags via the `EXTRA_FLAGS` option.
 
+The binaries are named in the form `<model>-stream`.
+
+
 Android (outdated instructions)
 ------------------
 

From e6ac8076cf2be6fcf3bd28bce59a48f3d7913096 Mon Sep 17 00:00:00 2001
From: Tom Deakin <tom.deakin@bristol.ac.uk>
Date: Thu, 23 Feb 2017 16:45:55 +0000
Subject: [PATCH 27/76] Move android instructions to seperate file

---
 README.android | 36 ++++++++++++++++++++++++++++++++++++
 README.md      | 36 ------------------------------------
 2 files changed, 36 insertions(+), 36 deletions(-)
 create mode 100644 README.android

diff --git a/README.android b/README.android
new file mode 100644
index 0000000..edc4a52
--- /dev/null
+++ b/README.android
@@ -0,0 +1,36 @@
+Android (outdated instructions)
+------------------
+
+Assuming you have a recent Android NDK available, you can use the
+toolchain that it provides to build GPU-STREAM. You should first
+use the NDK to generate a standalone toolchain:
+
+    # Select a directory to install the toolchain to
+    ANDROID_NATIVE_TOOLCHAIN=/path/to/toolchain
+
+    ${NDK}/build/tools/make-standalone-toolchain.sh \
+      --platform=android-14 \
+      --toolchain=arm-linux-androideabi-4.8 \
+      --install-dir=${ANDROID_NATIVE_TOOLCHAIN}
+
+Make sure that the OpenCL headers and library (libOpenCL.so) are
+available in `${ANDROID_NATIVE_TOOLCHAIN}/sysroot/usr/`.
+
+You should then be able to build GPU-STREAM:
+
+    make CXX=${ANDROID_NATIVE_TOOLCHAIN}/bin/arm-linux-androideabi-g++
+
+Copy the executable and OpenCL kernels to the device:
+
+    adb push gpu-stream-ocl /data/local/tmp
+    adb push ocl-stream-kernels.cl /data/local/tmp
+
+Run GPU-STREAM from an adb shell:
+
+    adb shell
+    cd /data/local/tmp
+
+    # Use float if device doesn't support double, and reduce array size
+    ./gpu-stream-ocl --float -n 6 -s 10000000
+
+
diff --git a/README.md b/README.md
index 0d181d5..df0f23c 100644
--- a/README.md
+++ b/README.md
@@ -28,42 +28,6 @@ Drivers, compiler and software applicable to whichever implementation you would
 
 Generate the Makefile with `cmake .`
 
-Android (outdated instructions)
-------------------
-
-Assuming you have a recent Android NDK available, you can use the
-toolchain that it provides to build GPU-STREAM. You should first
-use the NDK to generate a standalone toolchain:
-
-    # Select a directory to install the toolchain to
-    ANDROID_NATIVE_TOOLCHAIN=/path/to/toolchain
-
-    ${NDK}/build/tools/make-standalone-toolchain.sh \
-      --platform=android-14 \
-      --toolchain=arm-linux-androideabi-4.8 \
-      --install-dir=${ANDROID_NATIVE_TOOLCHAIN}
-
-Make sure that the OpenCL headers and library (libOpenCL.so) are
-available in `${ANDROID_NATIVE_TOOLCHAIN}/sysroot/usr/`.
-
-You should then be able to build GPU-STREAM:
-
-    make CXX=${ANDROID_NATIVE_TOOLCHAIN}/bin/arm-linux-androideabi-g++
-
-Copy the executable and OpenCL kernels to the device:
-
-    adb push gpu-stream-ocl /data/local/tmp
-    adb push ocl-stream-kernels.cl /data/local/tmp
-
-Run GPU-STREAM from an adb shell:
-
-    adb shell
-    cd /data/local/tmp
-
-    # Use float if device doesn't support double, and reduce array size
-    ./gpu-stream-ocl --float -n 6 -s 10000000
-
-
 Results
 -------
 

From f3abd6657872e28e949e3c6b57d8409d210dbaf6 Mon Sep 17 00:00:00 2001
From: James Price <j.price@bristol.ac.uk>
Date: Thu, 23 Feb 2017 17:24:14 +0000
Subject: [PATCH 28/76] Make Kokkos build without intermediate objects

---
 Kokkos.make | 12 ++++--------
 1 file changed, 4 insertions(+), 8 deletions(-)

diff --git a/Kokkos.make b/Kokkos.make
index 78e2eaa..f0b4528 100644
--- a/Kokkos.make
+++ b/Kokkos.make
@@ -9,20 +9,16 @@ TARGET=CPU
 endif
 
 ifeq ($(TARGET), CPU)
-COMPILER = $(CXX)
 TARGET_DEF = -DKOKKOS_TARGET_CPU
 else ifeq ($(TARGET), GPU)
-COMPILER = $(NVCC_WRAPPER)
+CXX = $(NVCC_WRAPPER)
 TARGET_DEF =
 endif
 
-kokkos-stream: main.o KOKKOSStream.o
-	$(CXX) $(KOKKOS_LDFLAGS) $^ $(KOKKOS_LIBS) -o $@ -DKOKKOS $(TARGET_DEF) -O3 $(EXTRA_FLAGS)
-
-%.o:%.cpp $(KOKKOS_CPP_DEPENDS)
-	$(COMPILER) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) -c $< -DKOKKOS $(TARGET_DEF) -O3 $(EXTRA_FLAGS)
+kokkos-stream: main.cpp KOKKOSStream.cpp $(KOKKOS_CPP_DEPENDS)
+	$(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(KOKKOS_LDFLAGS) main.cpp KOKKOSStream.cpp $(KOKKOS_LIBS) -o $@ -DKOKKOS $(TARGET_DEF) -O3 $(EXTRA_FLAGS)
 
 .PHONY: clean
 clean:
-	rm -f main.o KOKKOSStream.o
+	rm -f kokkos-stream
 

From cdea91abc66436f1ab317f3414177b577612371e Mon Sep 17 00:00:00 2001
From: James Price <j.price@bristol.ac.uk>
Date: Thu, 23 Feb 2017 17:49:46 +0000
Subject: [PATCH 29/76] Add required ARCH flag for RAJA on GPUs

---
 RAJA.make | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/RAJA.make b/RAJA.make
index 0f9cd61..5851e4f 100644
--- a/RAJA.make
+++ b/RAJA.make
@@ -24,7 +24,11 @@ endif
 
 else ifeq ($(TARGET), GPU)
 COMP = nvcc
-CFLAGS = --expt-extended-lambda -O3 -std=c++11 -x cu -Xcompiler -fopenmp
+
+ifndef ARCH
+$(error No ARCH defined. Specify target GPU architecture (e.g. ARCH=sm_35))
+endif
+CFLAGS = --expt-extended-lambda -O3 -std=c++11 -x cu -Xcompiler -fopenmp -arch $(ARCH)
 endif
 
 raja-stream: main.cpp RAJAStream.cpp

From 063446b6a79d4527f5dcbb9a114966171b0ae9d6 Mon Sep 17 00:00:00 2001
From: James Price <j.price@bristol.ac.uk>
Date: Thu, 23 Feb 2017 18:15:27 +0000
Subject: [PATCH 30/76] Add EXTRA_FLAGS to SYCL.make

---
 SYCL.make | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/SYCL.make b/SYCL.make
index 5dde5c8..02f9442 100644
--- a/SYCL.make
+++ b/SYCL.make
@@ -1,6 +1,6 @@
 
 sycl-stream: main.cpp SYCLStream.cpp SYCLStream.sycl
-	$(CXX) -O3 -std=c++11 -DSYCL main.cpp SYCLStream.cpp -include SYCLStream.sycl -lComputeCpp -lOpenCL -o $@
+	$(CXX) -O3 -std=c++11 -DSYCL main.cpp SYCLStream.cpp -include SYCLStream.sycl $(EXTRA_FLAGS) -lComputeCpp -lOpenCL -o $@
 
 
 SYCLStream.sycl: SYCLStream.cpp

From d78baa4c2bb580979139285ec18bca8b80dfffd1 Mon Sep 17 00:00:00 2001
From: James Price <j.price@bristol.ac.uk>
Date: Thu, 23 Feb 2017 18:39:40 +0000
Subject: [PATCH 31/76] Use CXXFLAGS everywhere for consistency

---
 OpenACC.make |  4 ++--
 OpenCL.make  |  4 ++--
 RAJA.make    | 13 ++++++-------
 3 files changed, 10 insertions(+), 11 deletions(-)

diff --git a/OpenACC.make b/OpenACC.make
index 061a9e0..161c20e 100644
--- a/OpenACC.make
+++ b/OpenACC.make
@@ -23,8 +23,8 @@ FLAGS_PGI += -ta=multicore
 endif
 
 FLAGS_CRAY = -hstd=c++11
-CFLAGS = $(FLAGS_$(COMPILER))
+CXXFLAGS = $(FLAGS_$(COMPILER))
 
 acc-stream: main.cpp ACCStream.cpp
-	$(CXX) $(CFLAGS) -DACC $^ $(EXTRA_FLAGS) -o $@
+	$(CXX) $(CXXFLAGS) -DACC $^ $(EXTRA_FLAGS) -o $@
 
diff --git a/OpenCL.make b/OpenCL.make
index 4c73b16..ce7db6e 100644
--- a/OpenCL.make
+++ b/OpenCL.make
@@ -10,8 +10,8 @@ COMPILER_CRAY = CC
 FLAGS_ = -O3
 FLAGS_GNU = -O3 -std=c++11
 FLAGS_CRAY = -O3 -hstd=c++11
-CFLAGS=$(FLAGS_$(COMPILER))
+CXXFLAGS=$(FLAGS_$(COMPILER))
 
 ocl-stream: main.cpp OCLStream.cpp
-	$(COMPILER_$(COMPILER)) $(CFLAGS) -DOCL $^ $(EXTRA_FLAGS) -lOpenCL -o $@
+	$(COMPILER_$(COMPILER)) $(CXXFLAGS) -DOCL $^ $(EXTRA_FLAGS) -lOpenCL -o $@
 
diff --git a/RAJA.make b/RAJA.make
index 5851e4f..3f456cb 100644
--- a/RAJA.make
+++ b/RAJA.make
@@ -6,20 +6,20 @@ endif
 
 ifeq ($(TARGET), CPU)
 COMP=$(CXX)
-CFLAGS = -O3 -std=c++11 -DRAJA_TARGET_CPU 
+CXXFLAGS = -O3 -std=c++11 -DRAJA_TARGET_CPU
 
 ifndef COMPILER
 $(error No COMPILER defined. Specify COMPILER for correct OpenMP flag.)
 endif
 ifeq ($(COMPILER), INTEL)
 COMP = icpc
-CFLAGS += -qopenmp
+CXXFLAGS += -qopenmp
 else ifeq ($(COMPILER), GNU)
 COMP = g++
-CFLAGS += -fopenmp
+CXXFLAGS += -fopenmp
 else ifeq ($(COMPILER), CRAY)
 COMP = CC
-CFLAGS +=
+CXXFLAGS +=
 endif
 
 else ifeq ($(TARGET), GPU)
@@ -28,9 +28,8 @@ COMP = nvcc
 ifndef ARCH
 $(error No ARCH defined. Specify target GPU architecture (e.g. ARCH=sm_35))
 endif
-CFLAGS = --expt-extended-lambda -O3 -std=c++11 -x cu -Xcompiler -fopenmp -arch $(ARCH)
+CXXFLAGS = --expt-extended-lambda -O3 -std=c++11 -x cu -Xcompiler -fopenmp -arch $(ARCH)
 endif
 
 raja-stream: main.cpp RAJAStream.cpp
-	$(COMP) $(CFLAGS) -DUSE_RAJA -I$(RAJA_PATH)/include $^ $(EXTRA_FLAGS) -L$(RAJA_PATH)/lib -lRAJA -o $@
-
+	$(COMP) $(CXXFLAGS) -DUSE_RAJA -I$(RAJA_PATH)/include $^ $(EXTRA_FLAGS) -L$(RAJA_PATH)/lib -lRAJA -o $@

From ce4f49e08b37e71fe56b5d91d3f439e7afdeb6d9 Mon Sep 17 00:00:00 2001
From: James Price <j.price@bristol.ac.uk>
Date: Thu, 23 Feb 2017 19:08:25 +0000
Subject: [PATCH 32/76] Add dot kernel to HIP implementation

---
 HIPStream.cu | 49 +++++++++++++++++++++++++++++++++++++++++++++++++
 HIPStream.h  |  6 ++++++
 2 files changed, 55 insertions(+)

diff --git a/HIPStream.cu b/HIPStream.cu
index 8c02348..d14fe84 100644
--- a/HIPStream.cu
+++ b/HIPStream.cu
@@ -9,6 +9,7 @@
 #include "hip/hip_runtime.h"
 
 #define TBSIZE 1024
+#define DOT_NUM_BLOCKS 256
 
 void check_error(void)
 {
@@ -47,6 +48,9 @@ HIPStream<T>::HIPStream(const unsigned int ARRAY_SIZE, const int device_index)
 
   array_size = ARRAY_SIZE;
 
+  // Allocate the host array for partial sums for dot kernels
+  sums = (T*)malloc(sizeof(T) * DOT_NUM_BLOCKS);
+
   // Check buffers fit on the device
   hipDeviceProp_t props;
   hipGetDeviceProperties(&props, 0);
@@ -60,6 +64,8 @@ HIPStream<T>::HIPStream(const unsigned int ARRAY_SIZE, const int device_index)
   check_error();
   hipMalloc(&d_c, ARRAY_SIZE*sizeof(T));
   check_error();
+  hipMalloc(&d_sum, DOT_NUM_BLOCKS*sizeof(T));
+  check_error();
 }
 
 
@@ -172,6 +178,49 @@ void HIPStream<T>::triad()
 }
 
 
+template <class T>
+__global__ void dot_kernel(hipLaunchParm lp, const T * a, const T * b, T * sum, unsigned int array_size)
+{
+
+  extern __shared__ __align__(sizeof(T)) unsigned char smem[];
+  T *tb_sum = reinterpret_cast<T*>(smem);
+
+  int i = blockDim.x * blockIdx.x + threadIdx.x;
+  const size_t local_i = threadIdx.x;
+
+  tb_sum[local_i] = 0.0;
+  for (; i < array_size; i += blockDim.x*gridDim.x)
+    tb_sum[local_i] += a[i] * b[i];
+
+  for (int offset = blockDim.x / 2; offset > 0; offset /= 2)
+  {
+    __syncthreads();
+    if (local_i < offset)
+    {
+      tb_sum[local_i] += tb_sum[local_i+offset];
+    }
+  }
+
+  if (local_i == 0)
+    sum[blockIdx.x] = tb_sum[local_i];
+}
+
+template <class T>
+T HIPStream<T>::dot()
+{
+  hipLaunchKernel(HIP_KERNEL_NAME(dot_kernel), dim3(DOT_NUM_BLOCKS), dim3(TBSIZE), sizeof(T)*TBSIZE, 0, d_a, d_b, d_sum, array_size);
+  check_error();
+
+  hipMemcpy(sums, d_sum, DOT_NUM_BLOCKS*sizeof(T), hipMemcpyDeviceToHost);
+  check_error();
+
+  T sum = 0.0;
+  for (int i = 0; i < DOT_NUM_BLOCKS; i++)
+    sum += sums[i];
+
+  return sum;
+}
+
 void listDevices(void)
 {
   // Get number of devices
diff --git a/HIPStream.h b/HIPStream.h
index 392080a..9209388 100644
--- a/HIPStream.h
+++ b/HIPStream.h
@@ -21,10 +21,15 @@ class HIPStream : public Stream<T>
   protected:
     // Size of arrays
     unsigned int array_size;
+
+    // Host array for partial sums for dot kernel
+    T *sums;
+
     // Device side pointers to arrays
     T *d_a;
     T *d_b;
     T *d_c;
+    T *d_sum;
 
 
   public:
@@ -36,6 +41,7 @@ class HIPStream : public Stream<T>
     virtual void add() override;
     virtual void mul() override;
     virtual void triad() override;
+    virtual T dot() override;
 
     virtual void init_arrays(T initA, T initB, T initC) override;
     virtual void read_arrays(std::vector<T>& a, std::vector<T>& b, std::vector<T>& c) override;

From 13f5c837f690ed684d5bfce5bac536388b9c22ac Mon Sep 17 00:00:00 2001
From: James Price <j.price@bristol.ac.uk>
Date: Thu, 23 Feb 2017 19:19:47 +0000
Subject: [PATCH 33/76] Add HIP.make with CUDA support

Still needs additional work for HIP via HCC.
---
 .gitignore |  1 +
 HIP.make   | 13 +++++++++++++
 2 files changed, 14 insertions(+)
 create mode 100644 HIP.make

diff --git a/.gitignore b/.gitignore
index b239bb2..5dad665 100644
--- a/.gitignore
+++ b/.gitignore
@@ -6,6 +6,7 @@ acc-stream
 raja-stream
 kokkos-stream
 sycl-stream
+hip-stream
 
 *.o
 *.tar
diff --git a/HIP.make b/HIP.make
new file mode 100644
index 0000000..25460fe
--- /dev/null
+++ b/HIP.make
@@ -0,0 +1,13 @@
+
+# TODO: HIP with HCC
+
+HIPCC = hipcc
+
+ifndef CUDA_PATH
+ifeq (,$(wildcard /usr/local/bin/nvcc))
+$(error /usr/local/bin/nvcc not found, set CUDA_PATH instead)
+endif
+endif
+
+hip-stream: main.cpp HIPStream.cu
+	$(HIPCC) $(CXXFLAGS) -std=c++11 -DHIP $^ $(EXTRA_FLAGS) -o $@

From b62c93527da6c436e79dc4c76d05b7528e729107 Mon Sep 17 00:00:00 2001
From: James Price <j.price@bristol.ac.uk>
Date: Thu, 23 Feb 2017 19:26:55 +0000
Subject: [PATCH 34/76] Use computecpp_info for SYCL device compiler flags

---
 SYCL.make | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/SYCL.make b/SYCL.make
index 02f9442..1e1c480 100644
--- a/SYCL.make
+++ b/SYCL.make
@@ -1,11 +1,12 @@
 
+COMPUTECPP_FLAGS = $(shell computecpp_info --dump-device-compiler-flags)
+
 sycl-stream: main.cpp SYCLStream.cpp SYCLStream.sycl
 	$(CXX) -O3 -std=c++11 -DSYCL main.cpp SYCLStream.cpp -include SYCLStream.sycl $(EXTRA_FLAGS) -lComputeCpp -lOpenCL -o $@
 
-
 SYCLStream.sycl: SYCLStream.cpp
-	compute++ SYCLStream.cpp -sycl -no-serial-memop -O2 -emit-llvm -c
+	compute++ SYCLStream.cpp $(COMPUTECPP_FLAGS) -c
 
 .PHONY: clean
 clean:
-	rm -f SYCLStream.sycl
+	rm -f sycl-stream SYCLStream.sycl SYCLStream.bc

From b485e77ef30190bb7c4ce4eaefcb8ae1464ff769 Mon Sep 17 00:00:00 2001
From: James Price <j.price@bristol.ac.uk>
Date: Thu, 23 Feb 2017 19:38:14 +0000
Subject: [PATCH 35/76] Add SYCL intermediate outputs to .gitignore

---
 .gitignore | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/.gitignore b/.gitignore
index 5dad665..a9748b3 100644
--- a/.gitignore
+++ b/.gitignore
@@ -9,6 +9,8 @@ sycl-stream
 hip-stream
 
 *.o
+*.bc
+*.sycl
 *.tar
 *.gz
 

From e1ea01f0c2807ac4717e87bfd80f21c029856fe0 Mon Sep 17 00:00:00 2001
From: James Price <j.price@bristol.ac.uk>
Date: Fri, 24 Feb 2017 11:02:50 +0000
Subject: [PATCH 36/76] Add -xHOST to Intel for OpenMP

---
 OpenMP.make | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/OpenMP.make b/OpenMP.make
index bc4e955..d4b7519 100644
--- a/OpenMP.make
+++ b/OpenMP.make
@@ -12,7 +12,7 @@ CXX = $(COMPILER_$(COMPILER))
 
 FLAGS_ = -O3
 FLAGS_GNU = -O3 -std=c++11
-FLAGS_INTEL = -O3 -std=c++11
+FLAGS_INTEL = -O3 -std=c++11 -xHOST
 FLAGS_CRAY = -O3 -hstd=c++11
 FLAGS_CLANG = -O3 -std=c++11
 CXXFLAGS = $(FLAGS_$(COMPILER))

From cbf15e8954af4d5d1e1d0e6647f60cafff28213d Mon Sep 17 00:00:00 2001
From: Tom Deakin <tom.deakin@bristol.ac.uk>
Date: Fri, 24 Feb 2017 11:50:25 +0000
Subject: [PATCH 37/76] Have -std=c++11 as default flags when COMPILER is not
 set for ACC, CL and MP

---
 OpenACC.make | 2 +-
 OpenCL.make  | 2 +-
 OpenMP.make  | 2 +-
 3 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/OpenACC.make b/OpenACC.make
index 161c20e..2799603 100644
--- a/OpenACC.make
+++ b/OpenACC.make
@@ -8,7 +8,7 @@ COMPILER_PGI = pgc++
 COMPILER_CRAY = CC
 CXX = $(COMPILER_$(COMPILER))
 
-FLAGS_ = -O3
+FLAGS_ = -O3 -std=c++11
 
 FLAGS_PGI = -std=c++11 -O3 -acc
 ifeq ($(COMPILER), PGI)
diff --git a/OpenCL.make b/OpenCL.make
index ce7db6e..62c4f43 100644
--- a/OpenCL.make
+++ b/OpenCL.make
@@ -7,7 +7,7 @@ COMPILER_ = $(CXX)
 COMPILER_GNU = g++
 COMPILER_CRAY = CC
 
-FLAGS_ = -O3
+FLAGS_ = -O3 -std=c++11
 FLAGS_GNU = -O3 -std=c++11
 FLAGS_CRAY = -O3 -hstd=c++11
 CXXFLAGS=$(FLAGS_$(COMPILER))
diff --git a/OpenMP.make b/OpenMP.make
index d4b7519..2fd47e7 100644
--- a/OpenMP.make
+++ b/OpenMP.make
@@ -10,7 +10,7 @@ COMPILER_CRAY = CC
 COMPILER_CLANG = clang++
 CXX = $(COMPILER_$(COMPILER))
 
-FLAGS_ = -O3
+FLAGS_ = -O3 -std=c++11
 FLAGS_GNU = -O3 -std=c++11
 FLAGS_INTEL = -O3 -std=c++11 -xHOST
 FLAGS_CRAY = -O3 -hstd=c++11

From 761ba2dce0e8318f46b4b8357f5427da3b0082b1 Mon Sep 17 00:00:00 2001
From: Tom Deakin <tom.deakin@bristol.ac.uk>
Date: Fri, 24 Feb 2017 11:57:34 +0000
Subject: [PATCH 38/76] Add a clean option to all Makefiles

---
 CUDA.make    | 4 ++++
 HIP.make     | 5 +++++
 OpenACC.make | 4 ++++
 OpenCL.make  | 4 ++++
 OpenMP.make  | 5 +++++
 RAJA.make    | 7 ++++++-
 6 files changed, 28 insertions(+), 1 deletion(-)

diff --git a/CUDA.make b/CUDA.make
index 9a7ec88..ef193df 100644
--- a/CUDA.make
+++ b/CUDA.make
@@ -2,3 +2,7 @@
 cuda-stream: main.cpp CUDAStream.cu
 	nvcc -std=c++11 -O3 -DCUDA $^ $(EXTRA_FLAGS) -o $@
 
+.PHONY: clean
+clean:
+	rm -f cuda-stream
+
diff --git a/HIP.make b/HIP.make
index 25460fe..bbd142e 100644
--- a/HIP.make
+++ b/HIP.make
@@ -11,3 +11,8 @@ endif
 
 hip-stream: main.cpp HIPStream.cu
 	$(HIPCC) $(CXXFLAGS) -std=c++11 -DHIP $^ $(EXTRA_FLAGS) -o $@
+
+.PHONY: clean
+clean:
+	rm -f hip-stream
+
diff --git a/OpenACC.make b/OpenACC.make
index 2799603..d9fbc36 100644
--- a/OpenACC.make
+++ b/OpenACC.make
@@ -28,3 +28,7 @@ CXXFLAGS = $(FLAGS_$(COMPILER))
 acc-stream: main.cpp ACCStream.cpp
 	$(CXX) $(CXXFLAGS) -DACC $^ $(EXTRA_FLAGS) -o $@
 
+.PHONY: clean
+clean:
+	rm -f acc-stream
+
diff --git a/OpenCL.make b/OpenCL.make
index 62c4f43..0075424 100644
--- a/OpenCL.make
+++ b/OpenCL.make
@@ -15,3 +15,7 @@ CXXFLAGS=$(FLAGS_$(COMPILER))
 ocl-stream: main.cpp OCLStream.cpp
 	$(COMPILER_$(COMPILER)) $(CXXFLAGS) -DOCL $^ $(EXTRA_FLAGS) -lOpenCL -o $@
 
+.PHONY: clean
+clean:
+	rm -f ocl-stream
+
diff --git a/OpenMP.make b/OpenMP.make
index 2fd47e7..0757f0d 100644
--- a/OpenMP.make
+++ b/OpenMP.make
@@ -37,3 +37,8 @@ omp-stream: main.cpp OMPStream.cpp
 omp-target-stream: main.cpp OMPStream.cpp
 	$(CXX) $(CXXFLAGS) -DOMP -DOMP_TARGET_GPU $^ $(OMP_TARGET) $(EXTRA_FLAGS) -o $@
 
+.PHONY: clean
+clean:
+	rm -f omp-stream
+	rm -f omp-target-stream
+
diff --git a/RAJA.make b/RAJA.make
index 3f456cb..bcfbba7 100644
--- a/RAJA.make
+++ b/RAJA.make
@@ -9,7 +9,7 @@ COMP=$(CXX)
 CXXFLAGS = -O3 -std=c++11 -DRAJA_TARGET_CPU
 
 ifndef COMPILER
-$(error No COMPILER defined. Specify COMPILER for correct OpenMP flag.)
+$(info No COMPILER defined. Specify COMPILER for correct OpenMP flag.)
 endif
 ifeq ($(COMPILER), INTEL)
 COMP = icpc
@@ -33,3 +33,8 @@ endif
 
 raja-stream: main.cpp RAJAStream.cpp
 	$(COMP) $(CXXFLAGS) -DUSE_RAJA -I$(RAJA_PATH)/include $^ $(EXTRA_FLAGS) -L$(RAJA_PATH)/lib -lRAJA -o $@
+
+.PHONY: clean
+clean:
+	rm -f raja-stream
+

From 10b2376634e512f9e0ef06ff6b0c39de4a5cddca Mon Sep 17 00:00:00 2001
From: Tom Deakin <tom.deakin@bristol.ac.uk>
Date: Fri, 24 Feb 2017 12:24:35 +0000
Subject: [PATCH 39/76] Add TARGET option to OpenMP Makefile

---
 OpenMP.make | 16 ++++++++++++----
 1 file changed, 12 insertions(+), 4 deletions(-)

diff --git a/OpenMP.make b/OpenMP.make
index 0757f0d..cbcca54 100644
--- a/OpenMP.make
+++ b/OpenMP.make
@@ -3,6 +3,11 @@ ifndef COMPILER
 $(info Define a compiler to set common defaults, i.e make COMPILER=GNU)
 endif
 
+ifndef TARGET
+$(info No target defined. Specify CPU or GPU. Defaulting to CPU)
+TARGET=CPU
+endif
+
 COMPILER_ = $(CXX)
 COMPILER_GNU = g++
 COMPILER_INTEL = icpc
@@ -31,14 +36,17 @@ OMP_TARGET_CRAY  =
 OMP_TARGET_CLANG = -fopenmp=libomp -fopenmp-targets=nvptx64-nvidia-cuda
 OMP_TARGET = $(OMP_TARGET_$(COMPILER))
 
+ifeq ($(TARGET), CPU)
+OMP = $(OMP_$(COMPILER))
+else ifeq ($(TARGET), GPU)
+OMP = $(OMP_TARGET_$(COMPILER))
+OMP += -DOMP_TARGET_GPU
+endif
+
 omp-stream: main.cpp OMPStream.cpp
 	$(CXX) $(CXXFLAGS) -DOMP $^ $(OMP) $(EXTRA_FLAGS) -o $@
 
-omp-target-stream: main.cpp OMPStream.cpp
-	$(CXX) $(CXXFLAGS) -DOMP -DOMP_TARGET_GPU $^ $(OMP_TARGET) $(EXTRA_FLAGS) -o $@
-
 .PHONY: clean
 clean:
 	rm -f omp-stream
-	rm -f omp-target-stream
 

From 282251f0261d2f543f075ccc2bc36807e715c98c Mon Sep 17 00:00:00 2001
From: James Price <j.price@bristol.ac.uk>
Date: Fri, 24 Feb 2017 12:48:00 +0000
Subject: [PATCH 40/76] Add options for specific targets in OpenACC.make

---
 OpenACC.make | 30 +++++++++++++++++++++++-------
 1 file changed, 23 insertions(+), 7 deletions(-)

diff --git a/OpenACC.make b/OpenACC.make
index 161c20e..af27481 100644
--- a/OpenACC.make
+++ b/OpenACC.make
@@ -6,25 +6,41 @@ endif
 COMPILER_ = $(CXX)
 COMPILER_PGI = pgc++
 COMPILER_CRAY = CC
-CXX = $(COMPILER_$(COMPILER))
 
 FLAGS_ = -O3
 
 FLAGS_PGI = -std=c++11 -O3 -acc
 ifeq ($(COMPILER), PGI)
+define target_help
+Set a TARGET to ensure PGI targets the correct offload device.
+Available targets are:
+  SNB, IVB, HSW, BDW, KNL
+  KEPLER, MAXWELL, PASCAL
+  HAWAII
+endef
 ifndef TARGET
-$(info Set a TARGET to ensure PGI targets the correct offload device. i.e. TARGET=GPU or CPU)
+$(error $(target_help))
 endif
+TARGET_FLAGS_SNB     = -ta=multicore -tp=sandybridge
+TARGET_FLAGS_IVB     = -ta=multicore -tp=ivybridge
+TARGET_FLAGS_HSW     = -ta=multicore -tp=haswell
+TARGET_FLAGS_BDW     = -ta=multicore -tp=haswell # Not yet supported directly
+TARGET_FLAGS_KNL     = -ta=multicore -tp=haswell # Not yet supported directly
+TARGET_FLAGS_KEPLER  = -ta=nvidia:cc35
+TARGET_FLAGS_MAXWELL = -ta=nvidia:cc50
+TARGET_FLAGS_PASCAL  = -ta=nvidia:cc60
+TARGET_FLAGS_HAWAII  = -ta=radeon:hawaii
+ifeq ($(TARGET_FLAGS_$(TARGET)),)
+$(error $(target_help))
 endif
-ifeq ($(TARGET), GPU)
-FLAGS_PGI += -ta=nvidia
-else ifeq ($(TARGET), CPU)
-FLAGS_PGI += -ta=multicore
+
+FLAGS_PGI += $(TARGET_FLAGS_$(TARGET))
+
 endif
 
 FLAGS_CRAY = -hstd=c++11
 CXXFLAGS = $(FLAGS_$(COMPILER))
 
 acc-stream: main.cpp ACCStream.cpp
-	$(CXX) $(CXXFLAGS) -DACC $^ $(EXTRA_FLAGS) -o $@
+	$(COMPILER_$(COMPILER)) $(CXXFLAGS) -DACC $^ $(EXTRA_FLAGS) -o $@
 

From 91131baf3d5777b13e4cd50307464fa863fde227 Mon Sep 17 00:00:00 2001
From: Tom Deakin <tom.deakin@bristol.ac.uk>
Date: Fri, 24 Feb 2017 12:52:13 +0000
Subject: [PATCH 41/76] Remove BDW and KNL targets from OpenACC as not yet
 supported by PGI

---
 OpenACC.make | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/OpenACC.make b/OpenACC.make
index 6fecf1c..9ec5c4d 100644
--- a/OpenACC.make
+++ b/OpenACC.make
@@ -14,7 +14,7 @@ ifeq ($(COMPILER), PGI)
 define target_help
 Set a TARGET to ensure PGI targets the correct offload device.
 Available targets are:
-  SNB, IVB, HSW, BDW, KNL
+  SNB, IVB, HSW
   KEPLER, MAXWELL, PASCAL
   HAWAII
 endef
@@ -24,8 +24,6 @@ endif
 TARGET_FLAGS_SNB     = -ta=multicore -tp=sandybridge
 TARGET_FLAGS_IVB     = -ta=multicore -tp=ivybridge
 TARGET_FLAGS_HSW     = -ta=multicore -tp=haswell
-TARGET_FLAGS_BDW     = -ta=multicore -tp=haswell # Not yet supported directly
-TARGET_FLAGS_KNL     = -ta=multicore -tp=haswell # Not yet supported directly
 TARGET_FLAGS_KEPLER  = -ta=nvidia:cc35
 TARGET_FLAGS_MAXWELL = -ta=nvidia:cc50
 TARGET_FLAGS_PASCAL  = -ta=nvidia:cc60

From bbdd5b9fcba3caf81a1b64387c85034761925abb Mon Sep 17 00:00:00 2001
From: Tom Deakin <tom.deakin@bristol.ac.uk>
Date: Fri, 24 Feb 2017 13:07:35 +0000
Subject: [PATCH 42/76] Add help message to Kokkos TARGET variable

---
 Kokkos.make | 8 +++++++-
 1 file changed, 7 insertions(+), 1 deletion(-)

diff --git a/Kokkos.make b/Kokkos.make
index f0b4528..4bba1c8 100644
--- a/Kokkos.make
+++ b/Kokkos.make
@@ -4,7 +4,13 @@ default: kokkos-stream
 include $(KOKKOS_PATH)/Makefile.kokkos
 
 ifndef TARGET
-$(info No target defined. Specify CPU or GPU. Defaulting to CPU)
+define target_help
+Set TARGET to change to offload device. Defaulting to CPU.
+Available targets are:
+  CPU (default)
+  GPU
+endef
+$(info $(target_help))
 TARGET=CPU
 endif
 

From 3be4ebc1a2e88c2ce95be330449b84956632fb3e Mon Sep 17 00:00:00 2001
From: Tom Deakin <tom.deakin@bristol.ac.uk>
Date: Fri, 24 Feb 2017 13:11:07 +0000
Subject: [PATCH 43/76] Add help messages to RAJA Makefile

---
 RAJA.make | 22 +++++++++++++++++++---
 1 file changed, 19 insertions(+), 3 deletions(-)

diff --git a/RAJA.make b/RAJA.make
index bcfbba7..bba45f5 100644
--- a/RAJA.make
+++ b/RAJA.make
@@ -1,6 +1,12 @@
 
 ifndef TARGET
-$(info No target defined. Specify CPU or GPU. Defaulting to CPU)
+define target_help
+Set TARGET to change to offload device. Defaulting to CPU.
+Available targets are:
+  CPU (default)
+  GPU
+endef
+$(info $(target_help))
 TARGET=CPU
 endif
 
@@ -9,7 +15,12 @@ COMP=$(CXX)
 CXXFLAGS = -O3 -std=c++11 -DRAJA_TARGET_CPU
 
 ifndef COMPILER
-$(info No COMPILER defined. Specify COMPILER for correct OpenMP flag.)
+define compiler_help
+Set COMPILER to ensure correct OpenMP flags are set.
+Available compilers are:
+  INTEL GNU CRAY
+endef
+$(info $(compiler_help))
 endif
 ifeq ($(COMPILER), INTEL)
 COMP = icpc
@@ -26,7 +37,12 @@ else ifeq ($(TARGET), GPU)
 COMP = nvcc
 
 ifndef ARCH
-$(error No ARCH defined. Specify target GPU architecture (e.g. ARCH=sm_35))
+define arch_help
+Set ARCH to ensure correct GPU architecture.
+Example:
+  ARCH=sm_35
+endef
+$(error $(arch_help))
 endif
 CXXFLAGS = --expt-extended-lambda -O3 -std=c++11 -x cu -Xcompiler -fopenmp -arch $(ARCH)
 endif

From c470b88deede18ece614c09420df5070570ee7be Mon Sep 17 00:00:00 2001
From: Tom Deakin <tom.deakin@bristol.ac.uk>
Date: Fri, 24 Feb 2017 13:13:08 +0000
Subject: [PATCH 44/76] Add compiler help text to OpenACC

---
 OpenACC.make | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/OpenACC.make b/OpenACC.make
index 9ec5c4d..504f069 100644
--- a/OpenACC.make
+++ b/OpenACC.make
@@ -1,6 +1,11 @@
 
 ifndef COMPILER
-$(info Define a compiler to set common defaults, i.e make COMPILER=GNU)
+define compiler_help
+Set COMPILER to ensure correct flags are set.
+Available compilers are:
+  PGI CRAY
+endef
+$(info $(compiler_help))
 endif
 
 COMPILER_ = $(CXX)

From 6008f8c5364907ce567dfffcc068f362b27a0c11 Mon Sep 17 00:00:00 2001
From: James Price <j.price@bristol.ac.uk>
Date: Fri, 24 Feb 2017 13:03:06 +0000
Subject: [PATCH 45/76] Add intermediate objects to OpenACC clean rule

PGI creates these, even though we don't ask for them.
---
 OpenACC.make | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/OpenACC.make b/OpenACC.make
index 504f069..cc61f8e 100644
--- a/OpenACC.make
+++ b/OpenACC.make
@@ -49,5 +49,4 @@ acc-stream: main.cpp ACCStream.cpp
 
 .PHONY: clean
 clean:
-	rm -f acc-stream
-
+	rm -f acc-stream main.o ACCStream.o

From 82de8188558a0792d29f35c9c7255464937ee614 Mon Sep 17 00:00:00 2001
From: James Price <j.price@bristol.ac.uk>
Date: Fri, 24 Feb 2017 13:08:20 +0000
Subject: [PATCH 46/76] Add support for Intel as host compiler for OpenCL

---
 OpenCL.make | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/OpenCL.make b/OpenCL.make
index 0075424..f43ebfa 100644
--- a/OpenCL.make
+++ b/OpenCL.make
@@ -5,10 +5,12 @@ endif
 
 COMPILER_ = $(CXX)
 COMPILER_GNU = g++
+COMPILER_INTEL = icpc
 COMPILER_CRAY = CC
 
 FLAGS_ = -O3 -std=c++11
 FLAGS_GNU = -O3 -std=c++11
+FLAGS_INTEL = -O3 -std=c++11
 FLAGS_CRAY = -O3 -hstd=c++11
 CXXFLAGS=$(FLAGS_$(COMPILER))
 

From 8fee86a23230231dc2de5143c1c6b5c46dc6576a Mon Sep 17 00:00:00 2001
From: James Price <j.price@bristol.ac.uk>
Date: Fri, 24 Feb 2017 13:17:12 +0000
Subject: [PATCH 47/76] Add compiler help to OpenCL.make

---
 OpenCL.make | 9 ++++++++-
 1 file changed, 8 insertions(+), 1 deletion(-)

diff --git a/OpenCL.make b/OpenCL.make
index f43ebfa..244398a 100644
--- a/OpenCL.make
+++ b/OpenCL.make
@@ -1,15 +1,22 @@
 
 ifndef COMPILER
-$(info Define a compiler to set common defaults, i.e make COMPILER=GNU)
+define compiler_help
+Set COMPILER to ensure correct flags are set.
+Available compilers are:
+  GNU CLANG INTEL CRAY
+endef
+$(info $(compiler_help))
 endif
 
 COMPILER_ = $(CXX)
 COMPILER_GNU = g++
+COMPILER_CLANG = clang++
 COMPILER_INTEL = icpc
 COMPILER_CRAY = CC
 
 FLAGS_ = -O3 -std=c++11
 FLAGS_GNU = -O3 -std=c++11
+FLAGS_CLANG = -O3 -std=c++11
 FLAGS_INTEL = -O3 -std=c++11
 FLAGS_CRAY = -O3 -hstd=c++11
 CXXFLAGS=$(FLAGS_$(COMPILER))

From 1aec057e48b27286beb9adf94c0a85e630feeaea Mon Sep 17 00:00:00 2001
From: James Price <j.price@bristol.ac.uk>
Date: Fri, 24 Feb 2017 13:32:59 +0000
Subject: [PATCH 48/76] Add help messages to OpenMP.make and refactor

---
 OpenMP.make | 49 +++++++++++++++++++++++++++----------------------
 1 file changed, 27 insertions(+), 22 deletions(-)

diff --git a/OpenMP.make b/OpenMP.make
index cbcca54..4c2fe07 100644
--- a/OpenMP.make
+++ b/OpenMP.make
@@ -1,52 +1,57 @@
 
 ifndef COMPILER
-$(info Define a compiler to set common defaults, i.e make COMPILER=GNU)
+define compiler_help
+Set COMPILER to change flags (defaulting to GNU).
+Available compilers are:
+  CLANG CRAY GNU INTEL
+
+endef
+$(info $(compiler_help))
+COMPILER=GNU
 endif
 
 ifndef TARGET
-$(info No target defined. Specify CPU or GPU. Defaulting to CPU)
+define target_help
+Set TARGET to change device (defaulting to CPU).
+Available targets are:
+  CPU NVIDIA
+
+endef
+$(info $(target_help))
 TARGET=CPU
 endif
 
-COMPILER_ = $(CXX)
 COMPILER_GNU = g++
 COMPILER_INTEL = icpc
 COMPILER_CRAY = CC
 COMPILER_CLANG = clang++
 CXX = $(COMPILER_$(COMPILER))
 
-FLAGS_ = -O3 -std=c++11
 FLAGS_GNU = -O3 -std=c++11
 FLAGS_INTEL = -O3 -std=c++11 -xHOST
 FLAGS_CRAY = -O3 -hstd=c++11
 FLAGS_CLANG = -O3 -std=c++11
 CXXFLAGS = $(FLAGS_$(COMPILER))
 
-OMP_ =
-OMP_GNU   = -fopenmp
-OMP_INTEL = -qopenmp
-OMP_CRAY  =
-OMP_CLANG = -fopenmp=libomp
-OMP = $(OMP_$(COMPILER))
+# OpenMP flags for CPUs
+OMP_GNU_CPU   = -fopenmp
+OMP_INTEL_CPU = -qopenmp
+OMP_CRAY_CPU  =
+OMP_CLANG_CPU = -fopenmp=libomp
 
-OMP_TARGET_ =
-OMP_TARGET_GNU   = -fopenmp
-OMP_TARGET_INTEL =
-OMP_TARGET_CRAY  =
-OMP_TARGET_CLANG = -fopenmp=libomp -fopenmp-targets=nvptx64-nvidia-cuda
-OMP_TARGET = $(OMP_TARGET_$(COMPILER))
+# OpenMP flags for NVIDIA
+OMP_CRAY_NVIDIA  = -DOMP_TARGET_GPU
+OMP_CLANG_NVIDIA = -DOMP_TARGET_GPU -fopenmp=libomp -fopenmp-targets=nvptx64-nvidia-cuda
 
-ifeq ($(TARGET), CPU)
-OMP = $(OMP_$(COMPILER))
-else ifeq ($(TARGET), GPU)
-OMP = $(OMP_TARGET_$(COMPILER))
-OMP += -DOMP_TARGET_GPU
+ifndef OMP_$(COMPILER)_$(TARGET)
+$(error Targeting $(TARGET) with $(COMPILER) not supported)
 endif
 
+OMP = $(OMP_$(COMPILER)_$(TARGET))
+
 omp-stream: main.cpp OMPStream.cpp
 	$(CXX) $(CXXFLAGS) -DOMP $^ $(OMP) $(EXTRA_FLAGS) -o $@
 
 .PHONY: clean
 clean:
 	rm -f omp-stream
-

From a7d7998326b6cface59803195941758f92e942e4 Mon Sep 17 00:00:00 2001
From: James Price <j.price@bristol.ac.uk>
Date: Fri, 24 Feb 2017 13:40:54 +0000
Subject: [PATCH 49/76] Use -framework OpenCL on Darwin

---
 OpenCL.make | 9 ++++++++-
 1 file changed, 8 insertions(+), 1 deletion(-)

diff --git a/OpenCL.make b/OpenCL.make
index 244398a..484921d 100644
--- a/OpenCL.make
+++ b/OpenCL.make
@@ -21,8 +21,15 @@ FLAGS_INTEL = -O3 -std=c++11
 FLAGS_CRAY = -O3 -hstd=c++11
 CXXFLAGS=$(FLAGS_$(COMPILER))
 
+PLATFORM = $(shell uname -s)
+ifeq ($(PLATFORM), Darwin)
+  LIBS = -framework OpenCL
+else
+  LIBS = -lOpenCL
+endif
+
 ocl-stream: main.cpp OCLStream.cpp
-	$(COMPILER_$(COMPILER)) $(CXXFLAGS) -DOCL $^ $(EXTRA_FLAGS) -lOpenCL -o $@
+	$(COMPILER_$(COMPILER)) $(CXXFLAGS) -DOCL $^ $(EXTRA_FLAGS) $(LIBS) -o $@
 
 .PHONY: clean
 clean:

From 569cfa1d31a3a2a4af089514494b35b54419c609 Mon Sep 17 00:00:00 2001
From: James Price <j.price@bristol.ac.uk>
Date: Fri, 24 Feb 2017 09:02:51 -0600
Subject: [PATCH 50/76] Make Cray OpenMP flag non-empty to fix error

---
 OpenMP.make | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/OpenMP.make b/OpenMP.make
index 4c2fe07..32f46ca 100644
--- a/OpenMP.make
+++ b/OpenMP.make
@@ -36,7 +36,7 @@ CXXFLAGS = $(FLAGS_$(COMPILER))
 # OpenMP flags for CPUs
 OMP_GNU_CPU   = -fopenmp
 OMP_INTEL_CPU = -qopenmp
-OMP_CRAY_CPU  =
+OMP_CRAY_CPU  = -homp
 OMP_CLANG_CPU = -fopenmp=libomp
 
 # OpenMP flags for NVIDIA

From dfe5503cba4bf2b83c2bcf87f9a339ff4f625a99 Mon Sep 17 00:00:00 2001
From: James Price <j.price@bristol.ac.uk>
Date: Fri, 24 Feb 2017 09:33:59 -0600
Subject: [PATCH 51/76] Allow user to override CXX in OpenCL.make

---
 OpenCL.make | 8 +++++---
 1 file changed, 5 insertions(+), 3 deletions(-)

diff --git a/OpenCL.make b/OpenCL.make
index 484921d..8ad7108 100644
--- a/OpenCL.make
+++ b/OpenCL.make
@@ -1,18 +1,20 @@
 
 ifndef COMPILER
 define compiler_help
-Set COMPILER to ensure correct flags are set.
+Set COMPILER to change flags (defaulting to GNU).
 Available compilers are:
   GNU CLANG INTEL CRAY
+
 endef
 $(info $(compiler_help))
+COMPILER=GNU
 endif
 
-COMPILER_ = $(CXX)
 COMPILER_GNU = g++
 COMPILER_CLANG = clang++
 COMPILER_INTEL = icpc
 COMPILER_CRAY = CC
+CXX = $(COMPILER_$(COMPILER))
 
 FLAGS_ = -O3 -std=c++11
 FLAGS_GNU = -O3 -std=c++11
@@ -29,7 +31,7 @@ else
 endif
 
 ocl-stream: main.cpp OCLStream.cpp
-	$(COMPILER_$(COMPILER)) $(CXXFLAGS) -DOCL $^ $(EXTRA_FLAGS) $(LIBS) -o $@
+	$(CXX) $(CXXFLAGS) -DOCL $^ $(EXTRA_FLAGS) $(LIBS) -o $@
 
 .PHONY: clean
 clean:

From 050a27ca833345f73f50182d7d35c7bd1eb9a87b Mon Sep 17 00:00:00 2001
From: Tom Deakin <tom.deakin@bristol.ac.uk>
Date: Fri, 24 Feb 2017 17:37:30 +0000
Subject: [PATCH 52/76] Add XL compiler support to OpenMP and RAJA makefiles

---
 OpenMP.make |  5 ++++-
 RAJA.make   | 13 ++++++++-----
 2 files changed, 12 insertions(+), 6 deletions(-)

diff --git a/OpenMP.make b/OpenMP.make
index 32f46ca..28b5326 100644
--- a/OpenMP.make
+++ b/OpenMP.make
@@ -3,7 +3,7 @@ ifndef COMPILER
 define compiler_help
 Set COMPILER to change flags (defaulting to GNU).
 Available compilers are:
-  CLANG CRAY GNU INTEL
+  CLANG CRAY GNU INTEL XL
 
 endef
 $(info $(compiler_help))
@@ -25,12 +25,14 @@ COMPILER_GNU = g++
 COMPILER_INTEL = icpc
 COMPILER_CRAY = CC
 COMPILER_CLANG = clang++
+COMPILER_XL = xlc++
 CXX = $(COMPILER_$(COMPILER))
 
 FLAGS_GNU = -O3 -std=c++11
 FLAGS_INTEL = -O3 -std=c++11 -xHOST
 FLAGS_CRAY = -O3 -hstd=c++11
 FLAGS_CLANG = -O3 -std=c++11
+FLAGS_XL = -O5 -qarch=pwr8 -qtune=pwr8 -std=c++11
 CXXFLAGS = $(FLAGS_$(COMPILER))
 
 # OpenMP flags for CPUs
@@ -38,6 +40,7 @@ OMP_GNU_CPU   = -fopenmp
 OMP_INTEL_CPU = -qopenmp
 OMP_CRAY_CPU  = -homp
 OMP_CLANG_CPU = -fopenmp=libomp
+OMP_XL_CPU = -qsmp=omp -qthreaded
 
 # OpenMP flags for NVIDIA
 OMP_CRAY_NVIDIA  = -DOMP_TARGET_GPU
diff --git a/RAJA.make b/RAJA.make
index bba45f5..01e807a 100644
--- a/RAJA.make
+++ b/RAJA.make
@@ -12,25 +12,28 @@ endif
 
 ifeq ($(TARGET), CPU)
 COMP=$(CXX)
-CXXFLAGS = -O3 -std=c++11 -DRAJA_TARGET_CPU
+CXXFLAGS = -DRAJA_TARGET_CPU
 
 ifndef COMPILER
 define compiler_help
 Set COMPILER to ensure correct OpenMP flags are set.
 Available compilers are:
-  INTEL GNU CRAY
+  INTEL GNU CRAY XL
 endef
 $(info $(compiler_help))
 endif
 ifeq ($(COMPILER), INTEL)
 COMP = icpc
-CXXFLAGS += -qopenmp
+CXXFLAGS += -O3 -std=c++11 -qopenmp
 else ifeq ($(COMPILER), GNU)
 COMP = g++
-CXXFLAGS += -fopenmp
+CXXFLAGS += -O3 -std=c++11 -fopenmp
 else ifeq ($(COMPILER), CRAY)
 COMP = CC
-CXXFLAGS +=
+CXXFLAGS += -O3 -hstd=c++11
+else ifeq ($(COMPILER), XL)
+COMP = xlc++
+CXXFLAGS += -O5 -std=c++11 -qarch=pwr8 -qtune=pwr8 -qsmp=omp -qthreaded
 endif
 
 else ifeq ($(TARGET), GPU)

From 24167272398f8b1ce5bc6cfb466660000d9ae23e Mon Sep 17 00:00:00 2001
From: James Price <j.price@bristol.ac.uk>
Date: Fri, 24 Feb 2017 22:28:16 +0000
Subject: [PATCH 53/76] Refactor compiler flag handling in RAJA Makefile

---
 RAJA.make | 35 +++++++++++++++++------------------
 1 file changed, 17 insertions(+), 18 deletions(-)

diff --git a/RAJA.make b/RAJA.make
index 01e807a..7fb821b 100644
--- a/RAJA.make
+++ b/RAJA.make
@@ -11,33 +11,32 @@ TARGET=CPU
 endif
 
 ifeq ($(TARGET), CPU)
-COMP=$(CXX)
-CXXFLAGS = -DRAJA_TARGET_CPU
 
 ifndef COMPILER
 define compiler_help
-Set COMPILER to ensure correct OpenMP flags are set.
+Set COMPILER to change flags (defaulting to GNU).
 Available compilers are:
   INTEL GNU CRAY XL
 endef
 $(info $(compiler_help))
-endif
-ifeq ($(COMPILER), INTEL)
-COMP = icpc
-CXXFLAGS += -O3 -std=c++11 -qopenmp
-else ifeq ($(COMPILER), GNU)
-COMP = g++
-CXXFLAGS += -O3 -std=c++11 -fopenmp
-else ifeq ($(COMPILER), CRAY)
-COMP = CC
-CXXFLAGS += -O3 -hstd=c++11
-else ifeq ($(COMPILER), XL)
-COMP = xlc++
-CXXFLAGS += -O5 -std=c++11 -qarch=pwr8 -qtune=pwr8 -qsmp=omp -qthreaded
+COMPILER=GNU
 endif
 
+CXX_INTEL = icpc
+CXX_GNU   = g++
+CXX_CRAY  = CC
+CXX_XL    = xlc++
+
+CXXFLAGS_INTEL = -O3 -std=c++11 -qopenmp
+CXXFLAGS_GNU   = -O3 -std=c++11 -fopenmp
+CXXFLAGS_CRAY  = -O3 -hstd=c++11
+CXXFLAGS_XL    = -O5 -std=c++11 -qarch=pwr8 -qtune=pwr8 -qsmp=omp -qthreaded
+
+CXX = $(CXX_$(COMPILER))
+CXXFLAGS = -DRAJA_TARGET_CPU $(CXXFLAGS_$(COMPILER))
+
 else ifeq ($(TARGET), GPU)
-COMP = nvcc
+CXX = nvcc
 
 ifndef ARCH
 define arch_help
@@ -51,7 +50,7 @@ CXXFLAGS = --expt-extended-lambda -O3 -std=c++11 -x cu -Xcompiler -fopenmp -arch
 endif
 
 raja-stream: main.cpp RAJAStream.cpp
-	$(COMP) $(CXXFLAGS) -DUSE_RAJA -I$(RAJA_PATH)/include $^ $(EXTRA_FLAGS) -L$(RAJA_PATH)/lib -lRAJA -o $@
+	$(CXX) $(CXXFLAGS) -DUSE_RAJA -I$(RAJA_PATH)/include $^ $(EXTRA_FLAGS) -L$(RAJA_PATH)/lib -lRAJA -o $@
 
 .PHONY: clean
 clean:

From cc90cefeeba426187a18ea49a6db15531ba308c7 Mon Sep 17 00:00:00 2001
From: Tom Deakin <tom.deakin@bristol.ac.uk>
Date: Sat, 25 Feb 2017 14:14:59 +0000
Subject: [PATCH 54/76] Minor version bump to signal build system update

---
 main.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/main.cpp b/main.cpp
index 2b6e459..e6f2cbd 100644
--- a/main.cpp
+++ b/main.cpp
@@ -15,7 +15,7 @@
 #include <iomanip>
 #include <cstring>
 
-#define VERSION_STRING "3.0"
+#define VERSION_STRING "3.1"
 
 #include "Stream.h"
 

From ee7cd066ac50a6a270abc4669a874ba44a946383 Mon Sep 17 00:00:00 2001
From: Peter Steinbach <steinbac@mpi-cbg.de>
Date: Tue, 28 Feb 2017 10:03:23 +0100
Subject: [PATCH 55/76] renamed HIPStream implementation

---
 HIPStream.cu => HIPStream.cpp | 21 ++++++++++++++-------
 1 file changed, 14 insertions(+), 7 deletions(-)
 rename HIPStream.cu => HIPStream.cpp (93%)

diff --git a/HIPStream.cu b/HIPStream.cpp
similarity index 93%
rename from HIPStream.cu
rename to HIPStream.cpp
index d14fe84..edbcac1 100644
--- a/HIPStream.cu
+++ b/HIPStream.cpp
@@ -1,3 +1,5 @@
+
+
 // Copyright (c) 2015-16 Tom Deakin, Simon McIntosh-Smith,
 // University of Bristol HPC
 //
@@ -72,14 +74,19 @@ HIPStream<T>::HIPStream(const unsigned int ARRAY_SIZE, const int device_index)
 template <class T>
 HIPStream<T>::~HIPStream()
 {
+  free(sums);
+
   hipFree(d_a);
   check_error();
   hipFree(d_b);
   check_error();
   hipFree(d_c);
   check_error();
+  hipFree(d_sum);
+  check_error();
 }
 
+
 template <typename T>
 __global__ void init_kernel(hipLaunchParm lp, T * a, T * b, T * c, T initA, T initB, T initC)
 {
@@ -177,22 +184,22 @@ void HIPStream<T>::triad()
   check_error();
 }
 
-
 template <class T>
 __global__ void dot_kernel(hipLaunchParm lp, const T * a, const T * b, T * sum, unsigned int array_size)
 {
 
-  extern __shared__ __align__(sizeof(T)) unsigned char smem[];
+  //extern __shared__ __align__(sizeof(T)) unsigned char smem[];
+  HIP_DYNAMIC_SHARED(unsigned char,smem);
   T *tb_sum = reinterpret_cast<T*>(smem);
 
-  int i = blockDim.x * blockIdx.x + threadIdx.x;
-  const size_t local_i = threadIdx.x;
+  int i = hipBlockDim_x * hipBlockIdx_x + hipThreadIdx_x;
+  const size_t local_i = hipThreadIdx_x;
 
   tb_sum[local_i] = 0.0;
-  for (; i < array_size; i += blockDim.x*gridDim.x)
+  for (; i < array_size; i += hipBlockDim_x*hipGridDim_x)
     tb_sum[local_i] += a[i] * b[i];
 
-  for (int offset = blockDim.x / 2; offset > 0; offset /= 2)
+  for (int offset = hipBlockDim_x / 2; offset > 0; offset /= 2)
   {
     __syncthreads();
     if (local_i < offset)
@@ -202,7 +209,7 @@ __global__ void dot_kernel(hipLaunchParm lp, const T * a, const T * b, T * sum,
   }
 
   if (local_i == 0)
-    sum[blockIdx.x] = tb_sum[local_i];
+    sum[hipBlockIdx_x] = tb_sum[local_i];
 }
 
 template <class T>

From 350a151c3b05ac251137d49adc2c18ebb3f08880 Mon Sep 17 00:00:00 2001
From: Peter Steinbach <steinbac@mpi-cbg.de>
Date: Tue, 28 Feb 2017 10:04:36 +0100
Subject: [PATCH 56/76] removed CUDA_PATH sentinel from HIP.make

---
 HIP.make | 8 +-------
 1 file changed, 1 insertion(+), 7 deletions(-)

diff --git a/HIP.make b/HIP.make
index bbd142e..35b0a6a 100644
--- a/HIP.make
+++ b/HIP.make
@@ -3,13 +3,7 @@
 
 HIPCC = hipcc
 
-ifndef CUDA_PATH
-ifeq (,$(wildcard /usr/local/bin/nvcc))
-$(error /usr/local/bin/nvcc not found, set CUDA_PATH instead)
-endif
-endif
-
-hip-stream: main.cpp HIPStream.cu
+hip-stream: main.cpp HIPStream.cpp
 	$(HIPCC) $(CXXFLAGS) -std=c++11 -DHIP $^ $(EXTRA_FLAGS) -o $@
 
 .PHONY: clean

From ceada6922f19449af7b64d3d0ded84dd4188afa2 Mon Sep 17 00:00:00 2001
From: Peter Steinbach <steinbac@mpi-cbg.de>
Date: Tue, 28 Feb 2017 10:07:48 +0100
Subject: [PATCH 57/76] proper declaration of tb_sum with HIP_DYNAMIC_SHARED
 macro

---
 HIPStream.cpp | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/HIPStream.cpp b/HIPStream.cpp
index edbcac1..648b62a 100644
--- a/HIPStream.cpp
+++ b/HIPStream.cpp
@@ -188,9 +188,9 @@ template <class T>
 __global__ void dot_kernel(hipLaunchParm lp, const T * a, const T * b, T * sum, unsigned int array_size)
 {
 
-  //extern __shared__ __align__(sizeof(T)) unsigned char smem[];
-  HIP_DYNAMIC_SHARED(unsigned char,smem);
-  T *tb_sum = reinterpret_cast<T*>(smem);
+  HIP_DYNAMIC_SHARED(T,tb_sum);
+  // HIP_DYNAMIC_SHARED(unsigned char,smem);
+  // T *tb_sum = reinterpret_cast<T*>(smem);
 
   int i = hipBlockDim_x * hipBlockIdx_x + hipThreadIdx_x;
   const size_t local_i = hipThreadIdx_x;

From 58773a79b747b21860101d22af1d53c4f1e19186 Mon Sep 17 00:00:00 2001
From: Peter Steinbach <steinbac@mpi-cbg.de>
Date: Tue, 28 Feb 2017 13:33:21 +0100
Subject: [PATCH 58/76] removed extra lines introduced by hipify, removed
 obsolete commented code

---
 HIPStream.cpp | 4 ----
 1 file changed, 4 deletions(-)

diff --git a/HIPStream.cpp b/HIPStream.cpp
index 648b62a..dafe2cd 100644
--- a/HIPStream.cpp
+++ b/HIPStream.cpp
@@ -1,5 +1,3 @@
-
-
 // Copyright (c) 2015-16 Tom Deakin, Simon McIntosh-Smith,
 // University of Bristol HPC
 //
@@ -189,8 +187,6 @@ __global__ void dot_kernel(hipLaunchParm lp, const T * a, const T * b, T * sum,
 {
 
   HIP_DYNAMIC_SHARED(T,tb_sum);
-  // HIP_DYNAMIC_SHARED(unsigned char,smem);
-  // T *tb_sum = reinterpret_cast<T*>(smem);
 
   int i = hipBlockDim_x * hipBlockIdx_x + hipThreadIdx_x;
   const size_t local_i = hipThreadIdx_x;

From e570b458a604d3c44bbb9588552626a5250b5f82 Mon Sep 17 00:00:00 2001
From: Peter Steinbach <steinbac@mpi-cbg.de>
Date: Tue, 28 Feb 2017 13:43:57 +0100
Subject: [PATCH 59/76] replaced - for = so that assignment takes place

---
 KOKKOSStream.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/KOKKOSStream.cpp b/KOKKOSStream.cpp
index 9391a13..9abcf87 100644
--- a/KOKKOSStream.cpp
+++ b/KOKKOSStream.cpp
@@ -42,7 +42,7 @@ void KOKKOSStream<T>::init_arrays(T initA, T initB, T initC)
   parallel_for(array_size, KOKKOS_LAMBDA (const int index)
   {
     a[index] = initA;
-    b[index] - initB;
+    b[index] = initB;
     c[index] = initC;
   });
   Kokkos::fence();

From 94e0900377e0cd49531042dd07c9134ae91999f2 Mon Sep 17 00:00:00 2001
From: James Price <j.price@bristol.ac.uk>
Date: Tue, 28 Feb 2017 13:24:45 +0000
Subject: [PATCH 60/76] Use static shared memory in dot for CUDA and HIP

---
 CUDAStream.cu | 6 ++----
 HIPStream.cpp | 5 ++---
 2 files changed, 4 insertions(+), 7 deletions(-)

diff --git a/CUDAStream.cu b/CUDAStream.cu
index 7b1e0df..9588456 100644
--- a/CUDAStream.cu
+++ b/CUDAStream.cu
@@ -182,9 +182,7 @@ void CUDAStream<T>::triad()
 template <class T>
 __global__ void dot_kernel(const T * a, const T * b, T * sum, unsigned int array_size)
 {
-
-  extern __shared__ __align__(sizeof(T)) unsigned char smem[];
-  T *tb_sum = reinterpret_cast<T*>(smem);
+  __shared__ T tb_sum[TBSIZE];
 
   int i = blockDim.x * blockIdx.x + threadIdx.x;
   const size_t local_i = threadIdx.x;
@@ -209,7 +207,7 @@ __global__ void dot_kernel(const T * a, const T * b, T * sum, unsigned int array
 template <class T>
 T CUDAStream<T>::dot()
 {
-  dot_kernel<<<DOT_NUM_BLOCKS, TBSIZE, sizeof(T)*TBSIZE>>>(d_a, d_b, d_sum, array_size);
+  dot_kernel<<<DOT_NUM_BLOCKS, TBSIZE>>>(d_a, d_b, d_sum, array_size);
   check_error();
 
   cudaMemcpy(sums, d_sum, DOT_NUM_BLOCKS*sizeof(T), cudaMemcpyDeviceToHost);
diff --git a/HIPStream.cpp b/HIPStream.cpp
index dafe2cd..7bf724a 100644
--- a/HIPStream.cpp
+++ b/HIPStream.cpp
@@ -185,8 +185,7 @@ void HIPStream<T>::triad()
 template <class T>
 __global__ void dot_kernel(hipLaunchParm lp, const T * a, const T * b, T * sum, unsigned int array_size)
 {
-
-  HIP_DYNAMIC_SHARED(T,tb_sum);
+  __shared__ T tb_sum[TBSIZE];
 
   int i = hipBlockDim_x * hipBlockIdx_x + hipThreadIdx_x;
   const size_t local_i = hipThreadIdx_x;
@@ -211,7 +210,7 @@ __global__ void dot_kernel(hipLaunchParm lp, const T * a, const T * b, T * sum,
 template <class T>
 T HIPStream<T>::dot()
 {
-  hipLaunchKernel(HIP_KERNEL_NAME(dot_kernel), dim3(DOT_NUM_BLOCKS), dim3(TBSIZE), sizeof(T)*TBSIZE, 0, d_a, d_b, d_sum, array_size);
+  hipLaunchKernel(HIP_KERNEL_NAME(dot_kernel), dim3(DOT_NUM_BLOCKS), dim3(TBSIZE), 0, 0, d_a, d_b, d_sum, array_size);
   check_error();
 
   hipMemcpy(sums, d_sum, DOT_NUM_BLOCKS*sizeof(T), hipMemcpyDeviceToHost);

From ea12f2a9a13041ad114b3828ef51e2e17aa12e88 Mon Sep 17 00:00:00 2001
From: Peter Steinbach <steinbac@mpi-cbg.de>
Date: Mon, 13 Mar 2017 14:41:16 +0100
Subject: [PATCH 61/76] added EXTRA_FLAGS variable to CUDA Makefile to provide
 the freedom to specify debug flags or gencode flags

---
 CUDA.make | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/CUDA.make b/CUDA.make
index ef193df..bf376eb 100644
--- a/CUDA.make
+++ b/CUDA.make
@@ -1,6 +1,7 @@
+EXTRA_FLAGS?=-O3
 
 cuda-stream: main.cpp CUDAStream.cu
-	nvcc -std=c++11 -O3 -DCUDA $^ $(EXTRA_FLAGS) -o $@
+	nvcc -std=c++11 -DCUDA $^ $(EXTRA_FLAGS) -o $@
 
 .PHONY: clean
 clean:

From 8c7a801a8422a145ced2eb3448eaa016d6a553d4 Mon Sep 17 00:00:00 2001
From: Peter Steinbach <steinbac@mpi-cbg.de>
Date: Mon, 13 Mar 2017 15:22:26 +0100
Subject: [PATCH 62/76] put -O3 into CXXFLAGS to comply with OpenMP.make

---
 CUDA.make | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/CUDA.make b/CUDA.make
index bf376eb..50f175c 100644
--- a/CUDA.make
+++ b/CUDA.make
@@ -1,7 +1,7 @@
-EXTRA_FLAGS?=-O3
+CXXFLAGS?=-O3 -std=c++11
 
 cuda-stream: main.cpp CUDAStream.cu
-	nvcc -std=c++11 -DCUDA $^ $(EXTRA_FLAGS) -o $@
+	nvcc $(CXXFLAGS) -DCUDA $^ $(EXTRA_FLAGS) -o $@
 
 .PHONY: clean
 clean:

From 4f288ddc3d171ed2b1a77c238ca7219e0301709b Mon Sep 17 00:00:00 2001
From: James Price <j.price@bristol.ac.uk>
Date: Mon, 13 Mar 2017 17:15:10 +0000
Subject: [PATCH 63/76] [OpenMP] Add -qopt-streaming-stores for Intel

---
 OpenMP.make | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/OpenMP.make b/OpenMP.make
index 28b5326..2028c46 100644
--- a/OpenMP.make
+++ b/OpenMP.make
@@ -29,7 +29,7 @@ COMPILER_XL = xlc++
 CXX = $(COMPILER_$(COMPILER))
 
 FLAGS_GNU = -O3 -std=c++11
-FLAGS_INTEL = -O3 -std=c++11 -xHOST
+FLAGS_INTEL = -O3 -std=c++11 -xHOST -qopt-streaming-stores=always
 FLAGS_CRAY = -O3 -hstd=c++11
 FLAGS_CLANG = -O3 -std=c++11
 FLAGS_XL = -O5 -qarch=pwr8 -qtune=pwr8 -std=c++11

From 703eb945a2ebd4bfc031b818776cab3e2a216cb0 Mon Sep 17 00:00:00 2001
From: James Price <j.price@bristol.ac.uk>
Date: Mon, 13 Mar 2017 17:17:20 +0000
Subject: [PATCH 64/76] [OpenMP] Align memory (2MB by default)

---
 OMPStream.cpp | 10 +++++++---
 1 file changed, 7 insertions(+), 3 deletions(-)

diff --git a/OMPStream.cpp b/OMPStream.cpp
index 189cacb..06346e7 100644
--- a/OMPStream.cpp
+++ b/OMPStream.cpp
@@ -7,6 +7,10 @@
 
 #include "OMPStream.h"
 
+#ifndef ALIGNMENT
+#define ALIGNMENT (2*1024*1024) // 2MB
+#endif
+
 template <class T>
 OMPStream<T>::OMPStream(const unsigned int ARRAY_SIZE, T *a, T *b, T *c, int device)
 {
@@ -22,9 +26,9 @@ OMPStream<T>::OMPStream(const unsigned int ARRAY_SIZE, T *a, T *b, T *c, int dev
   {}
 #else
   // Allocate on the host
-  this->a = (T*)malloc(sizeof(T)*array_size);
-  this->b = (T*)malloc(sizeof(T)*array_size);
-  this->c = (T*)malloc(sizeof(T)*array_size);
+  this->a = (T*)aligned_alloc(ALIGNMENT, sizeof(T)*array_size);
+  this->b = (T*)aligned_alloc(ALIGNMENT, sizeof(T)*array_size);
+  this->c = (T*)aligned_alloc(ALIGNMENT, sizeof(T)*array_size);
 #endif
 }
 

From d8cb7494e0c8e7beddaffadad97f0c7c0238ca71 Mon Sep 17 00:00:00 2001
From: Peter Steinbach <steinbac@mpi-cbg.de>
Date: Fri, 17 Mar 2017 15:18:13 +0100
Subject: [PATCH 65/76] pulled -O3 out into CXXFLAGS, refactored CUDA compiler
 into CUDA_CXX make variable to cope with clang as CUDA compiler as well

---
 CUDA.make | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/CUDA.make b/CUDA.make
index 50f175c..3edf0f5 100644
--- a/CUDA.make
+++ b/CUDA.make
@@ -1,7 +1,8 @@
-CXXFLAGS?=-O3 -std=c++11
+CXXFLAGS=-O3
+CUDA_CXX=nvcc
 
 cuda-stream: main.cpp CUDAStream.cu
-	nvcc $(CXXFLAGS) -DCUDA $^ $(EXTRA_FLAGS) -o $@
+	$(CUDA_CXX) -std=c++11 $(CXXFLAGS) -DCUDA $^ $(EXTRA_FLAGS) -o $@
 
 .PHONY: clean
 clean:

From d9dfc3f552706cf38beb25a2bf2271e03adb4340 Mon Sep 17 00:00:00 2001
From: Tom Deakin <tom.deakin@bristol.ac.uk>
Date: Wed, 5 Apr 2017 21:57:55 +0100
Subject: [PATCH 66/76] [Kokkos] Use long for iterator variable

---
 KOKKOSStream.cpp | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/KOKKOSStream.cpp b/KOKKOSStream.cpp
index 9abcf87..45f4dff 100644
--- a/KOKKOSStream.cpp
+++ b/KOKKOSStream.cpp
@@ -39,7 +39,7 @@ void KOKKOSStream<T>::init_arrays(T initA, T initB, T initC)
   View<double*, DEVICE> a(*d_a);
   View<double*, DEVICE> b(*d_b);
   View<double*, DEVICE> c(*d_c);
-  parallel_for(array_size, KOKKOS_LAMBDA (const int index)
+  parallel_for(array_size, KOKKOS_LAMBDA (const long index)
   {
     a[index] = initA;
     b[index] = initB;
@@ -70,7 +70,7 @@ void KOKKOSStream<T>::copy()
   View<double*, DEVICE> b(*d_b);
   View<double*, DEVICE> c(*d_c);
 
-  parallel_for(array_size, KOKKOS_LAMBDA (const int index)
+  parallel_for(array_size, KOKKOS_LAMBDA (const long index)
   {
     c[index] = a[index];
   });
@@ -85,7 +85,7 @@ void KOKKOSStream<T>::mul()
   View<double*, DEVICE> c(*d_c);
 
   const T scalar = startScalar;
-  parallel_for(array_size, KOKKOS_LAMBDA (const int index)
+  parallel_for(array_size, KOKKOS_LAMBDA (const long index)
   {
     b[index] = scalar*c[index];
   });
@@ -99,7 +99,7 @@ void KOKKOSStream<T>::add()
   View<double*, DEVICE> b(*d_b);
   View<double*, DEVICE> c(*d_c);
 
-  parallel_for(array_size, KOKKOS_LAMBDA (const int index)
+  parallel_for(array_size, KOKKOS_LAMBDA (const long index)
   {
     c[index] = a[index] + b[index];
   });
@@ -114,7 +114,7 @@ void KOKKOSStream<T>::triad()
   View<double*, DEVICE> c(*d_c);
 
   const T scalar = startScalar;
-  parallel_for(array_size, KOKKOS_LAMBDA (const int index)
+  parallel_for(array_size, KOKKOS_LAMBDA (const long index)
   {
     a[index] = b[index] + scalar*c[index];
   });
@@ -129,7 +129,7 @@ T KOKKOSStream<T>::dot()
 
   T sum = 0.0;
 
-  parallel_reduce(array_size, KOKKOS_LAMBDA (const int index, double &tmp)
+  parallel_reduce(array_size, KOKKOS_LAMBDA (const long index, double &tmp)
   {
     tmp += a[index] * b[index];
   }, sum);

From d7e38c1ca9d30e2b73cc7c4aa5b6b69f92269822 Mon Sep 17 00:00:00 2001
From: Tom Deakin <tom.deakin@bristol.ac.uk>
Date: Wed, 5 Apr 2017 22:09:58 +0100
Subject: [PATCH 67/76] Add Kokkos build instructions to README

---
 README.md | 12 ++++++++++++
 1 file changed, 12 insertions(+)

diff --git a/README.md b/README.md
index 1e2792d..1efd443 100644
--- a/README.md
+++ b/README.md
@@ -36,6 +36,18 @@ Pass in extra flags via the `EXTRA_FLAGS` option.
 
 The binaries are named in the form `<model>-stream`.
 
+Building Kokkos
+---------------
+
+We use the following command to build Kokkos using the Intel Compiler, specifying the `arch` appropriately, e.g. `KNL`.
+```
+../generate_makefile.bash --prefix=<prefix> --with-openmp --with-pthread --arch=<arch> --compiler=icpc --cxxflags=-DKOKKOS_MEMORY_ALIGNMENT=2097152
+```
+For building with CUDA support, we use the following command, specifying the `arch` appropriately, e.g. `Kepler35`.
+```
+../generate_makefile.bash --prefix=<prefix> --with-cuda --with-openmp --with-pthread --arch=<arch> --with-cuda-options=enable_lambda
+```
+
 
 Results
 -------

From d7a93be73972b627b68370487838501e09f31ba4 Mon Sep 17 00:00:00 2001
From: Tom Deakin <tom.deakin@bristol.ac.uk>
Date: Wed, 5 Apr 2017 22:23:27 +0100
Subject: [PATCH 68/76] [Kokkos] Add a COMPILER option to Makefile, which turns
 on streaming stores for Intel

---
 Kokkos.make | 15 +++++++++++++++
 1 file changed, 15 insertions(+)

diff --git a/Kokkos.make b/Kokkos.make
index 4bba1c8..1c6207d 100644
--- a/Kokkos.make
+++ b/Kokkos.make
@@ -3,6 +3,21 @@ default: kokkos-stream
 
 include $(KOKKOS_PATH)/Makefile.kokkos
 
+ifndef COMPILER
+define compiler_help
+Set COMPILER to change flags (defaulting to GNU).
+Available compilers are:
+  GNU INTEL
+
+endef
+$(info $(compiler_help))
+COMPILER=GNU
+endif
+
+COMPILER_GNU = g++
+COMPILER_INTEL = icpc -qopt-streaming-stores=always
+CXX = $(COMPILER_$(COMPILER))
+
 ifndef TARGET
 define target_help
 Set TARGET to change to offload device. Defaulting to CPU.

From 1eb75f034a8af3af53852b9e791b7cb91b6274a0 Mon Sep 17 00:00:00 2001
From: Tom Deakin <tom.deakin@bristol.ac.uk>
Date: Thu, 6 Apr 2017 10:02:25 +0100
Subject: [PATCH 69/76] [RAJA] Use xHost and streaming stores with the Intel
 compiler

---
 RAJA.make | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/RAJA.make b/RAJA.make
index 7fb821b..47aeefb 100644
--- a/RAJA.make
+++ b/RAJA.make
@@ -27,7 +27,7 @@ CXX_GNU   = g++
 CXX_CRAY  = CC
 CXX_XL    = xlc++
 
-CXXFLAGS_INTEL = -O3 -std=c++11 -qopenmp
+CXXFLAGS_INTEL = -O3 -std=c++11 -qopenmp -xHost -qopt-streaming-stores=always
 CXXFLAGS_GNU   = -O3 -std=c++11 -fopenmp
 CXXFLAGS_CRAY  = -O3 -hstd=c++11
 CXXFLAGS_XL    = -O5 -std=c++11 -qarch=pwr8 -qtune=pwr8 -qsmp=omp -qthreaded

From 1bd4adfe7bad207deac51aae40f370dc4018953d Mon Sep 17 00:00:00 2001
From: Tom Deakin <tom.deakin@bristol.ac.uk>
Date: Thu, 6 Apr 2017 10:14:51 +0100
Subject: [PATCH 70/76] [RAJA] Align the memory to 2MB pages

---
 RAJAStream.cpp | 16 ++++++++++------
 1 file changed, 10 insertions(+), 6 deletions(-)

diff --git a/RAJAStream.cpp b/RAJAStream.cpp
index 240f160..8d3f365 100644
--- a/RAJAStream.cpp
+++ b/RAJAStream.cpp
@@ -10,6 +10,10 @@
 using RAJA::forall;
 using RAJA::RangeSegment;
 
+#ifndef ALIGNMENT
+#define ALIGNMENT (2*1024*1024) // 2MB
+#endif
+
 template <class T>
 RAJAStream<T>::RAJAStream(const unsigned int ARRAY_SIZE, const int device_index)
     : array_size(ARRAY_SIZE)
@@ -18,9 +22,9 @@ RAJAStream<T>::RAJAStream(const unsigned int ARRAY_SIZE, const int device_index)
   index_set.push_back(seg);
 
 #ifdef RAJA_TARGET_CPU
-  d_a = new T[ARRAY_SIZE];
-  d_b = new T[ARRAY_SIZE];
-  d_c = new T[ARRAY_SIZE];
+  d_a = (T*)aligned_alloc(ALIGNMENT, sizeof(T)*array_size);
+  d_b = (T*)aligned_alloc(ALIGNMENT, sizeof(T)*array_size);
+  d_c = (T*)aligned_alloc(ALIGNMENT, sizeof(T)*array_size);
 #else
   cudaMallocManaged((void**)&d_a, sizeof(T)*ARRAY_SIZE, cudaMemAttachGlobal);
   cudaMallocManaged((void**)&d_b, sizeof(T)*ARRAY_SIZE, cudaMemAttachGlobal);
@@ -33,9 +37,9 @@ template <class T>
 RAJAStream<T>::~RAJAStream()
 {
 #ifdef RAJA_TARGET_CPU
-  delete[] d_a;
-  delete[] d_b;
-  delete[] d_c;
+  free(d_a);
+  free(d_b);
+  free(d_c);
 #else
   cudaFree(d_a);
   cudaFree(d_b);

From 5f9b288570cb148245715deb8c3e5b62ba5f72b6 Mon Sep 17 00:00:00 2001
From: Tom Deakin <tom.deakin@bristol.ac.uk>
Date: Thu, 6 Apr 2017 10:15:11 +0100
Subject: [PATCH 71/76] [RAJA] Declare pointers using RAJA_RESTRICT

---
 RAJAStream.cpp | 30 +++++++++++++++---------------
 1 file changed, 15 insertions(+), 15 deletions(-)

diff --git a/RAJAStream.cpp b/RAJAStream.cpp
index 8d3f365..6c6098e 100644
--- a/RAJAStream.cpp
+++ b/RAJAStream.cpp
@@ -50,9 +50,9 @@ RAJAStream<T>::~RAJAStream()
 template <class T>
 void RAJAStream<T>::init_arrays(T initA, T initB, T initC)
 {
-  T* a = d_a;
-  T* b = d_b;
-  T* c = d_c;
+  T* RAJA_RESTRICT a = d_a;
+  T* RAJA_RESTRICT b = d_b;
+  T* RAJA_RESTRICT c = d_c;
   forall<policy>(index_set, [=] RAJA_DEVICE (int index)
   {
     a[index] = initA;
@@ -73,8 +73,8 @@ void RAJAStream<T>::read_arrays(
 template <class T>
 void RAJAStream<T>::copy()
 {
-  T* a = d_a;
-  T* c = d_c;
+  T* RAJA_RESTRICT a = d_a;
+  T* RAJA_RESTRICT c = d_c;
   forall<policy>(index_set, [=] RAJA_DEVICE (int index)
   {
     c[index] = a[index];
@@ -84,8 +84,8 @@ void RAJAStream<T>::copy()
 template <class T>
 void RAJAStream<T>::mul()
 {
-  T* b = d_b;
-  T* c = d_c;
+  T* RAJA_RESTRICT b = d_b;
+  T* RAJA_RESTRICT c = d_c;
   const T scalar = startScalar;
   forall<policy>(index_set, [=] RAJA_DEVICE (int index)
   {
@@ -96,9 +96,9 @@ void RAJAStream<T>::mul()
 template <class T>
 void RAJAStream<T>::add()
 {
-  T* a = d_a;
-  T* b = d_b;
-  T* c = d_c;
+  T* RAJA_RESTRICT a = d_a;
+  T* RAJA_RESTRICT b = d_b;
+  T* RAJA_RESTRICT c = d_c;
   forall<policy>(index_set, [=] RAJA_DEVICE (int index)
   {
     c[index] = a[index] + b[index];
@@ -108,9 +108,9 @@ void RAJAStream<T>::add()
 template <class T>
 void RAJAStream<T>::triad()
 {
-  T* a = d_a;
-  T* b = d_b;
-  T* c = d_c;
+  T* RAJA_RESTRICT a = d_a;
+  T* RAJA_RESTRICT b = d_b;
+  T* RAJA_RESTRICT c = d_c;
   const T scalar = startScalar;
   forall<policy>(index_set, [=] RAJA_DEVICE (int index)
   {
@@ -121,8 +121,8 @@ void RAJAStream<T>::triad()
 template <class T>
 T RAJAStream<T>::dot()
 {
-  T* a = d_a;
-  T* b = d_b;
+  T* RAJA_RESTRICT a = d_a;
+  T* RAJA_RESTRICT b = d_b;
 
   RAJA::ReduceSum<reduce_policy, T> sum(0.0);
 

From 3331f62f42eb89741f5ed01a393f3492a1331ebc Mon Sep 17 00:00:00 2001
From: Tom Deakin <tom.deakin@bristol.ac.uk>
Date: Thu, 6 Apr 2017 10:16:34 +0100
Subject: [PATCH 72/76] Add RAJA build instructions to README

---
 README.md | 8 ++++++++
 1 file changed, 8 insertions(+)

diff --git a/README.md b/README.md
index 1efd443..938cb81 100644
--- a/README.md
+++ b/README.md
@@ -48,6 +48,14 @@ For building with CUDA support, we use the following command, specifying the `ar
 ../generate_makefile.bash --prefix=<prefix> --with-cuda --with-openmp --with-pthread --arch=<arch> --with-cuda-options=enable_lambda
 ```
 
+Building RAJA
+-------------
+
+We use the following command to build RAJA using the Intel Compiler.
+```
+cmake ../../ -DCMAKE_INSTALL_PREFIX=<prefix> -DCMAKE_C_COMPILER=icc -DCMAKE_CXX_COMPILER=icpc -DRAJA_PTR="RAJA_USE_RESTRICT_ALIGNED_PTR" -DCMAKE_BUILD_TYPE=ICCBuild -DRAJA_ENABLE_TESTS=Off
+```
+
 
 Results
 -------

From c534600d04a78a3ec77e1818a291d6cae728b7c3 Mon Sep 17 00:00:00 2001
From: Tom Deakin <tom.deakin@bristol.ac.uk>
Date: Thu, 6 Apr 2017 10:36:01 +0100
Subject: [PATCH 73/76] [RAJA] Use Index_type for iterator index type instead
 of hardcoding int

---
 RAJAStream.cpp | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/RAJAStream.cpp b/RAJAStream.cpp
index 6c6098e..395a6ee 100644
--- a/RAJAStream.cpp
+++ b/RAJAStream.cpp
@@ -53,7 +53,7 @@ void RAJAStream<T>::init_arrays(T initA, T initB, T initC)
   T* RAJA_RESTRICT a = d_a;
   T* RAJA_RESTRICT b = d_b;
   T* RAJA_RESTRICT c = d_c;
-  forall<policy>(index_set, [=] RAJA_DEVICE (int index)
+  forall<policy>(index_set, [=] RAJA_DEVICE (RAJA::Index_type index)
   {
     a[index] = initA;
     b[index] = initB;
@@ -75,7 +75,7 @@ void RAJAStream<T>::copy()
 {
   T* RAJA_RESTRICT a = d_a;
   T* RAJA_RESTRICT c = d_c;
-  forall<policy>(index_set, [=] RAJA_DEVICE (int index)
+  forall<policy>(index_set, [=] RAJA_DEVICE (RAJA::Index_type index)
   {
     c[index] = a[index];
   });
@@ -87,7 +87,7 @@ void RAJAStream<T>::mul()
   T* RAJA_RESTRICT b = d_b;
   T* RAJA_RESTRICT c = d_c;
   const T scalar = startScalar;
-  forall<policy>(index_set, [=] RAJA_DEVICE (int index)
+  forall<policy>(index_set, [=] RAJA_DEVICE (RAJA::Index_type index)
   {
     b[index] = scalar*c[index];
   });
@@ -99,7 +99,7 @@ void RAJAStream<T>::add()
   T* RAJA_RESTRICT a = d_a;
   T* RAJA_RESTRICT b = d_b;
   T* RAJA_RESTRICT c = d_c;
-  forall<policy>(index_set, [=] RAJA_DEVICE (int index)
+  forall<policy>(index_set, [=] RAJA_DEVICE (RAJA::Index_type index)
   {
     c[index] = a[index] + b[index];
   });
@@ -112,7 +112,7 @@ void RAJAStream<T>::triad()
   T* RAJA_RESTRICT b = d_b;
   T* RAJA_RESTRICT c = d_c;
   const T scalar = startScalar;
-  forall<policy>(index_set, [=] RAJA_DEVICE (int index)
+  forall<policy>(index_set, [=] RAJA_DEVICE (RAJA::Index_type index)
   {
     a[index] = b[index] + scalar*c[index];
   });
@@ -126,7 +126,7 @@ T RAJAStream<T>::dot()
 
   RAJA::ReduceSum<reduce_policy, T> sum(0.0);
 
-  forall<policy>(index_set, [=] RAJA_DEVICE (int index)
+  forall<policy>(index_set, [=] RAJA_DEVICE (RAJA::Index_type index)
   {
     sum += a[index] * b[index];
   });

From 50e3a1970fcd9712ca6fb7513233b6e4e6ef9ca5 Mon Sep 17 00:00:00 2001
From: Tom Deakin <tom.deakin@bristol.ac.uk>
Date: Thu, 6 Apr 2017 10:38:03 +0100
Subject: [PATCH 74/76] Add RAJA CUDA build instructions

---
 README.md | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/README.md b/README.md
index 938cb81..e1dcfdc 100644
--- a/README.md
+++ b/README.md
@@ -53,9 +53,12 @@ Building RAJA
 
 We use the following command to build RAJA using the Intel Compiler.
 ```
-cmake ../../ -DCMAKE_INSTALL_PREFIX=<prefix> -DCMAKE_C_COMPILER=icc -DCMAKE_CXX_COMPILER=icpc -DRAJA_PTR="RAJA_USE_RESTRICT_ALIGNED_PTR" -DCMAKE_BUILD_TYPE=ICCBuild -DRAJA_ENABLE_TESTS=Off
+cmake .. -DCMAKE_INSTALL_PREFIX=<prefix> -DCMAKE_C_COMPILER=icc -DCMAKE_CXX_COMPILER=icpc -DRAJA_PTR="RAJA_USE_RESTRICT_ALIGNED_PTR" -DCMAKE_BUILD_TYPE=ICCBuild -DRAJA_ENABLE_TESTS=Off
+```
+For building with CUDA support, we use the following command.
+```
+cmake .. -DCMAKE_INSTALL_PREFIX=<prefix> -DRAJA_PTR="RAJA_USE_RESTRICT_ALIGNED_PTR" -DRAJA_ENABLE_CUDA=1 -DRAJA_ENABLE_TESTS=Off
 ```
-
 
 Results
 -------

From 9c08fdd18411b6adca8b2c94cae3e0ebaedc3845 Mon Sep 17 00:00:00 2001
From: Tom Deakin <tom.deakin@bristol.ac.uk>
Date: Thu, 6 Apr 2017 10:38:48 +0100
Subject: [PATCH 75/76] Minor version bump

---
 main.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/main.cpp b/main.cpp
index e6f2cbd..2c33f22 100644
--- a/main.cpp
+++ b/main.cpp
@@ -15,7 +15,7 @@
 #include <iomanip>
 #include <cstring>
 
-#define VERSION_STRING "3.1"
+#define VERSION_STRING "3.2"
 
 #include "Stream.h"
 

From dafc63030ff916b613be39916beccd20a0327583 Mon Sep 17 00:00:00 2001
From: Tom Deakin <tom.deakin@bristol.ac.uk>
Date: Sat, 8 Apr 2017 12:16:29 +0100
Subject: [PATCH 76/76] Rename to BabelStream

---
 LICENSE   | 14 +++++++-------
 README.md | 14 ++++++++++----
 main.cpp  |  2 +-
 3 files changed, 18 insertions(+), 12 deletions(-)

diff --git a/LICENSE b/LICENSE
index 1bc1114..1835c1a 100644
--- a/LICENSE
+++ b/LICENSE
@@ -12,22 +12,22 @@
 *  3. You are free to publish results obtained from running this
 *     program, or from works that you derive from this program,
 *     with the following limitations:
-*     3a. In order to be referred to as "GPU-STREAM benchmark results",
-*         published results must be in conformance to the GPU-STREAM
+*     3a. In order to be referred to as "BabelStream benchmark results",
+*         published results must be in conformance to the BabelStream
 *         Run Rules published at
-*         http://github.com/UoB-HPC/GPU-STREAM/wiki/Run-Rules
+*         http://github.com/UoB-HPC/BabelStream/wiki/Run-Rules
 *         and incorporated herein by reference.
 *         The copyright holders retain the
 *         right to determine conformity with the Run Rules.
 *     3b. Results based on modified source code or on runs not in
-*         accordance with the GPU-STREAM Run Rules must be clearly
+*         accordance with the BabelStream Run Rules must be clearly
 *         labelled whenever they are published.  Examples of
 *         proper labelling include:
-*         "tuned GPU-STREAM benchmark results"
-*         "based on a variant of the GPU-STREAM benchmark code"
+*         "tuned BabelStream benchmark results"
+*         "based on a variant of the BabelStream benchmark code"
 *         Other comparable, clear and reasonable labelling is
 *         acceptable.
-*     3c. Submission of results to the GPU-STREAM benchmark web site
+*     3c. Submission of results to the BabelStream benchmark web site
 *         is encouraged, but not required.
 *  4. Use of this program or creation of derived works based on this
 *     program constitutes acceptance of these licensing restrictions.
diff --git a/README.md b/README.md
index e1dcfdc..0f93be6 100644
--- a/README.md
+++ b/README.md
@@ -1,4 +1,4 @@
-GPU-STREAM
+BabelStream
 ==========
 
 Measure memory transfer rates to/from global device memory on GPUs.
@@ -16,9 +16,11 @@ Currently implemented are:
   - RAJA
   - SYCL
 
+This code was previously called GPU-STREAM.
+
 Website
 -------
-[uob-hpc.github.io/GPU-STREAM/](https://uob-hpc.github.io/GPU-STREAM/)
+[uob-hpc.github.io/BabelStream/](https://uob-hpc.github.io/BabelStream/)
 
 Usage
 -----
@@ -68,13 +70,17 @@ Sample results can be found in the `results` subdirectory. If you would like to
 Citing
 ------
 
-You can view the [Poster and Extended Abstract](http://sc15.supercomputing.org/sites/all/themes/SC15images/tech_poster/tech_poster_pages/post150.html) on GPU-STREAM presented at SC'15. Please cite GPU-STREAM via this reference:
+Please cite BabelStream via this reference:
 
 > Deakin T, Price J, Martineau M, McIntosh-Smith S. GPU-STREAM v2.0: Benchmarking the achievable memory bandwidth of many-core processors across diverse parallel programming models. 2016. Paper presented at P^3MA Workshop at ISC High Performance, Frankfurt, Germany.
 
-**Other GPU-STREAM publications:**
+**Other BabelStream publications:**
 
 > Deakin T, McIntosh-Smith S. GPU-STREAM: Benchmarking the achievable memory bandwidth of Graphics Processing Units. 2015. Poster session presented at IEEE/ACM SuperComputing, Austin, United States.
+You can view the [Poster and Extended Abstract](http://sc15.supercomputing.org/sites/all/themes/SC15images/tech_poster/tech_poster_pages/post150.html).
+
+> Deakin T, Price J, Martineau M, McIntosh-Smith S. GPU-STREAM: Now in 2D!. 2016. Poster session presented at IEEE/ACM SuperComputing, Salt Lake City, United States.
+You can view the [Poster and Extended Abstract](http://sc16.supercomputing.org/sc-archive/tech_poster/tech_poster_pages/post139.html).
 
 
 
diff --git a/main.cpp b/main.cpp
index 2c33f22..33cef1e 100644
--- a/main.cpp
+++ b/main.cpp
@@ -54,7 +54,7 @@ void parseArguments(int argc, char *argv[]);
 int main(int argc, char *argv[])
 {
   std::cout
-    << "GPU-STREAM" << std::endl
+    << "BabelStream" << std::endl
     << "Version: " << VERSION_STRING << std::endl
     << "Implementation: " << IMPLEMENTATION_STRING << std::endl;