From d9dfc3f552706cf38beb25a2bf2271e03adb4340 Mon Sep 17 00:00:00 2001
From: Tom Deakin <tom.deakin@bristol.ac.uk>
Date: Wed, 5 Apr 2017 21:57:55 +0100
Subject: [PATCH 01/11] [Kokkos] Use long for iterator variable

---
 KOKKOSStream.cpp | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)
diff --git a/KOKKOSStream.cpp b/KOKKOSStream.cpp
index 9abcf87..45f4dff 100644
--- a/KOKKOSStream.cpp
+++ b/KOKKOSStream.cpp
@@ -39,7 +39,7 @@ void KOKKOSStream<T>::init_arrays(T initA, T initB, T initC)
   View<double*, DEVICE> a(*d_a);
   View<double*, DEVICE> b(*d_b);
   View<double*, DEVICE> c(*d_c);
-  parallel_for(array_size, KOKKOS_LAMBDA (const int index)
+  parallel_for(array_size, KOKKOS_LAMBDA (const long index)
   {
     a[index] = initA;
     b[index] = initB;
@@ -70,7 +70,7 @@ void KOKKOSStream<T>::copy()
   View<double*, DEVICE> b(*d_b);
   View<double*, DEVICE> c(*d_c);
 
-  parallel_for(array_size, KOKKOS_LAMBDA (const int index)
+  parallel_for(array_size, KOKKOS_LAMBDA (const long index)
   {
     c[index] = a[index];
   });
@@ -85,7 +85,7 @@ void KOKKOSStream<T>::mul()
   View<double*, DEVICE> c(*d_c);
 
   const T scalar = startScalar;
-  parallel_for(array_size, KOKKOS_LAMBDA (const int index)
+  parallel_for(array_size, KOKKOS_LAMBDA (const long index)
   {
     b[index] = scalar*c[index];
   });
@@ -99,7 +99,7 @@ void KOKKOSStream<T>::add()
   View<double*, DEVICE> b(*d_b);
   View<double*, DEVICE> c(*d_c);
 
-  parallel_for(array_size, KOKKOS_LAMBDA (const int index)
+  parallel_for(array_size, KOKKOS_LAMBDA (const long index)
   {
     c[index] = a[index] + b[index];
   });
@@ -114,7 +114,7 @@ void KOKKOSStream<T>::triad()
   View<double*, DEVICE> c(*d_c);
 
   const T scalar = startScalar;
-  parallel_for(array_size, KOKKOS_LAMBDA (const int index)
+  parallel_for(array_size, KOKKOS_LAMBDA (const long index)
   {
     a[index] = b[index] + scalar*c[index];
   });
@@ -129,7 +129,7 @@ T KOKKOSStream<T>::dot()
 
   T sum = 0.0;
 
-  parallel_reduce(array_size, KOKKOS_LAMBDA (const int index, double &tmp)
+  parallel_reduce(array_size, KOKKOS_LAMBDA (const long index, double &tmp)
   {
     tmp += a[index] * b[index];
   }, sum);

From d7e38c1ca9d30e2b73cc7c4aa5b6b69f92269822 Mon Sep 17 00:00:00 2001
From: Tom Deakin <tom.deakin@bristol.ac.uk>
Date: Wed, 5 Apr 2017 22:09:58 +0100
Subject: [PATCH 02/11] Add Kokkos build instructions to README

---
 README.md | 12 ++++++++++++
 1 file changed, 12 insertions(+)

diff --git a/README.md b/README.md
index 1e2792d..1efd443 100644
--- a/README.md
+++ b/README.md
@@ -36,6 +36,18 @@ Pass in extra flags via the `EXTRA_FLAGS` option.
 
 The binaries are named in the form `<model>-stream`.
 
+Building Kokkos
+---------------
+
+We use the following command to build Kokkos using the Intel Compiler, specifying the `arch` appropriately, e.g. `KNL`.
+```
+../generate_makefile.bash --prefix=<prefix> --with-openmp --with-pthread --arch=<arch> --compiler=icpc --cxxflags=-DKOKKOS_MEMORY_ALIGNMENT=2097152
+```
+For building with CUDA support, we use the following command, specifying the `arch` appropriately, e.g. `Kepler35`.
+```
+../generate_makefile.bash --prefix=<prefix> --with-cuda --with-openmp --with-pthread --arch=<arch> --with-cuda-options=enable_lambda
+```
+
 
 Results
 -------

From d7a93be73972b627b68370487838501e09f31ba4 Mon Sep 17 00:00:00 2001
From: Tom Deakin <tom.deakin@bristol.ac.uk>
Date: Wed, 5 Apr 2017 22:23:27 +0100
Subject: [PATCH 03/11] [Kokkos] Add a COMPILER option to Makefile, which turns
 on streaming stores for Intel

---
 Kokkos.make | 15 +++++++++++++++
 1 file changed, 15 insertions(+)

diff --git a/Kokkos.make b/Kokkos.make
index 4bba1c8..1c6207d 100644
--- a/Kokkos.make
+++ b/Kokkos.make
@@ -3,6 +3,21 @@ default: kokkos-stream
 
 include $(KOKKOS_PATH)/Makefile.kokkos
 
+ifndef COMPILER
+define compiler_help
+Set COMPILER to change flags (defaulting to GNU).
+Available compilers are:
+  GNU INTEL
+
+endef
+$(info $(compiler_help))
+COMPILER=GNU
+endif
+
+COMPILER_GNU = g++
+COMPILER_INTEL = icpc -qopt-streaming-stores=always
+CXX = $(COMPILER_$(COMPILER))
+
 ifndef TARGET
 define target_help
 Set TARGET to change to offload device. Defaulting to CPU.

From 1eb75f034a8af3af53852b9e791b7cb91b6274a0 Mon Sep 17 00:00:00 2001
From: Tom Deakin <tom.deakin@bristol.ac.uk>
Date: Thu, 6 Apr 2017 10:02:25 +0100
Subject: [PATCH 04/11] [RAJA] Use xHost and streaming stores with the Intel
 compiler

---
 RAJA.make | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/RAJA.make b/RAJA.make
index 7fb821b..47aeefb 100644
--- a/RAJA.make
+++ b/RAJA.make
@@ -27,7 +27,7 @@ CXX_GNU   = g++
 CXX_CRAY  = CC
 CXX_XL    = xlc++
 
-CXXFLAGS_INTEL = -O3 -std=c++11 -qopenmp
+CXXFLAGS_INTEL = -O3 -std=c++11 -qopenmp -xHost -qopt-streaming-stores=always
 CXXFLAGS_GNU   = -O3 -std=c++11 -fopenmp
 CXXFLAGS_CRAY  = -O3 -hstd=c++11
 CXXFLAGS_XL    = -O5 -std=c++11 -qarch=pwr8 -qtune=pwr8 -qsmp=omp -qthreaded

From 1bd4adfe7bad207deac51aae40f370dc4018953d Mon Sep 17 00:00:00 2001
From: Tom Deakin <tom.deakin@bristol.ac.uk>
Date: Thu, 6 Apr 2017 10:14:51 +0100
Subject: [PATCH 05/11] [RAJA] Align the memory to 2MB pages

---
 RAJAStream.cpp | 16 ++++++++++------
 1 file changed, 10 insertions(+), 6 deletions(-)

diff --git a/RAJAStream.cpp b/RAJAStream.cpp
index 240f160..8d3f365 100644
--- a/RAJAStream.cpp
+++ b/RAJAStream.cpp
@@ -10,6 +10,10 @@
 using RAJA::forall;
 using RAJA::RangeSegment;
 
+#ifndef ALIGNMENT
+#define ALIGNMENT (2*1024*1024) // 2MB
+#endif
+
 template <class T>
 RAJAStream<T>::RAJAStream(const unsigned int ARRAY_SIZE, const int device_index)
     : array_size(ARRAY_SIZE)
@@ -18,9 +22,9 @@ RAJAStream<T>::RAJAStream(const unsigned int ARRAY_SIZE, const int device_index)
   index_set.push_back(seg);
 
 #ifdef RAJA_TARGET_CPU
-  d_a = new T[ARRAY_SIZE];
-  d_b = new T[ARRAY_SIZE];
-  d_c = new T[ARRAY_SIZE];
+  d_a = (T*)aligned_alloc(ALIGNMENT, sizeof(T)*array_size);
+  d_b = (T*)aligned_alloc(ALIGNMENT, sizeof(T)*array_size);
+  d_c = (T*)aligned_alloc(ALIGNMENT, sizeof(T)*array_size);
 #else
   cudaMallocManaged((void**)&d_a, sizeof(T)*ARRAY_SIZE, cudaMemAttachGlobal);
   cudaMallocManaged((void**)&d_b, sizeof(T)*ARRAY_SIZE, cudaMemAttachGlobal);
@@ -33,9 +37,9 @@ template <class T>
 RAJAStream<T>::~RAJAStream()
 {
 #ifdef RAJA_TARGET_CPU
-  delete[] d_a;
-  delete[] d_b;
-  delete[] d_c;
+  free(d_a);
+  free(d_b);
+  free(d_c);
 #else
   cudaFree(d_a);
   cudaFree(d_b);

From 5f9b288570cb148245715deb8c3e5b62ba5f72b6 Mon Sep 17 00:00:00 2001
From: Tom Deakin <tom.deakin@bristol.ac.uk>
Date: Thu, 6 Apr 2017 10:15:11 +0100
Subject: [PATCH 06/11] [RAJA] Declare pointers using RAJA_RESTRICT

---
 RAJAStream.cpp | 30 +++++++++++++++---------------
 1 file changed, 15 insertions(+), 15 deletions(-)

diff --git a/RAJAStream.cpp b/RAJAStream.cpp
index 8d3f365..6c6098e 100644
--- a/RAJAStream.cpp
+++ b/RAJAStream.cpp
@@ -50,9 +50,9 @@ RAJAStream<T>::~RAJAStream()
 template <class T>
 void RAJAStream<T>::init_arrays(T initA, T initB, T initC)
 {
-  T* a = d_a;
-  T* b = d_b;
-  T* c = d_c;
+  T* RAJA_RESTRICT a = d_a;
+  T* RAJA_RESTRICT b = d_b;
+  T* RAJA_RESTRICT c = d_c;
   forall<policy>(index_set, [=] RAJA_DEVICE (int index)
   {
     a[index] = initA;
@@ -73,8 +73,8 @@ void RAJAStream<T>::read_arrays(
 template <class T>
 void RAJAStream<T>::copy()
 {
-  T* a = d_a;
-  T* c = d_c;
+  T* RAJA_RESTRICT a = d_a;
+  T* RAJA_RESTRICT c = d_c;
   forall<policy>(index_set, [=] RAJA_DEVICE (int index)
   {
     c[index] = a[index];
@@ -84,8 +84,8 @@ void RAJAStream<T>::copy()
 template <class T>
 void RAJAStream<T>::mul()
 {
-  T* b = d_b;
-  T* c = d_c;
+  T* RAJA_RESTRICT b = d_b;
+  T* RAJA_RESTRICT c = d_c;
   const T scalar = startScalar;
   forall<policy>(index_set, [=] RAJA_DEVICE (int index)
   {
@@ -96,9 +96,9 @@ void RAJAStream<T>::mul()
 template <class T>
 void RAJAStream<T>::add()
 {
-  T* a = d_a;
-  T* b = d_b;
-  T* c = d_c;
+  T* RAJA_RESTRICT a = d_a;
+  T* RAJA_RESTRICT b = d_b;
+  T* RAJA_RESTRICT c = d_c;
   forall<policy>(index_set, [=] RAJA_DEVICE (int index)
   {
     c[index] = a[index] + b[index];
@@ -108,9 +108,9 @@ void RAJAStream<T>::add()
 template <class T>
 void RAJAStream<T>::triad()
 {
-  T* a = d_a;
-  T* b = d_b;
-  T* c = d_c;
+  T* RAJA_RESTRICT a = d_a;
+  T* RAJA_RESTRICT b = d_b;
+  T* RAJA_RESTRICT c = d_c;
   const T scalar = startScalar;
   forall<policy>(index_set, [=] RAJA_DEVICE (int index)
   {
@@ -121,8 +121,8 @@ void RAJAStream<T>::triad()
 template <class T>
 T RAJAStream<T>::dot()
 {
-  T* a = d_a;
-  T* b = d_b;
+  T* RAJA_RESTRICT a = d_a;
+  T* RAJA_RESTRICT b = d_b;
 
   RAJA::ReduceSum<reduce_policy, T> sum(0.0);
 

From 3331f62f42eb89741f5ed01a393f3492a1331ebc Mon Sep 17 00:00:00 2001
From: Tom Deakin <tom.deakin@bristol.ac.uk>
Date: Thu, 6 Apr 2017 10:16:34 +0100
Subject: [PATCH 07/11] Add RAJA build instructions to README

---
 README.md | 8 ++++++++
 1 file changed, 8 insertions(+)

diff --git a/README.md b/README.md
index 1efd443..938cb81 100644
--- a/README.md
+++ b/README.md
@@ -48,6 +48,14 @@ For building with CUDA support, we use the following command, specifying the `ar
 ../generate_makefile.bash --prefix=<prefix> --with-cuda --with-openmp --with-pthread --arch=<arch> --with-cuda-options=enable_lambda
 ```
 
+Building RAJA
+-------------
+
+We use the following command to build RAJA using the Intel Compiler.
+```
+cmake ../../ -DCMAKE_INSTALL_PREFIX=<prefix> -DCMAKE_C_COMPILER=icc -DCMAKE_CXX_COMPILER=icpc -DRAJA_PTR="RAJA_USE_RESTRICT_ALIGNED_PTR" -DCMAKE_BUILD_TYPE=ICCBuild -DRAJA_ENABLE_TESTS=Off
+```
+
 
 Results
 -------

From c534600d04a78a3ec77e1818a291d6cae728b7c3 Mon Sep 17 00:00:00 2001
From: Tom Deakin <tom.deakin@bristol.ac.uk>
Date: Thu, 6 Apr 2017 10:36:01 +0100
Subject: [PATCH 08/11] [RAJA] Use Index_type for iterator index type instead
 of hardcoding int

---
 RAJAStream.cpp | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/RAJAStream.cpp b/RAJAStream.cpp
index 6c6098e..395a6ee 100644
--- a/RAJAStream.cpp
+++ b/RAJAStream.cpp
@@ -53,7 +53,7 @@ void RAJAStream<T>::init_arrays(T initA, T initB, T initC)
   T* RAJA_RESTRICT a = d_a;
   T* RAJA_RESTRICT b = d_b;
   T* RAJA_RESTRICT c = d_c;
-  forall<policy>(index_set, [=] RAJA_DEVICE (int index)
+  forall<policy>(index_set, [=] RAJA_DEVICE (RAJA::Index_type index)
   {
     a[index] = initA;
     b[index] = initB;
@@ -75,7 +75,7 @@ void RAJAStream<T>::copy()
 {
   T* RAJA_RESTRICT a = d_a;
   T* RAJA_RESTRICT c = d_c;
-  forall<policy>(index_set, [=] RAJA_DEVICE (int index)
+  forall<policy>(index_set, [=] RAJA_DEVICE (RAJA::Index_type index)
   {
     c[index] = a[index];
   });
@@ -87,7 +87,7 @@ void RAJAStream<T>::mul()
   T* RAJA_RESTRICT b = d_b;
   T* RAJA_RESTRICT c = d_c;
   const T scalar = startScalar;
-  forall<policy>(index_set, [=] RAJA_DEVICE (int index)
+  forall<policy>(index_set, [=] RAJA_DEVICE (RAJA::Index_type index)
   {
     b[index] = scalar*c[index];
   });
@@ -99,7 +99,7 @@ void RAJAStream<T>::add()
   T* RAJA_RESTRICT a = d_a;
   T* RAJA_RESTRICT b = d_b;
   T* RAJA_RESTRICT c = d_c;
-  forall<policy>(index_set, [=] RAJA_DEVICE (int index)
+  forall<policy>(index_set, [=] RAJA_DEVICE (RAJA::Index_type index)
   {
     c[index] = a[index] + b[index];
   });
@@ -112,7 +112,7 @@ void RAJAStream<T>::triad()
   T* RAJA_RESTRICT b = d_b;
   T* RAJA_RESTRICT c = d_c;
   const T scalar = startScalar;
-  forall<policy>(index_set, [=] RAJA_DEVICE (int index)
+  forall<policy>(index_set, [=] RAJA_DEVICE (RAJA::Index_type index)
   {
     a[index] = b[index] + scalar*c[index];
   });
@@ -126,7 +126,7 @@ T RAJAStream<T>::dot()
 
   RAJA::ReduceSum<reduce_policy, T> sum(0.0);
 
-  forall<policy>(index_set, [=] RAJA_DEVICE (int index)
+  forall<policy>(index_set, [=] RAJA_DEVICE (RAJA::Index_type index)
   {
     sum += a[index] * b[index];
   });

From 50e3a1970fcd9712ca6fb7513233b6e4e6ef9ca5 Mon Sep 17 00:00:00 2001
From: Tom Deakin <tom.deakin@bristol.ac.uk>
Date: Thu, 6 Apr 2017 10:38:03 +0100
Subject: [PATCH 09/11] Add RAJA CUDA build instructions

---
 README.md | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/README.md b/README.md
index 938cb81..e1dcfdc 100644
--- a/README.md
+++ b/README.md
@@ -53,9 +53,12 @@ Building RAJA
 
 We use the following command to build RAJA using the Intel Compiler.
 ```
-cmake ../../ -DCMAKE_INSTALL_PREFIX=<prefix> -DCMAKE_C_COMPILER=icc -DCMAKE_CXX_COMPILER=icpc -DRAJA_PTR="RAJA_USE_RESTRICT_ALIGNED_PTR" -DCMAKE_BUILD_TYPE=ICCBuild -DRAJA_ENABLE_TESTS=Off
+cmake .. -DCMAKE_INSTALL_PREFIX=<prefix> -DCMAKE_C_COMPILER=icc -DCMAKE_CXX_COMPILER=icpc -DRAJA_PTR="RAJA_USE_RESTRICT_ALIGNED_PTR" -DCMAKE_BUILD_TYPE=ICCBuild -DRAJA_ENABLE_TESTS=Off
+```
+For building with CUDA support, we use the following command.
+```
+cmake .. -DCMAKE_INSTALL_PREFIX=<prefix> -DRAJA_PTR="RAJA_USE_RESTRICT_ALIGNED_PTR" -DRAJA_ENABLE_CUDA=1 -DRAJA_ENABLE_TESTS=Off
 ```
-
 
 Results
 -------

From 9c08fdd18411b6adca8b2c94cae3e0ebaedc3845 Mon Sep 17 00:00:00 2001
From: Tom Deakin <tom.deakin@bristol.ac.uk>
Date: Thu, 6 Apr 2017 10:38:48 +0100
Subject: [PATCH 10/11] Minor version bump

---
 main.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/main.cpp b/main.cpp
index e6f2cbd..2c33f22 100644
--- a/main.cpp
+++ b/main.cpp
@@ -15,7 +15,7 @@
 #include <iomanip>
 #include <cstring>
 
-#define VERSION_STRING "3.1"
+#define VERSION_STRING "3.2"
 
 #include "Stream.h"
 

From dafc63030ff916b613be39916beccd20a0327583 Mon Sep 17 00:00:00 2001
From: Tom Deakin <tom.deakin@bristol.ac.uk>
Date: Sat, 8 Apr 2017 12:16:29 +0100
Subject: [PATCH 11/11] Rename to BabelStream

---
 LICENSE   | 14 +++++++-------
 README.md | 14 ++++++++++----
 main.cpp  |  2 +-
 3 files changed, 18 insertions(+), 12 deletions(-)

diff --git a/LICENSE b/LICENSE
index 1bc1114..1835c1a 100644
--- a/LICENSE
+++ b/LICENSE
@@ -12,22 +12,22 @@
 *  3. You are free to publish results obtained from running this
 *     program, or from works that you derive from this program,
 *     with the following limitations:
-*     3a. In order to be referred to as "GPU-STREAM benchmark results",
-*         published results must be in conformance to the GPU-STREAM
+*     3a. In order to be referred to as "BabelStream benchmark results",
+*         published results must be in conformance to the BabelStream
 *         Run Rules published at
-*         http://github.com/UoB-HPC/GPU-STREAM/wiki/Run-Rules
+*         http://github.com/UoB-HPC/BabelStream/wiki/Run-Rules
 *         and incorporated herein by reference.
 *         The copyright holders retain the
 *         right to determine conformity with the Run Rules.
 *     3b. Results based on modified source code or on runs not in
-*         accordance with the GPU-STREAM Run Rules must be clearly
+*         accordance with the BabelStream Run Rules must be clearly
 *         labelled whenever they are published.  Examples of
 *         proper labelling include:
-*         "tuned GPU-STREAM benchmark results"
-*         "based on a variant of the GPU-STREAM benchmark code"
+*         "tuned BabelStream benchmark results"
+*         "based on a variant of the BabelStream benchmark code"
 *         Other comparable, clear and reasonable labelling is
 *         acceptable.
-*     3c. Submission of results to the GPU-STREAM benchmark web site
+*     3c. Submission of results to the BabelStream benchmark web site
 *         is encouraged, but not required.
 *  4. Use of this program or creation of derived works based on this
 *     program constitutes acceptance of these licensing restrictions.
diff --git a/README.md b/README.md
index e1dcfdc..0f93be6 100644
--- a/README.md
+++ b/README.md
@@ -1,4 +1,4 @@
-GPU-STREAM
+BabelStream
 ==========
 
 Measure memory transfer rates to/from global device memory on GPUs.
@@ -16,9 +16,11 @@ Currently implemented are:
   - RAJA
   - SYCL
 
+This code was previously called GPU-STREAM.
+
 Website
 -------
-[uob-hpc.github.io/GPU-STREAM/](https://uob-hpc.github.io/GPU-STREAM/)
+[uob-hpc.github.io/BabelStream/](https://uob-hpc.github.io/BabelStream/)
 
 Usage
 -----
@@ -68,13 +70,17 @@ Sample results can be found in the `results` subdirectory. If you would like to
 Citing
 ------
 
-You can view the [Poster and Extended Abstract](http://sc15.supercomputing.org/sites/all/themes/SC15images/tech_poster/tech_poster_pages/post150.html) on GPU-STREAM presented at SC'15. Please cite GPU-STREAM via this reference:
+Please cite BabelStream via this reference:
 
 > Deakin T, Price J, Martineau M, McIntosh-Smith S. GPU-STREAM v2.0: Benchmarking the achievable memory bandwidth of many-core processors across diverse parallel programming models. 2016. Paper presented at P^3MA Workshop at ISC High Performance, Frankfurt, Germany.
 
-**Other GPU-STREAM publications:**
+**Other BabelStream publications:**
 
 > Deakin T, McIntosh-Smith S. GPU-STREAM: Benchmarking the achievable memory bandwidth of Graphics Processing Units. 2015. Poster session presented at IEEE/ACM SuperComputing, Austin, United States.
+You can view the [Poster and Extended Abstract](http://sc15.supercomputing.org/sites/all/themes/SC15images/tech_poster/tech_poster_pages/post150.html).
+
+> Deakin T, Price J, Martineau M, McIntosh-Smith S. GPU-STREAM: Now in 2D!. 2016. Poster session presented at IEEE/ACM SuperComputing, Salt Lake City, United States.
+You can view the [Poster and Extended Abstract](http://sc16.supercomputing.org/sc-archive/tech_poster/tech_poster_pages/post139.html).
 
 
 
diff --git a/main.cpp b/main.cpp
index 2c33f22..33cef1e 100644
--- a/main.cpp
+++ b/main.cpp
@@ -54,7 +54,7 @@ void parseArguments(int argc, char *argv[]);
 int main(int argc, char *argv[])
 {
   std::cout
-    << "GPU-STREAM" << std::endl
+    << "BabelStream" << std::endl
     << "Version: " << VERSION_STRING << std::endl
     << "Implementation: " << IMPLEMENTATION_STRING << std::endl;