diff --git a/KOKKOSStream.cpp b/KOKKOSStream.cpp
index 9abcf87..45f4dff 100644
--- a/KOKKOSStream.cpp
+++ b/KOKKOSStream.cpp
@@ -39,7 +39,7 @@ void KOKKOSStream<T>::init_arrays(T initA, T initB, T initC)
   View a(*d_a);
   View b(*d_b);
   View c(*d_c);
-  parallel_for(array_size, KOKKOS_LAMBDA (const int index)
+  parallel_for(array_size, KOKKOS_LAMBDA (const long index)
   {
     a[index] = initA;
     b[index] = initB;
@@ -70,7 +70,7 @@ void KOKKOSStream<T>::copy()
   View b(*d_b);
   View c(*d_c);
 
-  parallel_for(array_size, KOKKOS_LAMBDA (const int index)
+  parallel_for(array_size, KOKKOS_LAMBDA (const long index)
   {
     c[index] = a[index];
   });
@@ -85,7 +85,7 @@ void KOKKOSStream<T>::mul()
   View c(*d_c);
   const T scalar = startScalar;
 
-  parallel_for(array_size, KOKKOS_LAMBDA (const int index)
+  parallel_for(array_size, KOKKOS_LAMBDA (const long index)
   {
     b[index] = scalar*c[index];
   });
@@ -99,7 +99,7 @@ void KOKKOSStream<T>::add()
   View b(*d_b);
   View c(*d_c);
 
-  parallel_for(array_size, KOKKOS_LAMBDA (const int index)
+  parallel_for(array_size, KOKKOS_LAMBDA (const long index)
  {
     c[index] = a[index] + b[index];
   });
@@ -114,7 +114,7 @@ void KOKKOSStream<T>::triad()
   View c(*d_c);
   const T scalar = startScalar;
 
-  parallel_for(array_size, KOKKOS_LAMBDA (const int index)
+  parallel_for(array_size, KOKKOS_LAMBDA (const long index)
   {
     a[index] = b[index] + scalar*c[index];
   });
@@ -129,7 +129,7 @@ T KOKKOSStream<T>::dot()
 
   T sum = 0.0;
 
-  parallel_reduce(array_size, KOKKOS_LAMBDA (const int index, double &tmp)
+  parallel_reduce(array_size, KOKKOS_LAMBDA (const long index, double &tmp)
   {
     tmp += a[index] * b[index];
   }, sum);
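Note on the hunks above (not part of the patch): widening the lambda index from `int` to `long` lets the kernels address arrays with more than 2^31 - 1 elements, where a 32-bit index would overflow. A minimal sketch of the limit, assuming an LP64 platform where `long` is 64-bit:

```cpp
#include <climits>
#include <cstdio>

int main()
{
  // A large benchmark array: 2^31 elements no longer fits in an int index.
  const long n = 1L << 31;
  std::printf("INT_MAX            = %d\n", INT_MAX);  // 2147483647
  std::printf("last valid element = %ld\n", n - 1);   // representable as long, not as int
  return 0;
}
```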
diff --git a/Kokkos.make b/Kokkos.make
index 4bba1c8..1c6207d 100644
--- a/Kokkos.make
+++ b/Kokkos.make
@@ -3,6 +3,21 @@ default: kokkos-stream
 
 include $(KOKKOS_PATH)/Makefile.kokkos
 
+ifndef COMPILER
+define compiler_help
+Set COMPILER to change flags (defaulting to GNU).
+Available compilers are:
+  GNU INTEL
+
+endef
+$(info $(compiler_help))
+COMPILER=GNU
+endif
+
+COMPILER_GNU = g++
+COMPILER_INTEL = icpc -qopt-streaming-stores=always
+CXX = $(COMPILER_$(COMPILER))
+
 ifndef TARGET
 define target_help
 Set TARGET to change to offload device. Defaulting to CPU.
diff --git a/LICENSE b/LICENSE
index 1bc1114..1835c1a 100644
--- a/LICENSE
+++ b/LICENSE
@@ -12,22 +12,22 @@
 * 3. You are free to publish results obtained from running this
 *    program, or from works that you derive from this program,
 *    with the following limitations:
-* 3a. In order to be referred to as "GPU-STREAM benchmark results",
-*     published results must be in conformance to the GPU-STREAM
+* 3a. In order to be referred to as "BabelStream benchmark results",
+*     published results must be in conformance to the BabelStream
 *     Run Rules published at
-*     http://github.com/UoB-HPC/GPU-STREAM/wiki/Run-Rules
+*     http://github.com/UoB-HPC/BabelStream/wiki/Run-Rules
 *     and incorporated herein by reference.
 *     The copyright holders retain the
 *     right to determine conformity with the Run Rules.
 * 3b. Results based on modified source code or on runs not in
-*     accordance with the GPU-STREAM Run Rules must be clearly
+*     accordance with the BabelStream Run Rules must be clearly
 *     labelled whenever they are published. Examples of
 *     proper labelling include:
-*     "tuned GPU-STREAM benchmark results"
-*     "based on a variant of the GPU-STREAM benchmark code"
+*     "tuned BabelStream benchmark results"
+*     "based on a variant of the BabelStream benchmark code"
 *     Other comparable, clear and reasonable labelling is
 *     acceptable.
-* 3c. Submission of results to the GPU-STREAM benchmark web site
+* 3c. Submission of results to the BabelStream benchmark web site
 *     is encouraged, but not required.
 * 4. Use of this program or creation of derived works based on this
 *    program constitutes acceptance of these licensing restrictions.
diff --git a/RAJA.make b/RAJA.make
index 7fb821b..47aeefb 100644
--- a/RAJA.make
+++ b/RAJA.make
@@ -27,7 +27,7 @@ CXX_GNU = g++
 CXX_CRAY = CC
 CXX_XL = xlc++
 
-CXXFLAGS_INTEL = -O3 -std=c++11 -qopenmp
+CXXFLAGS_INTEL = -O3 -std=c++11 -qopenmp -xHost -qopt-streaming-stores=always
 CXXFLAGS_GNU = -O3 -std=c++11 -fopenmp
 CXXFLAGS_CRAY = -O3 -hstd=c++11
 CXXFLAGS_XL = -O5 -std=c++11 -qarch=pwr8 -qtune=pwr8 -qsmp=omp -qthreaded
diff --git a/RAJAStream.cpp b/RAJAStream.cpp
index 240f160..395a6ee 100644
--- a/RAJAStream.cpp
+++ b/RAJAStream.cpp
@@ -10,6 +10,10 @@
 using RAJA::forall;
 using RAJA::RangeSegment;
 
+#ifndef ALIGNMENT
+#define ALIGNMENT (2*1024*1024) // 2MB
+#endif
+
 template <class T>
 RAJAStream<T>::RAJAStream(const unsigned int ARRAY_SIZE, const int device_index)
   : array_size(ARRAY_SIZE)
@@ -18,9 +22,9 @@ RAJAStream<T>::RAJAStream(const unsigned int ARRAY_SIZE, const int device_index)
   index_set.push_back(seg);
 
 #ifdef RAJA_TARGET_CPU
-  d_a = new T[ARRAY_SIZE];
-  d_b = new T[ARRAY_SIZE];
-  d_c = new T[ARRAY_SIZE];
+  d_a = (T*)aligned_alloc(ALIGNMENT, sizeof(T)*array_size);
+  d_b = (T*)aligned_alloc(ALIGNMENT, sizeof(T)*array_size);
+  d_c = (T*)aligned_alloc(ALIGNMENT, sizeof(T)*array_size);
 #else
   cudaMallocManaged((void**)&d_a, sizeof(T)*ARRAY_SIZE, cudaMemAttachGlobal);
   cudaMallocManaged((void**)&d_b, sizeof(T)*ARRAY_SIZE, cudaMemAttachGlobal);
@@ -33,9 +37,9 @@ template <class T>
 RAJAStream<T>::~RAJAStream()
 {
 #ifdef RAJA_TARGET_CPU
-  delete[] d_a;
-  delete[] d_b;
-  delete[] d_c;
+  free(d_a);
+  free(d_b);
+  free(d_c);
 #else
   cudaFree(d_a);
   cudaFree(d_b);
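Note on the allocation hunk above (not part of the patch): `aligned_alloc` returns 2MB-aligned buffers, matching the `ALIGNMENT` macro, so the arrays can sit on huge-page boundaries. Strictly, C11 requires the requested size to be an integral multiple of the alignment (glibc is more forgiving), so a portable caller rounds the byte count up first. A minimal sketch with a hypothetical helper name:

```cpp
#include <cstdlib>
#include <cstdio>

// Hypothetical helper: round the byte count up to a multiple of the
// alignment before calling aligned_alloc, as strict C11 requires.
static void* alloc_aligned(std::size_t alignment, std::size_t bytes)
{
  const std::size_t rounded = (bytes + alignment - 1) / alignment * alignment;
  return std::aligned_alloc(alignment, rounded);
}

int main()
{
  const std::size_t alignment = 2 * 1024 * 1024;  // 2MB, as in ALIGNMENT
  double* a = static_cast<double*>(alloc_aligned(alignment, sizeof(double) * 1000));
  std::printf("a = %p\n", static_cast<void*>(a));  // address is a multiple of 2MB
  std::free(a);
  return 0;
}
```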
@@ -46,10 +50,10 @@ RAJAStream<T>::~RAJAStream()
 template <class T>
 void RAJAStream<T>::init_arrays(T initA, T initB, T initC)
 {
-  T* a = d_a;
-  T* b = d_b;
-  T* c = d_c;
-  forall(index_set, [=] RAJA_DEVICE (int index)
+  T* RAJA_RESTRICT a = d_a;
+  T* RAJA_RESTRICT b = d_b;
+  T* RAJA_RESTRICT c = d_c;
+  forall(index_set, [=] RAJA_DEVICE (RAJA::Index_type index)
   {
     a[index] = initA;
     b[index] = initB;
@@ -69,9 +73,9 @@ void RAJAStream<T>::read_arrays(
 template <class T>
 void RAJAStream<T>::copy()
 {
-  T* a = d_a;
-  T* c = d_c;
-  forall(index_set, [=] RAJA_DEVICE (int index)
+  T* RAJA_RESTRICT a = d_a;
+  T* RAJA_RESTRICT c = d_c;
+  forall(index_set, [=] RAJA_DEVICE (RAJA::Index_type index)
   {
     c[index] = a[index];
   });
@@ -80,10 +84,10 @@ void RAJAStream<T>::copy()
 template <class T>
 void RAJAStream<T>::mul()
 {
-  T* b = d_b;
-  T* c = d_c;
+  T* RAJA_RESTRICT b = d_b;
+  T* RAJA_RESTRICT c = d_c;
   const T scalar = startScalar;
-  forall(index_set, [=] RAJA_DEVICE (int index)
+  forall(index_set, [=] RAJA_DEVICE (RAJA::Index_type index)
   {
     b[index] = scalar*c[index];
   });
@@ -92,10 +96,10 @@ void RAJAStream<T>::mul()
 template <class T>
 void RAJAStream<T>::add()
 {
-  T* a = d_a;
-  T* b = d_b;
-  T* c = d_c;
-  forall(index_set, [=] RAJA_DEVICE (int index)
+  T* RAJA_RESTRICT a = d_a;
+  T* RAJA_RESTRICT b = d_b;
+  T* RAJA_RESTRICT c = d_c;
+  forall(index_set, [=] RAJA_DEVICE (RAJA::Index_type index)
   {
     c[index] = a[index] + b[index];
   });
@@ -104,11 +108,11 @@ void RAJAStream<T>::add()
 template <class T>
 void RAJAStream<T>::triad()
 {
-  T* a = d_a;
-  T* b = d_b;
-  T* c = d_c;
+  T* RAJA_RESTRICT a = d_a;
+  T* RAJA_RESTRICT b = d_b;
+  T* RAJA_RESTRICT c = d_c;
   const T scalar = startScalar;
-  forall(index_set, [=] RAJA_DEVICE (int index)
+  forall(index_set, [=] RAJA_DEVICE (RAJA::Index_type index)
   {
     a[index] = b[index] + scalar*c[index];
   });
@@ -117,12 +121,12 @@ void RAJAStream<T>::triad()
 template <class T>
 T RAJAStream<T>::dot()
 {
-  T* a = d_a;
-  T* b = d_b;
+  T* RAJA_RESTRICT a = d_a;
+  T* RAJA_RESTRICT b = d_b;
 
   RAJA::ReduceSum sum(0.0);
 
-  forall(index_set, [=] RAJA_DEVICE (int index)
+  forall(index_set, [=] RAJA_DEVICE (RAJA::Index_type index)
   {
     sum += a[index] * b[index];
   });
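Note on the kernel hunks above (not part of the patch): `RAJA_RESTRICT` qualifies the local pointer copies so the compiler may assume `a`, `b` and `c` do not alias, enabling vectorization and streaming stores without runtime overlap checks. A minimal sketch of the effect in plain C++, assuming the GCC/Clang/ICC `__restrict__` extension, a typical expansion of `RAJA_RESTRICT`:

```cpp
#include <cstddef>

// With the no-alias promise the loop vectorizes cleanly; without it the
// compiler must guard against a overlapping b or c.
void triad(double* __restrict__ a, const double* __restrict__ b,
           const double* __restrict__ c, double scalar, std::size_t n)
{
  for (std::size_t i = 0; i < n; i++)
    a[i] = b[i] + scalar * c[i];
}
```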
diff --git a/README.md b/README.md
index 1e2792d..0f93be6 100644
--- a/README.md
+++ b/README.md
@@ -1,4 +1,4 @@
-GPU-STREAM
+BabelStream
 ==========
 
 Measure memory transfer rates to/from global device memory on GPUs.
@@ -16,9 +16,11 @@ Currently implemented are:
 - RAJA
 - SYCL
 
+This code was previously called GPU-STREAM.
+
 Website
 -------
-[uob-hpc.github.io/GPU-STREAM/](https://uob-hpc.github.io/GPU-STREAM/)
+[uob-hpc.github.io/BabelStream/](https://uob-hpc.github.io/BabelStream/)
 
 Usage
 -----
@@ -36,6 +38,29 @@ Pass in extra flags via the `EXTRA_FLAGS` option.
 
 The binaries are named in the form `<implementation>-stream`.
 
+Building Kokkos
+---------------
+
+We use the following command to build Kokkos using the Intel Compiler, specifying the `arch` appropriately, e.g. `KNL`.
+```
+../generate_makefile.bash --prefix=<install path> --with-openmp --with-pthread --arch=<arch> --compiler=icpc --cxxflags=-DKOKKOS_MEMORY_ALIGNMENT=2097152
+```
+For building with CUDA support, we use the following command, specifying the `arch` appropriately, e.g. `Kepler35`.
+```
+../generate_makefile.bash --prefix=<install path> --with-cuda --with-openmp --with-pthread --arch=<arch> --with-cuda-options=enable_lambda
+```
+
+Building RAJA
+-------------
+
+We use the following command to build RAJA using the Intel Compiler.
+```
+cmake .. -DCMAKE_INSTALL_PREFIX=<install path> -DCMAKE_C_COMPILER=icc -DCMAKE_CXX_COMPILER=icpc -DRAJA_PTR="RAJA_USE_RESTRICT_ALIGNED_PTR" -DCMAKE_BUILD_TYPE=ICCBuild -DRAJA_ENABLE_TESTS=Off
+```
+For building with CUDA support, we use the following command.
+```
+cmake .. -DCMAKE_INSTALL_PREFIX=<install path> -DRAJA_PTR="RAJA_USE_RESTRICT_ALIGNED_PTR" -DRAJA_ENABLE_CUDA=1 -DRAJA_ENABLE_TESTS=Off
+```
+
 Results
 -------
 
@@ -45,13 +70,17 @@ Sample results can be found in the `results` subdirectory. If you would like to
 Citing
 ------
 
-You can view the [Poster and Extended Abstract](http://sc15.supercomputing.org/sites/all/themes/SC15images/tech_poster/tech_poster_pages/post150.html) on GPU-STREAM presented at SC'15. Please cite GPU-STREAM via this reference:
+Please cite BabelStream via this reference:
 
 > Deakin T, Price J, Martineau M, McIntosh-Smith S. GPU-STREAM v2.0: Benchmarking the achievable memory bandwidth of many-core processors across diverse parallel programming models. 2016. Paper presented at P^3MA Workshop at ISC High Performance, Frankfurt, Germany.
 
-**Other GPU-STREAM publications:**
+**Other BabelStream publications:**
 
 > Deakin T, McIntosh-Smith S. GPU-STREAM: Benchmarking the achievable memory bandwidth of Graphics Processing Units. 2015. Poster session presented at IEEE/ACM SuperComputing, Austin, United States.
 
+You can view the [Poster and Extended Abstract](http://sc15.supercomputing.org/sites/all/themes/SC15images/tech_poster/tech_poster_pages/post150.html).
+
+> Deakin T, Price J, Martineau M, McIntosh-Smith S. GPU-STREAM: Now in 2D!. 2016. Poster session presented at IEEE/ACM SuperComputing, Salt Lake City, United States.
+
+You can view the [Poster and Extended Abstract](http://sc16.supercomputing.org/sc-archive/tech_poster/tech_poster_pages/post139.html).
diff --git a/main.cpp b/main.cpp
index 5cd4ddc..8a2108c 100644
--- a/main.cpp
+++ b/main.cpp
@@ -15,7 +15,7 @@
 #include
 #include
 
-#define VERSION_STRING "3.1"
+#define VERSION_STRING "3.2"
 
 #include "Stream.h"
 
@@ -56,7 +56,7 @@ void parseArguments(int argc, char *argv[]);
 int main(int argc, char *argv[])
 {
   std::cout
-    << "GPU-STREAM" << std::endl
+    << "BabelStream" << std::endl
     << "Version: " << VERSION_STRING << std::endl
     << "Implementation: " << IMPLEMENTATION_STRING << std::endl;