Merge branch 'master' of https://github.com/UoB-HPC/BabelStream into rocm_hc_support

This commit is contained in:
Peter Steinbach 2017-07-25 17:05:31 +02:00
commit 26279688d1
7 changed files with 95 additions and 47 deletions

View File

@ -39,7 +39,7 @@ void KOKKOSStream<T>::init_arrays(T initA, T initB, T initC)
View<double*, DEVICE> a(*d_a);
View<double*, DEVICE> b(*d_b);
View<double*, DEVICE> c(*d_c);
parallel_for(array_size, KOKKOS_LAMBDA (const int index)
parallel_for(array_size, KOKKOS_LAMBDA (const long index)
{
a[index] = initA;
b[index] = initB;
@ -70,7 +70,7 @@ void KOKKOSStream<T>::copy()
View<double*, DEVICE> b(*d_b);
View<double*, DEVICE> c(*d_c);
parallel_for(array_size, KOKKOS_LAMBDA (const int index)
parallel_for(array_size, KOKKOS_LAMBDA (const long index)
{
c[index] = a[index];
});
@ -85,7 +85,7 @@ void KOKKOSStream<T>::mul()
View<double*, DEVICE> c(*d_c);
const T scalar = startScalar;
parallel_for(array_size, KOKKOS_LAMBDA (const int index)
parallel_for(array_size, KOKKOS_LAMBDA (const long index)
{
b[index] = scalar*c[index];
});
@ -99,7 +99,7 @@ void KOKKOSStream<T>::add()
View<double*, DEVICE> b(*d_b);
View<double*, DEVICE> c(*d_c);
parallel_for(array_size, KOKKOS_LAMBDA (const int index)
parallel_for(array_size, KOKKOS_LAMBDA (const long index)
{
c[index] = a[index] + b[index];
});
@ -114,7 +114,7 @@ void KOKKOSStream<T>::triad()
View<double*, DEVICE> c(*d_c);
const T scalar = startScalar;
parallel_for(array_size, KOKKOS_LAMBDA (const int index)
parallel_for(array_size, KOKKOS_LAMBDA (const long index)
{
a[index] = b[index] + scalar*c[index];
});
@ -129,7 +129,7 @@ T KOKKOSStream<T>::dot()
T sum = 0.0;
parallel_reduce(array_size, KOKKOS_LAMBDA (const int index, double &tmp)
parallel_reduce(array_size, KOKKOS_LAMBDA (const long index, double &tmp)
{
tmp += a[index] * b[index];
}, sum);

View File

@ -3,6 +3,21 @@ default: kokkos-stream
include $(KOKKOS_PATH)/Makefile.kokkos
ifndef COMPILER
define compiler_help
Set COMPILER to change flags (defaulting to GNU).
Available compilers are:
GNU INTEL
endef
$(info $(compiler_help))
COMPILER=GNU
endif
COMPILER_GNU = g++
COMPILER_INTEL = icpc -qopt-streaming-stores=always
CXX = $(COMPILER_$(COMPILER))
ifndef TARGET
define target_help
Set TARGET to change to offload device. Defaulting to CPU.

14
LICENSE
View File

@ -12,22 +12,22 @@
* 3. You are free to publish results obtained from running this
* program, or from works that you derive from this program,
* with the following limitations:
* 3a. In order to be referred to as "GPU-STREAM benchmark results",
* published results must be in conformance to the GPU-STREAM
* 3a. In order to be referred to as "BabelStream benchmark results",
* published results must be in conformance to the BabelStream
* Run Rules published at
* http://github.com/UoB-HPC/GPU-STREAM/wiki/Run-Rules
* http://github.com/UoB-HPC/BabelStream/wiki/Run-Rules
* and incorporated herein by reference.
* The copyright holders retain the
* right to determine conformity with the Run Rules.
* 3b. Results based on modified source code or on runs not in
* accordance with the GPU-STREAM Run Rules must be clearly
* accordance with the BabelStream Run Rules must be clearly
* labelled whenever they are published. Examples of
* proper labelling include:
* "tuned GPU-STREAM benchmark results"
* "based on a variant of the GPU-STREAM benchmark code"
* "tuned BabelStream benchmark results"
* "based on a variant of the BabelStream benchmark code"
* Other comparable, clear and reasonable labelling is
* acceptable.
* 3c. Submission of results to the GPU-STREAM benchmark web site
* 3c. Submission of results to the BabelStream benchmark web site
* is encouraged, but not required.
* 4. Use of this program or creation of derived works based on this
* program constitutes acceptance of these licensing restrictions.

View File

@ -27,7 +27,7 @@ CXX_GNU = g++
CXX_CRAY = CC
CXX_XL = xlc++
CXXFLAGS_INTEL = -O3 -std=c++11 -qopenmp
CXXFLAGS_INTEL = -O3 -std=c++11 -qopenmp -xHost -qopt-streaming-stores=always
CXXFLAGS_GNU = -O3 -std=c++11 -fopenmp
CXXFLAGS_CRAY = -O3 -hstd=c++11
CXXFLAGS_XL = -O5 -std=c++11 -qarch=pwr8 -qtune=pwr8 -qsmp=omp -qthreaded

View File

@ -10,6 +10,10 @@
using RAJA::forall;
using RAJA::RangeSegment;
#ifndef ALIGNMENT
#define ALIGNMENT (2*1024*1024) // 2MB
#endif
template <class T>
RAJAStream<T>::RAJAStream(const unsigned int ARRAY_SIZE, const int device_index)
: array_size(ARRAY_SIZE)
@ -18,9 +22,9 @@ RAJAStream<T>::RAJAStream(const unsigned int ARRAY_SIZE, const int device_index)
index_set.push_back(seg);
#ifdef RAJA_TARGET_CPU
d_a = new T[ARRAY_SIZE];
d_b = new T[ARRAY_SIZE];
d_c = new T[ARRAY_SIZE];
d_a = (T*)aligned_alloc(ALIGNMENT, sizeof(T)*array_size);
d_b = (T*)aligned_alloc(ALIGNMENT, sizeof(T)*array_size);
d_c = (T*)aligned_alloc(ALIGNMENT, sizeof(T)*array_size);
#else
cudaMallocManaged((void**)&d_a, sizeof(T)*ARRAY_SIZE, cudaMemAttachGlobal);
cudaMallocManaged((void**)&d_b, sizeof(T)*ARRAY_SIZE, cudaMemAttachGlobal);
@ -33,9 +37,9 @@ template <class T>
RAJAStream<T>::~RAJAStream()
{
#ifdef RAJA_TARGET_CPU
delete[] d_a;
delete[] d_b;
delete[] d_c;
free(d_a);
free(d_b);
free(d_c);
#else
cudaFree(d_a);
cudaFree(d_b);
@ -46,10 +50,10 @@ RAJAStream<T>::~RAJAStream()
template <class T>
void RAJAStream<T>::init_arrays(T initA, T initB, T initC)
{
T* a = d_a;
T* b = d_b;
T* c = d_c;
forall<policy>(index_set, [=] RAJA_DEVICE (int index)
T* RAJA_RESTRICT a = d_a;
T* RAJA_RESTRICT b = d_b;
T* RAJA_RESTRICT c = d_c;
forall<policy>(index_set, [=] RAJA_DEVICE (RAJA::Index_type index)
{
a[index] = initA;
b[index] = initB;
@ -69,9 +73,9 @@ void RAJAStream<T>::read_arrays(
template <class T>
void RAJAStream<T>::copy()
{
T* a = d_a;
T* c = d_c;
forall<policy>(index_set, [=] RAJA_DEVICE (int index)
T* RAJA_RESTRICT a = d_a;
T* RAJA_RESTRICT c = d_c;
forall<policy>(index_set, [=] RAJA_DEVICE (RAJA::Index_type index)
{
c[index] = a[index];
});
@ -80,10 +84,10 @@ void RAJAStream<T>::copy()
template <class T>
void RAJAStream<T>::mul()
{
T* b = d_b;
T* c = d_c;
T* RAJA_RESTRICT b = d_b;
T* RAJA_RESTRICT c = d_c;
const T scalar = startScalar;
forall<policy>(index_set, [=] RAJA_DEVICE (int index)
forall<policy>(index_set, [=] RAJA_DEVICE (RAJA::Index_type index)
{
b[index] = scalar*c[index];
});
@ -92,10 +96,10 @@ void RAJAStream<T>::mul()
template <class T>
void RAJAStream<T>::add()
{
T* a = d_a;
T* b = d_b;
T* c = d_c;
forall<policy>(index_set, [=] RAJA_DEVICE (int index)
T* RAJA_RESTRICT a = d_a;
T* RAJA_RESTRICT b = d_b;
T* RAJA_RESTRICT c = d_c;
forall<policy>(index_set, [=] RAJA_DEVICE (RAJA::Index_type index)
{
c[index] = a[index] + b[index];
});
@ -104,11 +108,11 @@ void RAJAStream<T>::add()
template <class T>
void RAJAStream<T>::triad()
{
T* a = d_a;
T* b = d_b;
T* c = d_c;
T* RAJA_RESTRICT a = d_a;
T* RAJA_RESTRICT b = d_b;
T* RAJA_RESTRICT c = d_c;
const T scalar = startScalar;
forall<policy>(index_set, [=] RAJA_DEVICE (int index)
forall<policy>(index_set, [=] RAJA_DEVICE (RAJA::Index_type index)
{
a[index] = b[index] + scalar*c[index];
});
@ -117,12 +121,12 @@ void RAJAStream<T>::triad()
template <class T>
T RAJAStream<T>::dot()
{
T* a = d_a;
T* b = d_b;
T* RAJA_RESTRICT a = d_a;
T* RAJA_RESTRICT b = d_b;
RAJA::ReduceSum<reduce_policy, T> sum(0.0);
forall<policy>(index_set, [=] RAJA_DEVICE (int index)
forall<policy>(index_set, [=] RAJA_DEVICE (RAJA::Index_type index)
{
sum += a[index] * b[index];
});

View File

@ -1,4 +1,4 @@
GPU-STREAM
BabelStream
===========
Measure memory transfer rates to/from global device memory on GPUs.
@ -16,9 +16,11 @@ Currently implemented are:
- RAJA
- SYCL
This code was previously called GPU-STREAM.
Website
-------
[uob-hpc.github.io/GPU-STREAM/](https://uob-hpc.github.io/GPU-STREAM/)
[uob-hpc.github.io/BabelStream/](https://uob-hpc.github.io/BabelStream/)
Usage
-----
@ -36,6 +38,29 @@ Pass in extra flags via the `EXTRA_FLAGS` option.
The binaries are named in the form `<model>-stream`.
Building Kokkos
---------------
We use the following command to build Kokkos using the Intel Compiler, specifying the `arch` appropriately, e.g. `KNL`.
```
../generate_makefile.bash --prefix=<prefix> --with-openmp --with-pthread --arch=<arch> --compiler=icpc --cxxflags=-DKOKKOS_MEMORY_ALIGNMENT=2097152
```
For building with CUDA support, we use the following command, specifying the `arch` appropriately, e.g. `Kepler35`.
```
../generate_makefile.bash --prefix=<prefix> --with-cuda --with-openmp --with-pthread --arch=<arch> --with-cuda-options=enable_lambda
```
Building RAJA
-------------
We use the following command to build RAJA using the Intel Compiler.
```
cmake .. -DCMAKE_INSTALL_PREFIX=<prefix> -DCMAKE_C_COMPILER=icc -DCMAKE_CXX_COMPILER=icpc -DRAJA_PTR="RAJA_USE_RESTRICT_ALIGNED_PTR" -DCMAKE_BUILD_TYPE=ICCBuild -DRAJA_ENABLE_TESTS=Off
```
For building with CUDA support, we use the following command.
```
cmake .. -DCMAKE_INSTALL_PREFIX=<prefix> -DRAJA_PTR="RAJA_USE_RESTRICT_ALIGNED_PTR" -DRAJA_ENABLE_CUDA=1 -DRAJA_ENABLE_TESTS=Off
```
Results
-------
@ -45,13 +70,17 @@ Sample results can be found in the `results` subdirectory. If you would like to
Citing
------
You can view the [Poster and Extended Abstract](http://sc15.supercomputing.org/sites/all/themes/SC15images/tech_poster/tech_poster_pages/post150.html) on GPU-STREAM presented at SC'15. Please cite GPU-STREAM via this reference:
Please cite BabelStream via this reference:
> Deakin T, Price J, Martineau M, McIntosh-Smith S. GPU-STREAM v2.0: Benchmarking the achievable memory bandwidth of many-core processors across diverse parallel programming models. 2016. Paper presented at P^3MA Workshop at ISC High Performance, Frankfurt, Germany.
**Other GPU-STREAM publications:**
**Other BabelStream publications:**
> Deakin T, McIntosh-Smith S. GPU-STREAM: Benchmarking the achievable memory bandwidth of Graphics Processing Units. 2015. Poster session presented at IEEE/ACM SuperComputing, Austin, United States.
You can view the [Poster and Extended Abstract](http://sc15.supercomputing.org/sites/all/themes/SC15images/tech_poster/tech_poster_pages/post150.html).
> Deakin T, Price J, Martineau M, McIntosh-Smith S. GPU-STREAM: Now in 2D!. 2016. Poster session presented at IEEE/ACM SuperComputing, Salt Lake City, United States.
You can view the [Poster and Extended Abstract](http://sc16.supercomputing.org/sc-archive/tech_poster/tech_poster_pages/post139.html).

View File

@ -15,7 +15,7 @@
#include <iomanip>
#include <cstring>
#define VERSION_STRING "3.1"
#define VERSION_STRING "3.2"
#include "Stream.h"
@ -56,7 +56,7 @@ void parseArguments(int argc, char *argv[]);
int main(int argc, char *argv[])
{
std::cout
<< "GPU-STREAM" << std::endl
<< "BabelStream" << std::endl
<< "Version: " << VERSION_STRING << std::endl
<< "Implementation: " << IMPLEMENTATION_STRING << std::endl;