From d9dfc3f552706cf38beb25a2bf2271e03adb4340 Mon Sep 17 00:00:00 2001 From: Tom Deakin Date: Wed, 5 Apr 2017 21:57:55 +0100 Subject: [PATCH 01/11] [Kokkos] Use long for iterator variable --- KOKKOSStream.cpp | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/KOKKOSStream.cpp b/KOKKOSStream.cpp index 9abcf87..45f4dff 100644 --- a/KOKKOSStream.cpp +++ b/KOKKOSStream.cpp @@ -39,7 +39,7 @@ void KOKKOSStream::init_arrays(T initA, T initB, T initC) View a(*d_a); View b(*d_b); View c(*d_c); - parallel_for(array_size, KOKKOS_LAMBDA (const int index) + parallel_for(array_size, KOKKOS_LAMBDA (const long index) { a[index] = initA; b[index] = initB; @@ -70,7 +70,7 @@ void KOKKOSStream::copy() View b(*d_b); View c(*d_c); - parallel_for(array_size, KOKKOS_LAMBDA (const int index) + parallel_for(array_size, KOKKOS_LAMBDA (const long index) { c[index] = a[index]; }); @@ -85,7 +85,7 @@ void KOKKOSStream::mul() View c(*d_c); const T scalar = startScalar; - parallel_for(array_size, KOKKOS_LAMBDA (const int index) + parallel_for(array_size, KOKKOS_LAMBDA (const long index) { b[index] = scalar*c[index]; }); @@ -99,7 +99,7 @@ void KOKKOSStream::add() View b(*d_b); View c(*d_c); - parallel_for(array_size, KOKKOS_LAMBDA (const int index) + parallel_for(array_size, KOKKOS_LAMBDA (const long index) { c[index] = a[index] + b[index]; }); @@ -114,7 +114,7 @@ void KOKKOSStream::triad() View c(*d_c); const T scalar = startScalar; - parallel_for(array_size, KOKKOS_LAMBDA (const int index) + parallel_for(array_size, KOKKOS_LAMBDA (const long index) { a[index] = b[index] + scalar*c[index]; }); @@ -129,7 +129,7 @@ T KOKKOSStream::dot() T sum = 0.0; - parallel_reduce(array_size, KOKKOS_LAMBDA (const int index, double &tmp) + parallel_reduce(array_size, KOKKOS_LAMBDA (const long index, double &tmp) { tmp += a[index] * b[index]; }, sum); From d7e38c1ca9d30e2b73cc7c4aa5b6b69f92269822 Mon Sep 17 00:00:00 2001 From: Tom Deakin Date: Wed, 5 Apr 2017 22:09:58 +0100 Subject: [PATCH 02/11] Add Kokkos build instructions to README --- README.md | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/README.md b/README.md index 1e2792d..1efd443 100644 --- a/README.md +++ b/README.md @@ -36,6 +36,18 @@ Pass in extra flags via the `EXTRA_FLAGS` option. The binaries are named in the form `-stream`. +Building Kokkos +--------------- + +We use the following command to build Kokkos using the Intel Compiler, specifying the `arch` appropriately, e.g. `KNL`. +``` +../generate_makefile.bash --prefix= --with-openmp --with-pthread --arch= --compiler=icpc --cxxflags=-DKOKKOS_MEMORY_ALIGNMENT=2097152 +``` +For building with CUDA support, we use the following command, specifying the `arch` appropriately, e.g. `Kepler35`. +``` +../generate_makefile.bash --prefix= --with-cuda --with-openmp --with-pthread --arch= --with-cuda-options=enable_lambda +``` + Results ------- From d7a93be73972b627b68370487838501e09f31ba4 Mon Sep 17 00:00:00 2001 From: Tom Deakin Date: Wed, 5 Apr 2017 22:23:27 +0100 Subject: [PATCH 03/11] [Kokkos] Add a COMPILER option to Makefile, which turns on streaming stores for Intel --- Kokkos.make | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/Kokkos.make b/Kokkos.make index 4bba1c8..1c6207d 100644 --- a/Kokkos.make +++ b/Kokkos.make @@ -3,6 +3,21 @@ default: kokkos-stream include $(KOKKOS_PATH)/Makefile.kokkos +ifndef COMPILER +define compiler_help +Set COMPILER to change flags (defaulting to GNU). +Available compilers are: + GNU INTEL + +endef +$(info $(compiler_help)) +COMPILER=GNU +endif + +COMPILER_GNU = g++ +COMPILER_INTEL = icpc -qopt-streaming-stores=always +CXX = $(COMPILER_$(COMPILER)) + ifndef TARGET define target_help Set TARGET to change to offload device. Defaulting to CPU. From 1eb75f034a8af3af53852b9e791b7cb91b6274a0 Mon Sep 17 00:00:00 2001 From: Tom Deakin Date: Thu, 6 Apr 2017 10:02:25 +0100 Subject: [PATCH 04/11] [RAJA] Use xHost and streaming stores with the Intel compiler --- RAJA.make | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/RAJA.make b/RAJA.make index 7fb821b..47aeefb 100644 --- a/RAJA.make +++ b/RAJA.make @@ -27,7 +27,7 @@ CXX_GNU = g++ CXX_CRAY = CC CXX_XL = xlc++ -CXXFLAGS_INTEL = -O3 -std=c++11 -qopenmp +CXXFLAGS_INTEL = -O3 -std=c++11 -qopenmp -xHost -qopt-streaming-stores=always CXXFLAGS_GNU = -O3 -std=c++11 -fopenmp CXXFLAGS_CRAY = -O3 -hstd=c++11 CXXFLAGS_XL = -O5 -std=c++11 -qarch=pwr8 -qtune=pwr8 -qsmp=omp -qthreaded From 1bd4adfe7bad207deac51aae40f370dc4018953d Mon Sep 17 00:00:00 2001 From: Tom Deakin Date: Thu, 6 Apr 2017 10:14:51 +0100 Subject: [PATCH 05/11] [RAJA] Align the memory to 2MB pages --- RAJAStream.cpp | 16 ++++++++++------ 1 file changed, 10 insertions(+), 6 deletions(-) diff --git a/RAJAStream.cpp b/RAJAStream.cpp index 240f160..8d3f365 100644 --- a/RAJAStream.cpp +++ b/RAJAStream.cpp @@ -10,6 +10,10 @@ using RAJA::forall; using RAJA::RangeSegment; +#ifndef ALIGNMENT +#define ALIGNMENT (2*1024*1024) // 2MB +#endif + template RAJAStream::RAJAStream(const unsigned int ARRAY_SIZE, const int device_index) : array_size(ARRAY_SIZE) @@ -18,9 +22,9 @@ RAJAStream::RAJAStream(const unsigned int ARRAY_SIZE, const int device_index) index_set.push_back(seg); #ifdef RAJA_TARGET_CPU - d_a = new T[ARRAY_SIZE]; - d_b = new T[ARRAY_SIZE]; - d_c = new T[ARRAY_SIZE]; + d_a = (T*)aligned_alloc(ALIGNMENT, sizeof(T)*array_size); + d_b = (T*)aligned_alloc(ALIGNMENT, sizeof(T)*array_size); + d_c = (T*)aligned_alloc(ALIGNMENT, sizeof(T)*array_size); #else cudaMallocManaged((void**)&d_a, sizeof(T)*ARRAY_SIZE, cudaMemAttachGlobal); cudaMallocManaged((void**)&d_b, sizeof(T)*ARRAY_SIZE, cudaMemAttachGlobal); @@ -33,9 +37,9 @@ template RAJAStream::~RAJAStream() { #ifdef RAJA_TARGET_CPU - delete[] d_a; - delete[] d_b; - delete[] d_c; + free(d_a); + free(d_b); + free(d_c); #else cudaFree(d_a); cudaFree(d_b); From 5f9b288570cb148245715deb8c3e5b62ba5f72b6 Mon Sep 17 00:00:00 2001 From: Tom Deakin Date: Thu, 6 Apr 2017 10:15:11 +0100 Subject: [PATCH 06/11] [RAJA] Declare pointers using RAJA_RESTRICT --- RAJAStream.cpp | 30 +++++++++++++++--------------- 1 file changed, 15 insertions(+), 15 deletions(-) diff --git a/RAJAStream.cpp b/RAJAStream.cpp index 8d3f365..6c6098e 100644 --- a/RAJAStream.cpp +++ b/RAJAStream.cpp @@ -50,9 +50,9 @@ RAJAStream::~RAJAStream() template void RAJAStream::init_arrays(T initA, T initB, T initC) { - T* a = d_a; - T* b = d_b; - T* c = d_c; + T* RAJA_RESTRICT a = d_a; + T* RAJA_RESTRICT b = d_b; + T* RAJA_RESTRICT c = d_c; forall(index_set, [=] RAJA_DEVICE (int index) { a[index] = initA; @@ -73,8 +73,8 @@ void RAJAStream::read_arrays( template void RAJAStream::copy() { - T* a = d_a; - T* c = d_c; + T* RAJA_RESTRICT a = d_a; + T* RAJA_RESTRICT c = d_c; forall(index_set, [=] RAJA_DEVICE (int index) { c[index] = a[index]; @@ -84,8 +84,8 @@ void RAJAStream::copy() template void RAJAStream::mul() { - T* b = d_b; - T* c = d_c; + T* RAJA_RESTRICT b = d_b; + T* RAJA_RESTRICT c = d_c; const T scalar = startScalar; forall(index_set, [=] RAJA_DEVICE (int index) { @@ -96,9 +96,9 @@ void RAJAStream::mul() template void RAJAStream::add() { - T* a = d_a; - T* b = d_b; - T* c = d_c; + T* RAJA_RESTRICT a = d_a; + T* RAJA_RESTRICT b = d_b; + T* RAJA_RESTRICT c = d_c; forall(index_set, [=] RAJA_DEVICE (int index) { c[index] = a[index] + b[index]; @@ -108,9 +108,9 @@ void RAJAStream::add() template void RAJAStream::triad() { - T* a = d_a; - T* b = d_b; - T* c = d_c; + T* RAJA_RESTRICT a = d_a; + T* RAJA_RESTRICT b = d_b; + T* RAJA_RESTRICT c = d_c; const T scalar = startScalar; forall(index_set, [=] RAJA_DEVICE (int index) { @@ -121,8 +121,8 @@ void RAJAStream::triad() template T RAJAStream::dot() { - T* a = d_a; - T* b = d_b; + T* RAJA_RESTRICT a = d_a; + T* RAJA_RESTRICT b = d_b; RAJA::ReduceSum sum(0.0); From 3331f62f42eb89741f5ed01a393f3492a1331ebc Mon Sep 17 00:00:00 2001 From: Tom Deakin Date: Thu, 6 Apr 2017 10:16:34 +0100 Subject: [PATCH 07/11] Add RAJA build instructions to README --- README.md | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/README.md b/README.md index 1efd443..938cb81 100644 --- a/README.md +++ b/README.md @@ -48,6 +48,14 @@ For building with CUDA support, we use the following command, specifying the `ar ../generate_makefile.bash --prefix= --with-cuda --with-openmp --with-pthread --arch= --with-cuda-options=enable_lambda ``` +Building RAJA +------------- + +We use the following command to build RAJA using the Intel Compiler. +``` +cmake ../../ -DCMAKE_INSTALL_PREFIX= -DCMAKE_C_COMPILER=icc -DCMAKE_CXX_COMPILER=icpc -DRAJA_PTR="RAJA_USE_RESTRICT_ALIGNED_PTR" -DCMAKE_BUILD_TYPE=ICCBuild -DRAJA_ENABLE_TESTS=Off +``` + Results ------- From c534600d04a78a3ec77e1818a291d6cae728b7c3 Mon Sep 17 00:00:00 2001 From: Tom Deakin Date: Thu, 6 Apr 2017 10:36:01 +0100 Subject: [PATCH 08/11] [RAJA] Use Index_type for iterator index type instead of hardcoding int --- RAJAStream.cpp | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/RAJAStream.cpp b/RAJAStream.cpp index 6c6098e..395a6ee 100644 --- a/RAJAStream.cpp +++ b/RAJAStream.cpp @@ -53,7 +53,7 @@ void RAJAStream::init_arrays(T initA, T initB, T initC) T* RAJA_RESTRICT a = d_a; T* RAJA_RESTRICT b = d_b; T* RAJA_RESTRICT c = d_c; - forall(index_set, [=] RAJA_DEVICE (int index) + forall(index_set, [=] RAJA_DEVICE (RAJA::Index_type index) { a[index] = initA; b[index] = initB; @@ -75,7 +75,7 @@ void RAJAStream::copy() { T* RAJA_RESTRICT a = d_a; T* RAJA_RESTRICT c = d_c; - forall(index_set, [=] RAJA_DEVICE (int index) + forall(index_set, [=] RAJA_DEVICE (RAJA::Index_type index) { c[index] = a[index]; }); @@ -87,7 +87,7 @@ void RAJAStream::mul() T* RAJA_RESTRICT b = d_b; T* RAJA_RESTRICT c = d_c; const T scalar = startScalar; - forall(index_set, [=] RAJA_DEVICE (int index) + forall(index_set, [=] RAJA_DEVICE (RAJA::Index_type index) { b[index] = scalar*c[index]; }); @@ -99,7 +99,7 @@ void RAJAStream::add() T* RAJA_RESTRICT a = d_a; T* RAJA_RESTRICT b = d_b; T* RAJA_RESTRICT c = d_c; - forall(index_set, [=] RAJA_DEVICE (int index) + forall(index_set, [=] RAJA_DEVICE (RAJA::Index_type index) { c[index] = a[index] + b[index]; }); @@ -112,7 +112,7 @@ void RAJAStream::triad() T* RAJA_RESTRICT b = d_b; T* RAJA_RESTRICT c = d_c; const T scalar = startScalar; - forall(index_set, [=] RAJA_DEVICE (int index) + forall(index_set, [=] RAJA_DEVICE (RAJA::Index_type index) { a[index] = b[index] + scalar*c[index]; }); @@ -126,7 +126,7 @@ T RAJAStream::dot() RAJA::ReduceSum sum(0.0); - forall(index_set, [=] RAJA_DEVICE (int index) + forall(index_set, [=] RAJA_DEVICE (RAJA::Index_type index) { sum += a[index] * b[index]; }); From 50e3a1970fcd9712ca6fb7513233b6e4e6ef9ca5 Mon Sep 17 00:00:00 2001 From: Tom Deakin Date: Thu, 6 Apr 2017 10:38:03 +0100 Subject: [PATCH 09/11] Add RAJA CUDA build instructions --- README.md | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index 938cb81..e1dcfdc 100644 --- a/README.md +++ b/README.md @@ -53,9 +53,12 @@ Building RAJA We use the following command to build RAJA using the Intel Compiler. ``` -cmake ../../ -DCMAKE_INSTALL_PREFIX= -DCMAKE_C_COMPILER=icc -DCMAKE_CXX_COMPILER=icpc -DRAJA_PTR="RAJA_USE_RESTRICT_ALIGNED_PTR" -DCMAKE_BUILD_TYPE=ICCBuild -DRAJA_ENABLE_TESTS=Off +cmake .. -DCMAKE_INSTALL_PREFIX= -DCMAKE_C_COMPILER=icc -DCMAKE_CXX_COMPILER=icpc -DRAJA_PTR="RAJA_USE_RESTRICT_ALIGNED_PTR" -DCMAKE_BUILD_TYPE=ICCBuild -DRAJA_ENABLE_TESTS=Off +``` +For building with CUDA support, we use the following command. +``` +cmake .. -DCMAKE_INSTALL_PREFIX= -DRAJA_PTR="RAJA_USE_RESTRICT_ALIGNED_PTR" -DRAJA_ENABLE_CUDA=1 -DRAJA_ENABLE_TESTS=Off ``` - Results ------- From 9c08fdd18411b6adca8b2c94cae3e0ebaedc3845 Mon Sep 17 00:00:00 2001 From: Tom Deakin Date: Thu, 6 Apr 2017 10:38:48 +0100 Subject: [PATCH 10/11] Minor version bump --- main.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/main.cpp b/main.cpp index e6f2cbd..2c33f22 100644 --- a/main.cpp +++ b/main.cpp @@ -15,7 +15,7 @@ #include #include -#define VERSION_STRING "3.1" +#define VERSION_STRING "3.2" #include "Stream.h" From dafc63030ff916b613be39916beccd20a0327583 Mon Sep 17 00:00:00 2001 From: Tom Deakin Date: Sat, 8 Apr 2017 12:16:29 +0100 Subject: [PATCH 11/11] Rename to BabelStream --- LICENSE | 14 +++++++------- README.md | 14 ++++++++++---- main.cpp | 2 +- 3 files changed, 18 insertions(+), 12 deletions(-) diff --git a/LICENSE b/LICENSE index 1bc1114..1835c1a 100644 --- a/LICENSE +++ b/LICENSE @@ -12,22 +12,22 @@ * 3. You are free to publish results obtained from running this * program, or from works that you derive from this program, * with the following limitations: -* 3a. In order to be referred to as "GPU-STREAM benchmark results", -* published results must be in conformance to the GPU-STREAM +* 3a. In order to be referred to as "BabelStream benchmark results", +* published results must be in conformance to the BabelStream * Run Rules published at -* http://github.com/UoB-HPC/GPU-STREAM/wiki/Run-Rules +* http://github.com/UoB-HPC/BabelStream/wiki/Run-Rules * and incorporated herein by reference. * The copyright holders retain the * right to determine conformity with the Run Rules. * 3b. Results based on modified source code or on runs not in -* accordance with the GPU-STREAM Run Rules must be clearly +* accordance with the BabelStream Run Rules must be clearly * labelled whenever they are published. Examples of * proper labelling include: -* "tuned GPU-STREAM benchmark results" -* "based on a variant of the GPU-STREAM benchmark code" +* "tuned BabelStream benchmark results" +* "based on a variant of the BabelStream benchmark code" * Other comparable, clear and reasonable labelling is * acceptable. -* 3c. Submission of results to the GPU-STREAM benchmark web site +* 3c. Submission of results to the BabelStream benchmark web site * is encouraged, but not required. * 4. Use of this program or creation of derived works based on this * program constitutes acceptance of these licensing restrictions. diff --git a/README.md b/README.md index e1dcfdc..0f93be6 100644 --- a/README.md +++ b/README.md @@ -1,4 +1,4 @@ -GPU-STREAM +BabelStream ========== Measure memory transfer rates to/from global device memory on GPUs. @@ -16,9 +16,11 @@ Currently implemented are: - RAJA - SYCL +This code was previously called GPU-STREAM. + Website ------- -[uob-hpc.github.io/GPU-STREAM/](https://uob-hpc.github.io/GPU-STREAM/) +[uob-hpc.github.io/BabelStream/](https://uob-hpc.github.io/BabelStream/) Usage ----- @@ -68,13 +70,17 @@ Sample results can be found in the `results` subdirectory. If you would like to Citing ------ -You can view the [Poster and Extended Abstract](http://sc15.supercomputing.org/sites/all/themes/SC15images/tech_poster/tech_poster_pages/post150.html) on GPU-STREAM presented at SC'15. Please cite GPU-STREAM via this reference: +Please cite BabelStream via this reference: > Deakin T, Price J, Martineau M, McIntosh-Smith S. GPU-STREAM v2.0: Benchmarking the achievable memory bandwidth of many-core processors across diverse parallel programming models. 2016. Paper presented at P^3MA Workshop at ISC High Performance, Frankfurt, Germany. -**Other GPU-STREAM publications:** +**Other BabelStream publications:** > Deakin T, McIntosh-Smith S. GPU-STREAM: Benchmarking the achievable memory bandwidth of Graphics Processing Units. 2015. Poster session presented at IEEE/ACM SuperComputing, Austin, United States. +You can view the [Poster and Extended Abstract](http://sc15.supercomputing.org/sites/all/themes/SC15images/tech_poster/tech_poster_pages/post150.html). + +> Deakin T, Price J, Martineau M, McIntosh-Smith S. GPU-STREAM: Now in 2D!. 2016. Poster session presented at IEEE/ACM SuperComputing, Salt Lake City, United States. +You can view the [Poster and Extended Abstract](http://sc16.supercomputing.org/sc-archive/tech_poster/tech_poster_pages/post139.html). diff --git a/main.cpp b/main.cpp index 2c33f22..33cef1e 100644 --- a/main.cpp +++ b/main.cpp @@ -54,7 +54,7 @@ void parseArguments(int argc, char *argv[]); int main(int argc, char *argv[]) { std::cout - << "GPU-STREAM" << std::endl + << "BabelStream" << std::endl << "Version: " << VERSION_STRING << std::endl << "Implementation: " << IMPLEMENTATION_STRING << std::endl;