diff --git a/KOKKOSStream.cpp b/KOKKOSStream.cpp
index 9abcf87..45f4dff 100644
--- a/KOKKOSStream.cpp
+++ b/KOKKOSStream.cpp
@@ -39,7 +39,7 @@ void KOKKOSStream<T>::init_arrays(T initA, T initB, T initC)
   View a(*d_a);
   View b(*d_b);
   View c(*d_c);
-  parallel_for(array_size, KOKKOS_LAMBDA (const int index)
+  parallel_for(array_size, KOKKOS_LAMBDA (const long index)
   {
     a[index] = initA;
     b[index] = initB;
@@ -70,7 +70,7 @@ void KOKKOSStream<T>::copy()
   View b(*d_b);
   View c(*d_c);
 
-  parallel_for(array_size, KOKKOS_LAMBDA (const int index)
+  parallel_for(array_size, KOKKOS_LAMBDA (const long index)
   {
     c[index] = a[index];
   });
@@ -85,7 +85,7 @@ void KOKKOSStream<T>::mul()
   View c(*d_c);
   const T scalar = startScalar;
 
-  parallel_for(array_size, KOKKOS_LAMBDA (const int index)
+  parallel_for(array_size, KOKKOS_LAMBDA (const long index)
   {
     b[index] = scalar*c[index];
   });
@@ -99,7 +99,7 @@ void KOKKOSStream<T>::add()
   View b(*d_b);
   View c(*d_c);
 
-  parallel_for(array_size, KOKKOS_LAMBDA (const int index)
+  parallel_for(array_size, KOKKOS_LAMBDA (const long index)
  {
     c[index] = a[index] + b[index];
   });
@@ -114,7 +114,7 @@ void KOKKOSStream<T>::triad()
   View c(*d_c);
   const T scalar = startScalar;
 
-  parallel_for(array_size, KOKKOS_LAMBDA (const int index)
+  parallel_for(array_size, KOKKOS_LAMBDA (const long index)
   {
     a[index] = b[index] + scalar*c[index];
   });
@@ -129,7 +129,7 @@ T KOKKOSStream<T>::dot()
 
   T sum = 0.0;
 
-  parallel_reduce(array_size, KOKKOS_LAMBDA (const int index, double &tmp)
+  parallel_reduce(array_size, KOKKOS_LAMBDA (const long index, double &tmp)
   {
     tmp += a[index] * b[index];
   }, sum);
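Note on the hunks above (not part of the patch): widening the lambda index from `int` to `long` lets the kernels address arrays with more than 2^31 - 1 elements, where a 32-bit index would overflow. A minimal sketch of the limit, assuming an LP64 platform where `long` is 64-bit:

```cpp
#include <climits>
#include <cstdio>

int main()
{
  // A large benchmark array: 2^31 elements no longer fits in an int index.
  const long n = 1L << 31;
  std::printf("INT_MAX            = %d\n", INT_MAX);  // 2147483647
  std::printf("last valid element = %ld\n", n - 1);   // representable as long, not as int
  return 0;
}
```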
diff --git a/Kokkos.make b/Kokkos.make
index 4bba1c8..1c6207d 100644
--- a/Kokkos.make
+++ b/Kokkos.make
@@ -3,6 +3,21 @@ default: kokkos-stream
 
 include $(KOKKOS_PATH)/Makefile.kokkos
 
+ifndef COMPILER
+define compiler_help
+Set COMPILER to change flags (defaulting to GNU).
+Available compilers are:
+  GNU INTEL
+
+endef
+$(info $(compiler_help))
+COMPILER=GNU
+endif
+
+COMPILER_GNU = g++
+COMPILER_INTEL = icpc -qopt-streaming-stores=always
+CXX = $(COMPILER_$(COMPILER))
+
 ifndef TARGET
 define target_help
 Set TARGET to change to offload device. Defaulting to CPU.
diff --git a/LICENSE b/LICENSE
index 1bc1114..1835c1a 100644
--- a/LICENSE
+++ b/LICENSE
@@ -12,22 +12,22 @@
 * 3. You are free to publish results obtained from running this
 *    program, or from works that you derive from this program,
 *    with the following limitations:
-* 3a. In order to be referred to as "GPU-STREAM benchmark results",
-*     published results must be in conformance to the GPU-STREAM
+* 3a. In order to be referred to as "BabelStream benchmark results",
+*     published results must be in conformance to the BabelStream
 *     Run Rules published at
-*     http://github.com/UoB-HPC/GPU-STREAM/wiki/Run-Rules
+*     http://github.com/UoB-HPC/BabelStream/wiki/Run-Rules
 *     and incorporated herein by reference.
 *     The copyright holders retain the
 *     right to determine conformity with the Run Rules.
 * 3b. Results based on modified source code or on runs not in
-*     accordance with the GPU-STREAM Run Rules must be clearly
+*     accordance with the BabelStream Run Rules must be clearly
 *     labelled whenever they are published. Examples of
 *     proper labelling include:
-*     "tuned GPU-STREAM benchmark results"
-*     "based on a variant of the GPU-STREAM benchmark code"
+*     "tuned BabelStream benchmark results"
+*     "based on a variant of the BabelStream benchmark code"
 *     Other comparable, clear and reasonable labelling is
 *     acceptable.
-* 3c. Submission of results to the GPU-STREAM benchmark web site
+* 3c. Submission of results to the BabelStream benchmark web site
 *     is encouraged, but not required.
 * 4. Use of this program or creation of derived works based on this
 *    program constitutes acceptance of these licensing restrictions.
diff --git a/RAJA.make b/RAJA.make
index 7fb821b..47aeefb 100644
--- a/RAJA.make
+++ b/RAJA.make
@@ -27,7 +27,7 @@ CXX_GNU = g++
 CXX_CRAY = CC
 CXX_XL = xlc++
 
-CXXFLAGS_INTEL = -O3 -std=c++11 -qopenmp
+CXXFLAGS_INTEL = -O3 -std=c++11 -qopenmp -xHost -qopt-streaming-stores=always
 CXXFLAGS_GNU = -O3 -std=c++11 -fopenmp
 CXXFLAGS_CRAY = -O3 -hstd=c++11
 CXXFLAGS_XL = -O5 -std=c++11 -qarch=pwr8 -qtune=pwr8 -qsmp=omp -qthreaded
diff --git a/RAJAStream.cpp b/RAJAStream.cpp
index 240f160..395a6ee 100644
--- a/RAJAStream.cpp
+++ b/RAJAStream.cpp
@@ -10,6 +10,10 @@
 using RAJA::forall;
 using RAJA::RangeSegment;
 
+#ifndef ALIGNMENT
+#define ALIGNMENT (2*1024*1024) // 2MB
+#endif
+
 template <class T>
 RAJAStream<T>::RAJAStream(const unsigned int ARRAY_SIZE, const int device_index)
   : array_size(ARRAY_SIZE)
@@ -18,9 +22,9 @@ RAJAStream<T>::RAJAStream(const unsigned int ARRAY_SIZE, const int device_index)
   index_set.push_back(seg);
 
 #ifdef RAJA_TARGET_CPU
-  d_a = new T[ARRAY_SIZE];
-  d_b = new T[ARRAY_SIZE];
-  d_c = new T[ARRAY_SIZE];
+  d_a = (T*)aligned_alloc(ALIGNMENT, sizeof(T)*array_size);
+  d_b = (T*)aligned_alloc(ALIGNMENT, sizeof(T)*array_size);
+  d_c = (T*)aligned_alloc(ALIGNMENT, sizeof(T)*array_size);
 #else
   cudaMallocManaged((void**)&d_a, sizeof(T)*ARRAY_SIZE, cudaMemAttachGlobal);
   cudaMallocManaged((void**)&d_b, sizeof(T)*ARRAY_SIZE, cudaMemAttachGlobal);
@@ -33,9 +37,9 @@ template <class T>
 RAJAStream<T>::~RAJAStream()
 {
 #ifdef RAJA_TARGET_CPU
-  delete[] d_a;
-  delete[] d_b;
-  delete[] d_c;
+  free(d_a);
+  free(d_b);
+  free(d_c);
 #else
   cudaFree(d_a);
   cudaFree(d_b);
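Note on the allocation hunk above (not part of the patch): `aligned_alloc` returns 2MB-aligned buffers, matching the `ALIGNMENT` macro, so the arrays can sit on huge-page boundaries. Strictly, C11 requires the requested size to be an integral multiple of the alignment (glibc is more forgiving), so a portable caller rounds the byte count up first. A minimal sketch with a hypothetical helper name:

```cpp
#include <cstdlib>
#include <cstdio>

// Hypothetical helper: round the byte count up to a multiple of the
// alignment before calling aligned_alloc, as strict C11 requires.
static void* alloc_aligned(std::size_t alignment, std::size_t bytes)
{
  const std::size_t rounded = (bytes + alignment - 1) / alignment * alignment;
  return std::aligned_alloc(alignment, rounded);
}

int main()
{
  const std::size_t alignment = 2 * 1024 * 1024;  // 2MB, as in ALIGNMENT
  double* a = static_cast<double*>(alloc_aligned(alignment, sizeof(double) * 1000));
  std::printf("a = %p\n", static_cast<void*>(a));  // address is a multiple of 2MB
  std::free(a);
  return 0;
}
```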
@@ -46,10 +50,10 @@ RAJAStream<T>::~RAJAStream()
 template <class T>
 void RAJAStream<T>::init_arrays(T initA, T initB, T initC)
 {
-  T* a = d_a;
-  T* b = d_b;
-  T* c = d_c;
-  forall(index_set, [=] RAJA_DEVICE (int index)
+  T* RAJA_RESTRICT a = d_a;
+  T* RAJA_RESTRICT b = d_b;
+  T* RAJA_RESTRICT c = d_c;
+  forall(index_set, [=] RAJA_DEVICE (RAJA::Index_type index)
   {
     a[index] = initA;
     b[index] = initB;
@@ -69,9 +73,9 @@ void RAJAStream<T>::read_arrays(
 template <class T>
 void RAJAStream<T>::copy()
 {
-  T* a = d_a;
-  T* c = d_c;
-  forall(index_set, [=] RAJA_DEVICE (int index)
+  T* RAJA_RESTRICT a = d_a;
+  T* RAJA_RESTRICT c = d_c;
+  forall(index_set, [=] RAJA_DEVICE (RAJA::Index_type index)
   {
     c[index] = a[index];
   });
@@ -80,10 +84,10 @@ void RAJAStream<T>::copy()
 template <class T>
 void RAJAStream<T>::mul()
 {
-  T* b = d_b;
-  T* c = d_c;
+  T* RAJA_RESTRICT b = d_b;
+  T* RAJA_RESTRICT c = d_c;
   const T scalar = startScalar;
-  forall(index_set, [=] RAJA_DEVICE (int index)
+  forall(index_set, [=] RAJA_DEVICE (RAJA::Index_type index)
   {
     b[index] = scalar*c[index];
   });
@@ -92,10 +96,10 @@ void RAJAStream<T>::mul()
 template <class T>
 void RAJAStream<T>::add()
 {
-  T* a = d_a;
-  T* b = d_b;
-  T* c = d_c;
-  forall(index_set, [=] RAJA_DEVICE (int index)
+  T* RAJA_RESTRICT a = d_a;
+  T* RAJA_RESTRICT b = d_b;
+  T* RAJA_RESTRICT c = d_c;
+  forall(index_set, [=] RAJA_DEVICE (RAJA::Index_type index)
   {
     c[index] = a[index] + b[index];
   });
@@ -104,11 +108,11 @@ void RAJAStream<T>::add()
 template <class T>
 void RAJAStream<T>::triad()
 {
-  T* a = d_a;
-  T* b = d_b;
-  T* c = d_c;
+  T* RAJA_RESTRICT a = d_a;
+  T* RAJA_RESTRICT b = d_b;
+  T* RAJA_RESTRICT c = d_c;
   const T scalar = startScalar;
-  forall(index_set, [=] RAJA_DEVICE (int index)
+  forall(index_set, [=] RAJA_DEVICE (RAJA::Index_type index)
   {
     a[index] = b[index] + scalar*c[index];
   });
@@ -117,12 +121,12 @@ void RAJAStream<T>::triad()
 template <class T>
 T RAJAStream<T>::dot()
 {
-  T* a = d_a;
-  T* b = d_b;
+  T* RAJA_RESTRICT a = d_a;
+  T* RAJA_RESTRICT b = d_b;
 
   RAJA::ReduceSum sum(0.0);
 
-  forall(index_set, [=] RAJA_DEVICE (int index)
+  forall(index_set, [=] RAJA_DEVICE (RAJA::Index_type index)
   {
     sum += a[index] * b[index];
   });
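Note on the kernel hunks above (not part of the patch): `RAJA_RESTRICT` qualifies the local pointer copies so the compiler may assume `a`, `b` and `c` do not alias, enabling vectorization and streaming stores without runtime overlap checks. A minimal sketch of the effect in plain C++, assuming the GCC/Clang/ICC `__restrict__` extension, a typical expansion of `RAJA_RESTRICT`:

```cpp
#include <cstddef>

// With the no-alias promise the loop vectorizes cleanly; without it the
// compiler must guard against a overlapping b or c.
void triad(double* __restrict__ a, const double* __restrict__ b,
           const double* __restrict__ c, double scalar, std::size_t n)
{
  for (std::size_t i = 0; i < n; i++)
    a[i] = b[i] + scalar * c[i];
}
```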
diff --git a/README.md b/README.md
index 1e2792d..0f93be6 100644
--- a/README.md
+++ b/README.md
@@ -1,4 +1,4 @@
-GPU-STREAM
+BabelStream
 ==========
 
 Measure memory transfer rates to/from global device memory on GPUs.
@@ -16,9 +16,11 @@ Currently implemented are:
 - RAJA
 - SYCL
 
+This code was previously called GPU-STREAM.
+
 Website
 -------
-[uob-hpc.github.io/GPU-STREAM/](https://uob-hpc.github.io/GPU-STREAM/)
+[uob-hpc.github.io/BabelStream/](https://uob-hpc.github.io/BabelStream/)
 
 Usage
 -----
@@ -36,6 +38,29 @@ Pass in extra flags via the `EXTRA_FLAGS` option.
 
 The binaries are named in the form `<implementation>-stream`.
 
+Building Kokkos
+---------------
+
+We use the following command to build Kokkos using the Intel Compiler, specifying the `arch` appropriately, e.g. `KNL`.
+```
+../generate_makefile.bash --prefix=<install path> --with-openmp --with-pthread --arch=<arch> --compiler=icpc --cxxflags=-DKOKKOS_MEMORY_ALIGNMENT=2097152
+```
+For building with CUDA support, we use the following command, specifying the `arch` appropriately, e.g. `Kepler35`.
+```
+../generate_makefile.bash --prefix=<install path> --with-cuda --with-openmp --with-pthread --arch=<arch> --with-cuda-options=enable_lambda
+```
+
+Building RAJA
+-------------
+
+We use the following command to build RAJA using the Intel Compiler.
+```
+cmake .. -DCMAKE_INSTALL_PREFIX=<install path> -DCMAKE_C_COMPILER=icc -DCMAKE_CXX_COMPILER=icpc -DRAJA_PTR="RAJA_USE_RESTRICT_ALIGNED_PTR" -DCMAKE_BUILD_TYPE=ICCBuild -DRAJA_ENABLE_TESTS=Off
+```
+For building with CUDA support, we use the following command.
+```
+cmake .. -DCMAKE_INSTALL_PREFIX=<install path> -DRAJA_PTR="RAJA_USE_RESTRICT_ALIGNED_PTR" -DRAJA_ENABLE_CUDA=1 -DRAJA_ENABLE_TESTS=Off
+```
+
 Results
 -------
 
@@ -45,13 +70,17 @@ Sample results can be found in the `results` subdirectory. If you would like to
 Citing
 ------
 
-You can view the [Poster and Extended Abstract](http://sc15.supercomputing.org/sites/all/themes/SC15images/tech_poster/tech_poster_pages/post150.html) on GPU-STREAM presented at SC'15. Please cite GPU-STREAM via this reference:
+Please cite BabelStream via this reference:
 
 > Deakin T, Price J, Martineau M, McIntosh-Smith S. GPU-STREAM v2.0: Benchmarking the achievable memory bandwidth of many-core processors across diverse parallel programming models. 2016. Paper presented at P^3MA Workshop at ISC High Performance, Frankfurt, Germany.
 
-**Other GPU-STREAM publications:**
+**Other BabelStream publications:**
 
 > Deakin T, McIntosh-Smith S. GPU-STREAM: Benchmarking the achievable memory bandwidth of Graphics Processing Units. 2015. Poster session presented at IEEE/ACM SuperComputing, Austin, United States.
 
+You can view the [Poster and Extended Abstract](http://sc15.supercomputing.org/sites/all/themes/SC15images/tech_poster/tech_poster_pages/post150.html).
+
+> Deakin T, Price J, Martineau M, McIntosh-Smith S. GPU-STREAM: Now in 2D!. 2016. Poster session presented at IEEE/ACM SuperComputing, Salt Lake City, United States.
+
+You can view the [Poster and Extended Abstract](http://sc16.supercomputing.org/sc-archive/tech_poster/tech_poster_pages/post139.html).
diff --git a/main.cpp b/main.cpp
index 5cd4ddc..8a2108c 100644
--- a/main.cpp
+++ b/main.cpp
@@ -15,7 +15,7 @@
 #include
 #include
 
-#define VERSION_STRING "3.1"
+#define VERSION_STRING "3.2"
 
 #include "Stream.h"
 
@@ -56,7 +56,7 @@ void parseArguments(int argc, char *argv[]);
 int main(int argc, char *argv[])
 {
   std::cout
-    << "GPU-STREAM" << std::endl
+    << "BabelStream" << std::endl
     << "Version: " << VERSION_STRING << std::endl
     << "Implementation: " << IMPLEMENTATION_STRING << std::endl;