Merge branch 'master' of https://github.com/UoB-HPC/BabelStream into rocm_hc_support

This commit is contained in:
Peter Steinbach 2017-07-25 17:05:31 +02:00
commit 26279688d1
7 changed files with 95 additions and 47 deletions

View File

@ -39,7 +39,7 @@ void KOKKOSStream<T>::init_arrays(T initA, T initB, T initC)
View<double*, DEVICE> a(*d_a);
View<double*, DEVICE> b(*d_b);
View<double*, DEVICE> c(*d_c);
parallel_for(array_size, KOKKOS_LAMBDA (const int index)
parallel_for(array_size, KOKKOS_LAMBDA (const long index)
{
a[index] = initA;
b[index] = initB;
@ -70,7 +70,7 @@ void KOKKOSStream<T>::copy()
View<double*, DEVICE> b(*d_b);
View<double*, DEVICE> c(*d_c);
parallel_for(array_size, KOKKOS_LAMBDA (const int index)
parallel_for(array_size, KOKKOS_LAMBDA (const long index)
{
c[index] = a[index];
});
@ -85,7 +85,7 @@ void KOKKOSStream<T>::mul()
View<double*, DEVICE> c(*d_c);
const T scalar = startScalar;
parallel_for(array_size, KOKKOS_LAMBDA (const int index)
parallel_for(array_size, KOKKOS_LAMBDA (const long index)
{
b[index] = scalar*c[index];
});
@ -99,7 +99,7 @@ void KOKKOSStream<T>::add()
View<double*, DEVICE> b(*d_b);
View<double*, DEVICE> c(*d_c);
parallel_for(array_size, KOKKOS_LAMBDA (const int index)
parallel_for(array_size, KOKKOS_LAMBDA (const long index)
{
c[index] = a[index] + b[index];
});
@ -114,7 +114,7 @@ void KOKKOSStream<T>::triad()
View<double*, DEVICE> c(*d_c);
const T scalar = startScalar;
parallel_for(array_size, KOKKOS_LAMBDA (const int index)
parallel_for(array_size, KOKKOS_LAMBDA (const long index)
{
a[index] = b[index] + scalar*c[index];
});
@ -129,7 +129,7 @@ T KOKKOSStream<T>::dot()
T sum = 0.0;
parallel_reduce(array_size, KOKKOS_LAMBDA (const int index, double &tmp)
parallel_reduce(array_size, KOKKOS_LAMBDA (const long index, double &tmp)
{
tmp += a[index] * b[index];
}, sum);

View File

@ -3,6 +3,21 @@ default: kokkos-stream
include $(KOKKOS_PATH)/Makefile.kokkos
ifndef COMPILER
define compiler_help
Set COMPILER to change flags (defaulting to GNU).
Available compilers are:
GNU INTEL
endef
$(info $(compiler_help))
COMPILER=GNU
endif
COMPILER_GNU = g++
COMPILER_INTEL = icpc -qopt-streaming-stores=always
CXX = $(COMPILER_$(COMPILER))
ifndef TARGET
define target_help
Set TARGET to change to offload device. Defaulting to CPU.

14
LICENSE
View File

@ -12,22 +12,22 @@
* 3. You are free to publish results obtained from running this
* program, or from works that you derive from this program,
* with the following limitations:
* 3a. In order to be referred to as "GPU-STREAM benchmark results",
* published results must be in conformance to the GPU-STREAM
* 3a. In order to be referred to as "BabelStream benchmark results",
* published results must be in conformance to the BabelStream
* Run Rules published at
* http://github.com/UoB-HPC/GPU-STREAM/wiki/Run-Rules
* http://github.com/UoB-HPC/BabelStream/wiki/Run-Rules
* and incorporated herein by reference.
* The copyright holders retain the
* right to determine conformity with the Run Rules.
* 3b. Results based on modified source code or on runs not in
* accordance with the GPU-STREAM Run Rules must be clearly
* accordance with the BabelStream Run Rules must be clearly
* labelled whenever they are published. Examples of
* proper labelling include:
* "tuned GPU-STREAM benchmark results"
* "based on a variant of the GPU-STREAM benchmark code"
* "tuned BabelStream benchmark results"
* "based on a variant of the BabelStream benchmark code"
* Other comparable, clear and reasonable labelling is
* acceptable.
* 3c. Submission of results to the GPU-STREAM benchmark web site
* 3c. Submission of results to the BabelStream benchmark web site
* is encouraged, but not required.
* 4. Use of this program or creation of derived works based on this
* program constitutes acceptance of these licensing restrictions.

View File

@ -27,7 +27,7 @@ CXX_GNU = g++
CXX_CRAY = CC
CXX_XL = xlc++
CXXFLAGS_INTEL = -O3 -std=c++11 -qopenmp
CXXFLAGS_INTEL = -O3 -std=c++11 -qopenmp -xHost -qopt-streaming-stores=always
CXXFLAGS_GNU = -O3 -std=c++11 -fopenmp
CXXFLAGS_CRAY = -O3 -hstd=c++11
CXXFLAGS_XL = -O5 -std=c++11 -qarch=pwr8 -qtune=pwr8 -qsmp=omp -qthreaded

View File

@ -10,6 +10,10 @@
using RAJA::forall;
using RAJA::RangeSegment;
#ifndef ALIGNMENT
#define ALIGNMENT (2*1024*1024) // 2MB
#endif
template <class T>
RAJAStream<T>::RAJAStream(const unsigned int ARRAY_SIZE, const int device_index)
: array_size(ARRAY_SIZE)
@ -18,9 +22,9 @@ RAJAStream<T>::RAJAStream(const unsigned int ARRAY_SIZE, const int device_index)
index_set.push_back(seg);
#ifdef RAJA_TARGET_CPU
d_a = new T[ARRAY_SIZE];
d_b = new T[ARRAY_SIZE];
d_c = new T[ARRAY_SIZE];
d_a = (T*)aligned_alloc(ALIGNMENT, sizeof(T)*array_size);
d_b = (T*)aligned_alloc(ALIGNMENT, sizeof(T)*array_size);
d_c = (T*)aligned_alloc(ALIGNMENT, sizeof(T)*array_size);
#else
cudaMallocManaged((void**)&d_a, sizeof(T)*ARRAY_SIZE, cudaMemAttachGlobal);
cudaMallocManaged((void**)&d_b, sizeof(T)*ARRAY_SIZE, cudaMemAttachGlobal);
@ -33,9 +37,9 @@ template <class T>
RAJAStream<T>::~RAJAStream()
{
#ifdef RAJA_TARGET_CPU
delete[] d_a;
delete[] d_b;
delete[] d_c;
free(d_a);
free(d_b);
free(d_c);
#else
cudaFree(d_a);
cudaFree(d_b);
@ -46,10 +50,10 @@ RAJAStream<T>::~RAJAStream()
template <class T>
void RAJAStream<T>::init_arrays(T initA, T initB, T initC)
{
T* a = d_a;
T* b = d_b;
T* c = d_c;
forall<policy>(index_set, [=] RAJA_DEVICE (int index)
T* RAJA_RESTRICT a = d_a;
T* RAJA_RESTRICT b = d_b;
T* RAJA_RESTRICT c = d_c;
forall<policy>(index_set, [=] RAJA_DEVICE (RAJA::Index_type index)
{
a[index] = initA;
b[index] = initB;
@ -69,9 +73,9 @@ void RAJAStream<T>::read_arrays(
template <class T>
void RAJAStream<T>::copy()
{
T* a = d_a;
T* c = d_c;
forall<policy>(index_set, [=] RAJA_DEVICE (int index)
T* RAJA_RESTRICT a = d_a;
T* RAJA_RESTRICT c = d_c;
forall<policy>(index_set, [=] RAJA_DEVICE (RAJA::Index_type index)
{
c[index] = a[index];
});
@ -80,10 +84,10 @@ void RAJAStream<T>::copy()
template <class T>
void RAJAStream<T>::mul()
{
T* b = d_b;
T* c = d_c;
T* RAJA_RESTRICT b = d_b;
T* RAJA_RESTRICT c = d_c;
const T scalar = startScalar;
forall<policy>(index_set, [=] RAJA_DEVICE (int index)
forall<policy>(index_set, [=] RAJA_DEVICE (RAJA::Index_type index)
{
b[index] = scalar*c[index];
});
@ -92,10 +96,10 @@ void RAJAStream<T>::mul()
template <class T>
void RAJAStream<T>::add()
{
T* a = d_a;
T* b = d_b;
T* c = d_c;
forall<policy>(index_set, [=] RAJA_DEVICE (int index)
T* RAJA_RESTRICT a = d_a;
T* RAJA_RESTRICT b = d_b;
T* RAJA_RESTRICT c = d_c;
forall<policy>(index_set, [=] RAJA_DEVICE (RAJA::Index_type index)
{
c[index] = a[index] + b[index];
});
@ -104,11 +108,11 @@ void RAJAStream<T>::add()
template <class T>
void RAJAStream<T>::triad()
{
T* a = d_a;
T* b = d_b;
T* c = d_c;
T* RAJA_RESTRICT a = d_a;
T* RAJA_RESTRICT b = d_b;
T* RAJA_RESTRICT c = d_c;
const T scalar = startScalar;
forall<policy>(index_set, [=] RAJA_DEVICE (int index)
forall<policy>(index_set, [=] RAJA_DEVICE (RAJA::Index_type index)
{
a[index] = b[index] + scalar*c[index];
});
@ -117,12 +121,12 @@ void RAJAStream<T>::triad()
template <class T>
T RAJAStream<T>::dot()
{
T* a = d_a;
T* b = d_b;
T* RAJA_RESTRICT a = d_a;
T* RAJA_RESTRICT b = d_b;
RAJA::ReduceSum<reduce_policy, T> sum(0.0);
forall<policy>(index_set, [=] RAJA_DEVICE (int index)
forall<policy>(index_set, [=] RAJA_DEVICE (RAJA::Index_type index)
{
sum += a[index] * b[index];
});

View File

@ -1,4 +1,4 @@
GPU-STREAM
BabelStream
===========
Measure memory transfer rates to/from global device memory on GPUs.
@ -16,9 +16,11 @@ Currently implemented are:
- RAJA
- SYCL
This code was previously called GPU-STREAM.
Website
-------
[uob-hpc.github.io/GPU-STREAM/](https://uob-hpc.github.io/GPU-STREAM/)
[uob-hpc.github.io/BabelStream/](https://uob-hpc.github.io/BabelStream/)
Usage
-----
@ -36,6 +38,29 @@ Pass in extra flags via the `EXTRA_FLAGS` option.
The binaries are named in the form `<model>-stream`.
Building Kokkos
---------------
We use the following command to build Kokkos using the Intel Compiler, specifying the `arch` appropriately, e.g. `KNL`.
```
../generate_makefile.bash --prefix=<prefix> --with-openmp --with-pthread --arch=<arch> --compiler=icpc --cxxflags=-DKOKKOS_MEMORY_ALIGNMENT=2097152
```
For building with CUDA support, we use the following command, specifying the `arch` appropriately, e.g. `Kepler35`.
```
../generate_makefile.bash --prefix=<prefix> --with-cuda --with-openmp --with-pthread --arch=<arch> --with-cuda-options=enable_lambda
```
Building RAJA
-------------
We use the following command to build RAJA using the Intel Compiler.
```
cmake .. -DCMAKE_INSTALL_PREFIX=<prefix> -DCMAKE_C_COMPILER=icc -DCMAKE_CXX_COMPILER=icpc -DRAJA_PTR="RAJA_USE_RESTRICT_ALIGNED_PTR" -DCMAKE_BUILD_TYPE=ICCBuild -DRAJA_ENABLE_TESTS=Off
```
For building with CUDA support, we use the following command.
```
cmake .. -DCMAKE_INSTALL_PREFIX=<prefix> -DRAJA_PTR="RAJA_USE_RESTRICT_ALIGNED_PTR" -DRAJA_ENABLE_CUDA=1 -DRAJA_ENABLE_TESTS=Off
```
Results
-------
@ -45,13 +70,17 @@ Sample results can be found in the `results` subdirectory. If you would like to
Citing
------
You can view the [Poster and Extended Abstract](http://sc15.supercomputing.org/sites/all/themes/SC15images/tech_poster/tech_poster_pages/post150.html) on GPU-STREAM presented at SC'15. Please cite GPU-STREAM via this reference:
Please cite BabelStream via this reference:
> Deakin T, Price J, Martineau M, McIntosh-Smith S. GPU-STREAM v2.0: Benchmarking the achievable memory bandwidth of many-core processors across diverse parallel programming models. 2016. Paper presented at P^3MA Workshop at ISC High Performance, Frankfurt, Germany.
**Other GPU-STREAM publications:**
**Other BabelStream publications:**
> Deakin T, McIntosh-Smith S. GPU-STREAM: Benchmarking the achievable memory bandwidth of Graphics Processing Units. 2015. Poster session presented at IEEE/ACM SuperComputing, Austin, United States.
You can view the [Poster and Extended Abstract](http://sc15.supercomputing.org/sites/all/themes/SC15images/tech_poster/tech_poster_pages/post150.html).
> Deakin T, Price J, Martineau M, McIntosh-Smith S. GPU-STREAM: Now in 2D!. 2016. Poster session presented at IEEE/ACM SuperComputing, Salt Lake City, United States.
You can view the [Poster and Extended Abstract](http://sc16.supercomputing.org/sc-archive/tech_poster/tech_poster_pages/post139.html).

View File

@ -15,7 +15,7 @@
#include <iomanip>
#include <cstring>
#define VERSION_STRING "3.1"
#define VERSION_STRING "3.2"
#include "Stream.h"
@ -56,7 +56,7 @@ void parseArguments(int argc, char *argv[]);
int main(int argc, char *argv[])
{
std::cout
<< "GPU-STREAM" << std::endl
<< "BabelStream" << std::endl
<< "Version: " << VERSION_STRING << std::endl
<< "Implementation: " << IMPLEMENTATION_STRING << std::endl;