Merge branch 'master' of https://github.com/UoB-HPC/BabelStream into rocm_hc_support
This commit is contained in:
commit
26279688d1
@ -39,7 +39,7 @@ void KOKKOSStream<T>::init_arrays(T initA, T initB, T initC)
|
||||
View<double*, DEVICE> a(*d_a);
|
||||
View<double*, DEVICE> b(*d_b);
|
||||
View<double*, DEVICE> c(*d_c);
|
||||
parallel_for(array_size, KOKKOS_LAMBDA (const int index)
|
||||
parallel_for(array_size, KOKKOS_LAMBDA (const long index)
|
||||
{
|
||||
a[index] = initA;
|
||||
b[index] = initB;
|
||||
@ -70,7 +70,7 @@ void KOKKOSStream<T>::copy()
|
||||
View<double*, DEVICE> b(*d_b);
|
||||
View<double*, DEVICE> c(*d_c);
|
||||
|
||||
parallel_for(array_size, KOKKOS_LAMBDA (const int index)
|
||||
parallel_for(array_size, KOKKOS_LAMBDA (const long index)
|
||||
{
|
||||
c[index] = a[index];
|
||||
});
|
||||
@ -85,7 +85,7 @@ void KOKKOSStream<T>::mul()
|
||||
View<double*, DEVICE> c(*d_c);
|
||||
|
||||
const T scalar = startScalar;
|
||||
parallel_for(array_size, KOKKOS_LAMBDA (const int index)
|
||||
parallel_for(array_size, KOKKOS_LAMBDA (const long index)
|
||||
{
|
||||
b[index] = scalar*c[index];
|
||||
});
|
||||
@ -99,7 +99,7 @@ void KOKKOSStream<T>::add()
|
||||
View<double*, DEVICE> b(*d_b);
|
||||
View<double*, DEVICE> c(*d_c);
|
||||
|
||||
parallel_for(array_size, KOKKOS_LAMBDA (const int index)
|
||||
parallel_for(array_size, KOKKOS_LAMBDA (const long index)
|
||||
{
|
||||
c[index] = a[index] + b[index];
|
||||
});
|
||||
@ -114,7 +114,7 @@ void KOKKOSStream<T>::triad()
|
||||
View<double*, DEVICE> c(*d_c);
|
||||
|
||||
const T scalar = startScalar;
|
||||
parallel_for(array_size, KOKKOS_LAMBDA (const int index)
|
||||
parallel_for(array_size, KOKKOS_LAMBDA (const long index)
|
||||
{
|
||||
a[index] = b[index] + scalar*c[index];
|
||||
});
|
||||
@ -129,7 +129,7 @@ T KOKKOSStream<T>::dot()
|
||||
|
||||
T sum = 0.0;
|
||||
|
||||
parallel_reduce(array_size, KOKKOS_LAMBDA (const int index, double &tmp)
|
||||
parallel_reduce(array_size, KOKKOS_LAMBDA (const long index, double &tmp)
|
||||
{
|
||||
tmp += a[index] * b[index];
|
||||
}, sum);
|
||||
|
||||
15
Kokkos.make
15
Kokkos.make
@ -3,6 +3,21 @@ default: kokkos-stream
|
||||
|
||||
include $(KOKKOS_PATH)/Makefile.kokkos
|
||||
|
||||
ifndef COMPILER
|
||||
define compiler_help
|
||||
Set COMPILER to change flags (defaulting to GNU).
|
||||
Available compilers are:
|
||||
GNU INTEL
|
||||
|
||||
endef
|
||||
$(info $(compiler_help))
|
||||
COMPILER=GNU
|
||||
endif
|
||||
|
||||
COMPILER_GNU = g++
|
||||
COMPILER_INTEL = icpc -qopt-streaming-stores=always
|
||||
CXX = $(COMPILER_$(COMPILER))
|
||||
|
||||
ifndef TARGET
|
||||
define target_help
|
||||
Set TARGET to change to offload device. Defaulting to CPU.
|
||||
|
||||
14
LICENSE
14
LICENSE
@ -12,22 +12,22 @@
|
||||
* 3. You are free to publish results obtained from running this
|
||||
* program, or from works that you derive from this program,
|
||||
* with the following limitations:
|
||||
* 3a. In order to be referred to as "GPU-STREAM benchmark results",
|
||||
* published results must be in conformance to the GPU-STREAM
|
||||
* 3a. In order to be referred to as "BabelStream benchmark results",
|
||||
* published results must be in conformance to the BabelStream
|
||||
* Run Rules published at
|
||||
* http://github.com/UoB-HPC/GPU-STREAM/wiki/Run-Rules
|
||||
* http://github.com/UoB-HPC/BabelStream/wiki/Run-Rules
|
||||
* and incorporated herein by reference.
|
||||
* The copyright holders retain the
|
||||
* right to determine conformity with the Run Rules.
|
||||
* 3b. Results based on modified source code or on runs not in
|
||||
* accordance with the GPU-STREAM Run Rules must be clearly
|
||||
* accordance with the BabelStream Run Rules must be clearly
|
||||
* labelled whenever they are published. Examples of
|
||||
* proper labelling include:
|
||||
* "tuned GPU-STREAM benchmark results"
|
||||
* "based on a variant of the GPU-STREAM benchmark code"
|
||||
* "tuned BabelStream benchmark results"
|
||||
* "based on a variant of the BabelStream benchmark code"
|
||||
* Other comparable, clear and reasonable labelling is
|
||||
* acceptable.
|
||||
* 3c. Submission of results to the GPU-STREAM benchmark web site
|
||||
* 3c. Submission of results to the BabelStream benchmark web site
|
||||
* is encouraged, but not required.
|
||||
* 4. Use of this program or creation of derived works based on this
|
||||
* program constitutes acceptance of these licensing restrictions.
|
||||
|
||||
@ -27,7 +27,7 @@ CXX_GNU = g++
|
||||
CXX_CRAY = CC
|
||||
CXX_XL = xlc++
|
||||
|
||||
CXXFLAGS_INTEL = -O3 -std=c++11 -qopenmp
|
||||
CXXFLAGS_INTEL = -O3 -std=c++11 -qopenmp -xHost -qopt-streaming-stores=always
|
||||
CXXFLAGS_GNU = -O3 -std=c++11 -fopenmp
|
||||
CXXFLAGS_CRAY = -O3 -hstd=c++11
|
||||
CXXFLAGS_XL = -O5 -std=c++11 -qarch=pwr8 -qtune=pwr8 -qsmp=omp -qthreaded
|
||||
|
||||
@ -10,6 +10,10 @@
|
||||
using RAJA::forall;
|
||||
using RAJA::RangeSegment;
|
||||
|
||||
#ifndef ALIGNMENT
|
||||
#define ALIGNMENT (2*1024*1024) // 2MB
|
||||
#endif
|
||||
|
||||
template <class T>
|
||||
RAJAStream<T>::RAJAStream(const unsigned int ARRAY_SIZE, const int device_index)
|
||||
: array_size(ARRAY_SIZE)
|
||||
@ -18,9 +22,9 @@ RAJAStream<T>::RAJAStream(const unsigned int ARRAY_SIZE, const int device_index)
|
||||
index_set.push_back(seg);
|
||||
|
||||
#ifdef RAJA_TARGET_CPU
|
||||
d_a = new T[ARRAY_SIZE];
|
||||
d_b = new T[ARRAY_SIZE];
|
||||
d_c = new T[ARRAY_SIZE];
|
||||
d_a = (T*)aligned_alloc(ALIGNMENT, sizeof(T)*array_size);
|
||||
d_b = (T*)aligned_alloc(ALIGNMENT, sizeof(T)*array_size);
|
||||
d_c = (T*)aligned_alloc(ALIGNMENT, sizeof(T)*array_size);
|
||||
#else
|
||||
cudaMallocManaged((void**)&d_a, sizeof(T)*ARRAY_SIZE, cudaMemAttachGlobal);
|
||||
cudaMallocManaged((void**)&d_b, sizeof(T)*ARRAY_SIZE, cudaMemAttachGlobal);
|
||||
@ -33,9 +37,9 @@ template <class T>
|
||||
RAJAStream<T>::~RAJAStream()
|
||||
{
|
||||
#ifdef RAJA_TARGET_CPU
|
||||
delete[] d_a;
|
||||
delete[] d_b;
|
||||
delete[] d_c;
|
||||
free(d_a);
|
||||
free(d_b);
|
||||
free(d_c);
|
||||
#else
|
||||
cudaFree(d_a);
|
||||
cudaFree(d_b);
|
||||
@ -46,10 +50,10 @@ RAJAStream<T>::~RAJAStream()
|
||||
template <class T>
|
||||
void RAJAStream<T>::init_arrays(T initA, T initB, T initC)
|
||||
{
|
||||
T* a = d_a;
|
||||
T* b = d_b;
|
||||
T* c = d_c;
|
||||
forall<policy>(index_set, [=] RAJA_DEVICE (int index)
|
||||
T* RAJA_RESTRICT a = d_a;
|
||||
T* RAJA_RESTRICT b = d_b;
|
||||
T* RAJA_RESTRICT c = d_c;
|
||||
forall<policy>(index_set, [=] RAJA_DEVICE (RAJA::Index_type index)
|
||||
{
|
||||
a[index] = initA;
|
||||
b[index] = initB;
|
||||
@ -69,9 +73,9 @@ void RAJAStream<T>::read_arrays(
|
||||
template <class T>
|
||||
void RAJAStream<T>::copy()
|
||||
{
|
||||
T* a = d_a;
|
||||
T* c = d_c;
|
||||
forall<policy>(index_set, [=] RAJA_DEVICE (int index)
|
||||
T* RAJA_RESTRICT a = d_a;
|
||||
T* RAJA_RESTRICT c = d_c;
|
||||
forall<policy>(index_set, [=] RAJA_DEVICE (RAJA::Index_type index)
|
||||
{
|
||||
c[index] = a[index];
|
||||
});
|
||||
@ -80,10 +84,10 @@ void RAJAStream<T>::copy()
|
||||
template <class T>
|
||||
void RAJAStream<T>::mul()
|
||||
{
|
||||
T* b = d_b;
|
||||
T* c = d_c;
|
||||
T* RAJA_RESTRICT b = d_b;
|
||||
T* RAJA_RESTRICT c = d_c;
|
||||
const T scalar = startScalar;
|
||||
forall<policy>(index_set, [=] RAJA_DEVICE (int index)
|
||||
forall<policy>(index_set, [=] RAJA_DEVICE (RAJA::Index_type index)
|
||||
{
|
||||
b[index] = scalar*c[index];
|
||||
});
|
||||
@ -92,10 +96,10 @@ void RAJAStream<T>::mul()
|
||||
template <class T>
|
||||
void RAJAStream<T>::add()
|
||||
{
|
||||
T* a = d_a;
|
||||
T* b = d_b;
|
||||
T* c = d_c;
|
||||
forall<policy>(index_set, [=] RAJA_DEVICE (int index)
|
||||
T* RAJA_RESTRICT a = d_a;
|
||||
T* RAJA_RESTRICT b = d_b;
|
||||
T* RAJA_RESTRICT c = d_c;
|
||||
forall<policy>(index_set, [=] RAJA_DEVICE (RAJA::Index_type index)
|
||||
{
|
||||
c[index] = a[index] + b[index];
|
||||
});
|
||||
@ -104,11 +108,11 @@ void RAJAStream<T>::add()
|
||||
template <class T>
|
||||
void RAJAStream<T>::triad()
|
||||
{
|
||||
T* a = d_a;
|
||||
T* b = d_b;
|
||||
T* c = d_c;
|
||||
T* RAJA_RESTRICT a = d_a;
|
||||
T* RAJA_RESTRICT b = d_b;
|
||||
T* RAJA_RESTRICT c = d_c;
|
||||
const T scalar = startScalar;
|
||||
forall<policy>(index_set, [=] RAJA_DEVICE (int index)
|
||||
forall<policy>(index_set, [=] RAJA_DEVICE (RAJA::Index_type index)
|
||||
{
|
||||
a[index] = b[index] + scalar*c[index];
|
||||
});
|
||||
@ -117,12 +121,12 @@ void RAJAStream<T>::triad()
|
||||
template <class T>
|
||||
T RAJAStream<T>::dot()
|
||||
{
|
||||
T* a = d_a;
|
||||
T* b = d_b;
|
||||
T* RAJA_RESTRICT a = d_a;
|
||||
T* RAJA_RESTRICT b = d_b;
|
||||
|
||||
RAJA::ReduceSum<reduce_policy, T> sum(0.0);
|
||||
|
||||
forall<policy>(index_set, [=] RAJA_DEVICE (int index)
|
||||
forall<policy>(index_set, [=] RAJA_DEVICE (RAJA::Index_type index)
|
||||
{
|
||||
sum += a[index] * b[index];
|
||||
});
|
||||
|
||||
37
README.md
37
README.md
@ -1,4 +1,4 @@
|
||||
GPU-STREAM
|
||||
BabelStream
|
||||
==========
|
||||
|
||||
Measure memory transfer rates to/from global device memory on GPUs.
|
||||
@ -16,9 +16,11 @@ Currently implemented are:
|
||||
- RAJA
|
||||
- SYCL
|
||||
|
||||
This code was previously called GPU-STREAM.
|
||||
|
||||
Website
|
||||
-------
|
||||
[uob-hpc.github.io/GPU-STREAM/](https://uob-hpc.github.io/GPU-STREAM/)
|
||||
[uob-hpc.github.io/BabelStream/](https://uob-hpc.github.io/BabelStream/)
|
||||
|
||||
Usage
|
||||
-----
|
||||
@ -36,6 +38,29 @@ Pass in extra flags via the `EXTRA_FLAGS` option.
|
||||
|
||||
The binaries are named in the form `<model>-stream`.
|
||||
|
||||
Building Kokkos
|
||||
---------------
|
||||
|
||||
We use the following command to build Kokkos using the Intel Compiler, specifying the `arch` appropriately, e.g. `KNL`.
|
||||
```
|
||||
../generate_makefile.bash --prefix=<prefix> --with-openmp --with-pthread --arch=<arch> --compiler=icpc --cxxflags=-DKOKKOS_MEMORY_ALIGNMENT=2097152
|
||||
```
|
||||
For building with CUDA support, we use the following command, specifying the `arch` appropriately, e.g. `Kepler35`.
|
||||
```
|
||||
../generate_makefile.bash --prefix=<prefix> --with-cuda --with-openmp --with-pthread --arch=<arch> --with-cuda-options=enable_lambda
|
||||
```
|
||||
|
||||
Building RAJA
|
||||
-------------
|
||||
|
||||
We use the following command to build RAJA using the Intel Compiler.
|
||||
```
|
||||
cmake .. -DCMAKE_INSTALL_PREFIX=<prefix> -DCMAKE_C_COMPILER=icc -DCMAKE_CXX_COMPILER=icpc -DRAJA_PTR="RAJA_USE_RESTRICT_ALIGNED_PTR" -DCMAKE_BUILD_TYPE=ICCBuild -DRAJA_ENABLE_TESTS=Off
|
||||
```
|
||||
For building with CUDA support, we use the following command.
|
||||
```
|
||||
cmake .. -DCMAKE_INSTALL_PREFIX=<prefix> -DRAJA_PTR="RAJA_USE_RESTRICT_ALIGNED_PTR" -DRAJA_ENABLE_CUDA=1 -DRAJA_ENABLE_TESTS=Off
|
||||
```
|
||||
|
||||
Results
|
||||
-------
|
||||
@ -45,13 +70,17 @@ Sample results can be found in the `results` subdirectory. If you would like to
|
||||
Citing
|
||||
------
|
||||
|
||||
You can view the [Poster and Extended Abstract](http://sc15.supercomputing.org/sites/all/themes/SC15images/tech_poster/tech_poster_pages/post150.html) on GPU-STREAM presented at SC'15. Please cite GPU-STREAM via this reference:
|
||||
Please cite BabelStream via this reference:
|
||||
|
||||
> Deakin T, Price J, Martineau M, McIntosh-Smith S. GPU-STREAM v2.0: Benchmarking the achievable memory bandwidth of many-core processors across diverse parallel programming models. 2016. Paper presented at P^3MA Workshop at ISC High Performance, Frankfurt, Germany.
|
||||
|
||||
**Other GPU-STREAM publications:**
|
||||
**Other BabelStream publications:**
|
||||
|
||||
> Deakin T, McIntosh-Smith S. GPU-STREAM: Benchmarking the achievable memory bandwidth of Graphics Processing Units. 2015. Poster session presented at IEEE/ACM SuperComputing, Austin, United States.
|
||||
You can view the [Poster and Extended Abstract](http://sc15.supercomputing.org/sites/all/themes/SC15images/tech_poster/tech_poster_pages/post150.html).
|
||||
|
||||
> Deakin T, Price J, Martineau M, McIntosh-Smith S. GPU-STREAM: Now in 2D!. 2016. Poster session presented at IEEE/ACM SuperComputing, Salt Lake City, United States.
|
||||
You can view the [Poster and Extended Abstract](http://sc16.supercomputing.org/sc-archive/tech_poster/tech_poster_pages/post139.html).
|
||||
|
||||
|
||||
|
||||
|
||||
4
main.cpp
4
main.cpp
@ -15,7 +15,7 @@
|
||||
#include <iomanip>
|
||||
#include <cstring>
|
||||
|
||||
#define VERSION_STRING "3.1"
|
||||
#define VERSION_STRING "3.2"
|
||||
|
||||
#include "Stream.h"
|
||||
|
||||
@ -56,7 +56,7 @@ void parseArguments(int argc, char *argv[]);
|
||||
int main(int argc, char *argv[])
|
||||
{
|
||||
std::cout
|
||||
<< "GPU-STREAM" << std::endl
|
||||
<< "BabelStream" << std::endl
|
||||
<< "Version: " << VERSION_STRING << std::endl
|
||||
<< "Implementation: " << IMPLEMENTATION_STRING << std::endl;
|
||||
|
||||
|
||||
Loading…
Reference in New Issue
Block a user