Merge pull request #84 from gonzalobg/cxx_parallel_stl

Add NVIDIA HPC SDK C++ parallel STL implementation
This commit is contained in:
Tom Deakin 2020-12-03 14:15:45 +00:00 committed by GitHub
commit f271d5563d
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
7 changed files with 154 additions and 0 deletions

1
.gitignore vendored
View File

@ -5,6 +5,7 @@ omp-stream
acc-stream acc-stream
raja-stream raja-stream
kokkos-stream kokkos-stream
std-stream
sycl-stream sycl-stream
hip-stream hip-stream

View File

@ -4,6 +4,7 @@ All notable changes to this project will be documented in this file.
## [Unreleased] ## [Unreleased]
### Added ### Added
- New implementation using the C++ parallel STL.
- Compiler options for OpenMP and OpenACC GNU offloading to NVIDIA and AMD. - Compiler options for OpenMP and OpenACC GNU offloading to NVIDIA and AMD.
- Compiler options for Arm Clang added to OpenMP and Kokkos. - Compiler options for Arm Clang added to OpenMP and Kokkos.
- Kokkos 3 build system (No code changes made). - Kokkos 3 build system (No code changes made).

View File

@ -15,6 +15,7 @@ Currently implemented are:
- CUDA - CUDA
- OpenACC - OpenACC
- OpenMP 3 and 4.5 - OpenMP 3 and 4.5
- C++ Parallel STL
- Kokkos - Kokkos
- RAJA - RAJA
- SYCL - SYCL

14
STD.make Normal file
View File

@ -0,0 +1,14 @@
# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
#
# For full license terms please see the LICENSE file distributed with this
# source code
# Build rules for the C++ parallel STL ("STD") variant of the benchmark.
#
# -DSTD defines the STD macro, which main.cpp tests with `#elif defined(STD)`
# to include STDStream.h and construct an STDStream.
# -stdpar is presumably the NVIDIA HPC SDK switch that enables parallel
# execution of the C++ standard algorithms -- confirm against the nvc++ docs.
CXXFLAGS=-O3 -std=c++17 -stdpar -DSTD
# Compiler driver used for this variant (NVIDIA HPC SDK C++ compiler).
STD_CXX=nvc++
# Compile and link both sources in one step. EXTRA_FLAGS is not set in this
# file; it expands to nothing unless supplied on the command line or in the
# environment.
# NOTE(review): recipe lines must begin with a TAB in the real file; leading
# whitespace appears to have been lost in this view.
std-stream: main.cpp STDStream.cpp
$(STD_CXX) $(CXXFLAGS) $^ $(EXTRA_FLAGS) -o $@
# `clean` is not a file, so always run its recipe when requested.
.PHONY: clean
# Remove the built executable; -f keeps this quiet when it does not exist.
clean:
rm -f std-stream

89
STDStream.cpp Normal file
View File

@ -0,0 +1,89 @@
// Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
//
// For full license terms please see the LICENSE file distributed with this
// source code
#include "STDStream.h"
#include <algorithm>
#include <execution>
#include <numeric>
// Execution policy passed to every standard algorithm call in this file.
// Exactly one of the following initialisers should be active:
//   std::execution::seq        - sequential execution
//   std::execution::par        - parallel execution
//   std::execution::par_unseq  - parallel and vectorised execution
//
// The object is `static constexpr` so that it is an immutable constant with
// internal linkage: it cannot be modified at run time and it cannot clash with
// a same-named symbol in another translation unit.
// static constexpr auto exe_policy = std::execution::seq;
// static constexpr auto exe_policy = std::execution::par;
static constexpr auto exe_policy = std::execution::par_unseq;
// Records the element count and the three caller-supplied array pointers.
// Nothing is allocated or copied here, and the `device` argument is accepted
// but not used by this implementation.
template <class T>
STDStream<T>::STDStream(const unsigned int ARRAY_SIZE, T *a, T *b, T *c, int device) noexcept
  : array_size(ARRAY_SIZE),
    a(a),
    b(b),
    c(c)
{}
// Sets every element of a, b and c to initA, initB and initC respectively.
template <class T>
void STDStream<T>::init_arrays(T initA, T initB, T initC)
{
  // One helper so the three fills share the same range arithmetic.
  const auto fill_all = [count = array_size](T *first, T value) {
    std::fill(exe_policy, first, first + count, value);
  };

  fill_all(a, initA);
  fill_all(b, initB);
  fill_all(c, initC);
}
// Copies the contents of the a, b and c arrays into h_a, h_b and h_c.
//
// Each destination vector must already hold at least array_size elements;
// this function writes through data() and never resizes.
//
// The arrays used by this class are caller-supplied pointers, so a caller may
// pass in the very vectors whose storage was handed to the constructor. The
// parallel overload of std::copy requires that source and destination do not
// overlap, so a destination that is already the source buffer is skipped
// (the data is in place and there is nothing to copy).
template <class T>
void STDStream<T>::read_arrays(std::vector<T>& h_a, std::vector<T>& h_b, std::vector<T>& h_c)
{
  const auto copy_out = [count = array_size](const T *src, std::vector<T>& dst) {
    if (dst.data() == src)
      return; // same storage: copying onto itself would violate std::copy's preconditions
    std::copy(exe_policy, src, src + count, dst.data());
  };

  copy_out(a, h_a);
  copy_out(b, h_b);
  copy_out(c, h_c);
}
// Copy kernel: c[i] = a[i] for every i in [0, array_size).
template <class T>
void STDStream<T>::copy()
{
  T *const src_begin = a;
  T *const src_end   = src_begin + array_size;
  std::copy(exe_policy, src_begin, src_end, c);
}
// Mul kernel: b[i] = startScalar * c[i] for every i in [0, array_size).
template <class T>
void STDStream<T>::mul()
{
  // Element-wise operation applied to each value read from c.
  const auto scale = [](T ci) { return startScalar*ci; };

  std::transform(exe_policy, c, c + array_size, b, scale);
}
// Add kernel: c[i] = a[i] + b[i] for every i in [0, array_size).
template <class T>
void STDStream<T>::add()
{
  const std::plus<T> sum{};
  T *const a_end = a + array_size;

  std::transform(exe_policy, a, a_end, b, c, sum);
}
// Triad kernel: a[i] = b[i] + startScalar * c[i] for every i in [0, array_size).
template <class T>
void STDStream<T>::triad()
{
  // Binary operation combining one element of b with the matching element of c.
  const auto scaled_sum = [](T bi, T ci) { return bi+startScalar*ci; };

  std::transform(exe_policy, b, b + array_size, c, a, scaled_sum);
}
// Dot kernel: returns the sum over i in [0, array_size) of a[i] * b[i].
//
// The type of the initial value passed to std::transform_reduce is the type
// the reduction accumulates in and returns. A `0.0` literal would force a
// double accumulator even for STDStream<float>; using T{} keeps the whole
// reduction in the element type, matching the declared return type.
template <class T>
T STDStream<T>::dot()
{
  // sum = 0; sum += a[i]*b[i]; return sum;
  return std::transform_reduce(exe_policy, a, a+array_size, b, T{});
}
// Prints a fixed notice to standard output: this implementation has no way to
// enumerate devices.
void listDevices(void)
{
  std::cout << "Listing devices is not supported by the Parallel STL";
  std::cout << std::endl; // newline + flush, as a single manipulator
}
// Returns a fixed placeholder string; the device index argument is ignored.
std::string getDeviceName(const int)
{
  const std::string placeholder{"Device name unavailable"};
  return placeholder;
}
// Returns a fixed placeholder string; the device index argument is ignored.
std::string getDeviceDriver(const int)
{
  const std::string placeholder{"Device driver unavailable"};
  return placeholder;
}
template class STDStream<float>;
template class STDStream<double>;

38
STDStream.h Normal file
View File

@ -0,0 +1,38 @@
// Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
//
// For full license terms please see the LICENSE file distributed with this
// source code
#pragma once
#include <iostream>
#include <stdexcept>
#include "Stream.h"
#define IMPLEMENTATION_STRING "STD"
// Stream implementation whose kernels are expressed with C++17 standard
// algorithms invoked with an execution policy (see STDStream.cpp).
//
// The class does not allocate: it operates on three arrays of `array_size`
// elements whose pointers are supplied by the caller at construction.
template <class T>
class STDStream : public Stream<T>
{
  protected:
    // Number of elements in each of the three arrays
    unsigned int array_size;

    // Pointers to the caller-supplied arrays the kernels read and write
    T *a;
    T *b;
    T *c;

  public:
    // Arguments: element count, pointers to a, b and c, and a device index.
    // Declared noexcept so that this declaration has the same exception
    // specification as the out-of-class definition, which is marked noexcept;
    // a declaration/definition mismatch is ill-formed.
    STDStream(const unsigned int, T*, T*, T*, int) noexcept;
    ~STDStream() = default;

    // c[i] = a[i]
    void copy() override;
    // c[i] = a[i] + b[i]
    void add() override;
    // b[i] = scalar * c[i]
    void mul() override;
    // a[i] = b[i] + scalar * c[i]
    void triad() override;
    // Returns the sum of a[i] * b[i]
    T dot() override;

    // Fills a, b and c with initA, initB and initC respectively.
    void init_arrays(T initA, T initB, T initC) override;
    // Copies the current contents of the three arrays into the given vectors.
    void read_arrays(std::vector<T>& a, std::vector<T>& b, std::vector<T>& c) override;
};

View File

@ -21,6 +21,8 @@
#if defined(CUDA) #if defined(CUDA)
#include "CUDAStream.h" #include "CUDAStream.h"
#elif defined(STD)
#include "STDStream.h"
#elif defined(HIP) #elif defined(HIP)
#include "HIPStream.h" #include "HIPStream.h"
#elif defined(HC) #elif defined(HC)
@ -162,6 +164,10 @@ void run()
// Use the Kokkos implementation // Use the Kokkos implementation
stream = new KokkosStream<T>(ARRAY_SIZE, deviceIndex); stream = new KokkosStream<T>(ARRAY_SIZE, deviceIndex);
#elif defined(STD)
// Use the STD implementation
stream = new STDStream<T>(ARRAY_SIZE, a.data(), b.data(), c.data(), deviceIndex);
#elif defined(ACC) #elif defined(ACC)
// Use the OpenACC implementation // Use the OpenACC implementation
stream = new ACCStream<T>(ARRAY_SIZE, a.data(), b.data(), c.data(), deviceIndex); stream = new ACCStream<T>(ARRAY_SIZE, a.data(), b.data(), c.data(), deviceIndex);
@ -363,6 +369,10 @@ void run_triad()
// Use the OpenACC implementation // Use the OpenACC implementation
stream = new ACCStream<T>(ARRAY_SIZE, a.data(), b.data(), c.data(), deviceIndex); stream = new ACCStream<T>(ARRAY_SIZE, a.data(), b.data(), c.data(), deviceIndex);
#elif defined(STD)
// Use the STD implementation
stream = new STDStream<T>(ARRAY_SIZE, a.data(), b.data(), c.data(), deviceIndex);
#elif defined(SYCL) #elif defined(SYCL)
// Use the SYCL implementation // Use the SYCL implementation
stream = new SYCLStream<T>(ARRAY_SIZE, deviceIndex); stream = new SYCLStream<T>(ARRAY_SIZE, deviceIndex);