Add NVIDIA HPC SDK C++ parallel STL implementation
This commit adds an implementation using the C++ parallel STL. The Makefile uses the NVIDIA HPC SDK `nvc++` compiler with the `-stdpar` flag. Tested using the NVIDIA HPC SDK 20.9.
This commit is contained in:
parent
5182342403
commit
0855805ce2
1
.gitignore
vendored
1
.gitignore
vendored
@ -5,6 +5,7 @@ omp-stream
|
|||||||
acc-stream
|
acc-stream
|
||||||
raja-stream
|
raja-stream
|
||||||
kokkos-stream
|
kokkos-stream
|
||||||
|
std-stream
|
||||||
sycl-stream
|
sycl-stream
|
||||||
hip-stream
|
hip-stream
|
||||||
|
|
||||||
|
|||||||
@ -4,6 +4,7 @@ All notable changes to this project will be documented in this file.
|
|||||||
## [Unreleased]
|
## [Unreleased]
|
||||||
|
|
||||||
### Added
|
### Added
|
||||||
|
- New implementation using the C++ parallel STL.
|
||||||
- Compiler options for OpenMP and OpenACC GNU offloading to NVIDIA and AMD.
|
- Compiler options for OpenMP and OpenACC GNU offloading to NVIDIA and AMD.
|
||||||
- Compiler options for Arm Clang added to OpenMP and Kokkos.
|
- Compiler options for Arm Clang added to OpenMP and Kokkos.
|
||||||
- Kokkos 3 build system (No code changes made).
|
- Kokkos 3 build system (No code changes made).
|
||||||
|
|||||||
@ -15,6 +15,7 @@ Currently implemented are:
|
|||||||
- CUDA
|
- CUDA
|
||||||
- OpenACC
|
- OpenACC
|
||||||
- OpenMP 3 and 4.5
|
- OpenMP 3 and 4.5
|
||||||
|
- C++ Parallel STL
|
||||||
- Kokkos
|
- Kokkos
|
||||||
- RAJA
|
- RAJA
|
||||||
- SYCL
|
- SYCL
|
||||||
|
|||||||
14
STD.make
Normal file
14
STD.make
Normal file
@ -0,0 +1,14 @@
|
|||||||
|
# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
#
# For full license terms please see the LICENSE file distributed with this
# source code

# Build rules for the C++ parallel STL version of the benchmark (std-stream).
# -DSTD selects the STDStream branch of the #if chain in main.cpp.
CXXFLAGS=-O3 -std=c++17 -stdpar -DSTD
STD_CXX=nvc++

# EXTRA_FLAGS is not defined in this file; it expands to nothing unless it is
# supplied from the command line or the environment.
# Recipe lines below must begin with a tab character, as make requires.
std-stream: main.cpp STDStream.cpp
	$(STD_CXX) $(CXXFLAGS) $^ $(EXTRA_FLAGS) -o $@

.PHONY: clean
clean:
	rm -f std-stream
|
||||||
89
STDStream.cpp
Normal file
89
STDStream.cpp
Normal file
@ -0,0 +1,89 @@
|
|||||||
|
// Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
|
||||||
|
//
|
||||||
|
// For full license terms please see the LICENSE file distributed with this
|
||||||
|
// source code
|
||||||
|
|
||||||
|
#include "STDStream.h"
|
||||||
|
|
||||||
|
#include <algorithm>
|
||||||
|
#include <execution>
|
||||||
|
#include <numeric>
|
||||||
|
|
||||||
|
// Execution policy passed to every parallel algorithm in this file.
// There are three execution policies to choose from:
//   std::execution::seq
//   std::execution::par
//   std::execution::par_unseq
// Swap the initializer below to try a different one.
// The object is static constexpr so it cannot be modified at run time and is
// not exported from this translation unit.
static constexpr auto exe_policy = std::execution::par_unseq;
|
||||||
|
|
||||||
|
template <class T>
|
||||||
|
STDStream<T>::STDStream(const unsigned int ARRAY_SIZE, T *a, T *b, T *c, int device)
|
||||||
|
noexcept : array_size{ARRAY_SIZE}, a{a}, b{b}, c{c}
|
||||||
|
{
|
||||||
|
}
|
||||||
|
|
||||||
|
template <class T>
|
||||||
|
void STDStream<T>::init_arrays(T initA, T initB, T initC)
|
||||||
|
{
|
||||||
|
std::fill(exe_policy, a, a+array_size, initA);
|
||||||
|
std::fill(exe_policy, b, b+array_size, initB);
|
||||||
|
std::fill(exe_policy, c, c+array_size, initC);
|
||||||
|
}
|
||||||
|
|
||||||
|
template <class T>
|
||||||
|
void STDStream<T>::read_arrays(std::vector<T>& h_a, std::vector<T>& h_b, std::vector<T>& h_c)
|
||||||
|
{
|
||||||
|
std::copy(exe_policy, a, a+array_size, h_a.data());
|
||||||
|
std::copy(exe_policy, b, b+array_size, h_b.data());
|
||||||
|
std::copy(exe_policy, c, c+array_size, h_c.data());
|
||||||
|
}
|
||||||
|
|
||||||
|
template <class T>
|
||||||
|
void STDStream<T>::copy()
|
||||||
|
{
|
||||||
|
// c[i] = a[i]
|
||||||
|
std::copy(exe_policy, a, a+array_size, c) ;
|
||||||
|
}
|
||||||
|
|
||||||
|
template <class T>
|
||||||
|
void STDStream<T>::mul()
|
||||||
|
{
|
||||||
|
// b[i] = scalar * c[i];
|
||||||
|
std::transform(exe_policy, c, c+array_size, b, [](T ci){ return startScalar*ci; });
|
||||||
|
}
|
||||||
|
|
||||||
|
template <class T>
|
||||||
|
void STDStream<T>::add()
|
||||||
|
{
|
||||||
|
// c[i] = a[i] + b[i];
|
||||||
|
std::transform(exe_policy, a, a+array_size, b, c, std::plus<T>());
|
||||||
|
}
|
||||||
|
|
||||||
|
template <class T>
|
||||||
|
void STDStream<T>::triad()
|
||||||
|
{
|
||||||
|
// a[i] = b[i] + scalar * c[i];
|
||||||
|
std::transform(exe_policy, b, b+array_size, c, a, [](T bi, T ci){ return bi+startScalar*ci; });
|
||||||
|
}
|
||||||
|
|
||||||
|
template <class T>
|
||||||
|
T STDStream<T>::dot()
|
||||||
|
{
|
||||||
|
// sum = 0; sum += a[i]*b[i]; return sum;
|
||||||
|
return std::transform_reduce(exe_policy, a, a+array_size, b, 0.0);
|
||||||
|
}
|
||||||
|
|
||||||
|
// Prints a fixed notice to standard output; no devices are enumerated.
void listDevices()
{
  static const char notice[] = "Listing devices is not supported by the Parallel STL";
  std::cout << notice << std::endl;
}
|
||||||
|
|
||||||
|
// Returns a fixed placeholder string; the device index argument is ignored.
std::string getDeviceName(const int)
{
  return "Device name unavailable";
}
|
||||||
|
|
||||||
|
// Returns a fixed placeholder string; the device index argument is ignored.
std::string getDeviceDriver(const int)
{
  return "Device driver unavailable";
}
|
||||||
|
// Explicit instantiations: the member functions of STDStream are defined in
// this file, so the float and double versions are instantiated here.
template class STDStream<float>;
|
||||||
|
template class STDStream<double>;
|
||||||
38
STDStream.h
Normal file
38
STDStream.h
Normal file
@ -0,0 +1,38 @@
|
|||||||
|
// Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
|
||||||
|
//
|
||||||
|
// For full license terms please see the LICENSE file distributed with this
|
||||||
|
// source code
|
||||||
|
|
||||||
|
#pragma once
|
||||||
|
|
||||||
|
#include <iostream>
|
||||||
|
#include <stdexcept>
|
||||||
|
#include "Stream.h"
|
||||||
|
|
||||||
|
#define IMPLEMENTATION_STRING "STD"
|
||||||
|
|
||||||
|
template <class T>
|
||||||
|
class STDStream : public Stream<T>
|
||||||
|
{
|
||||||
|
protected:
|
||||||
|
// Size of arrays
|
||||||
|
unsigned int array_size;
|
||||||
|
|
||||||
|
// Device side pointers
|
||||||
|
T *a;
|
||||||
|
T *b;
|
||||||
|
T *c;
|
||||||
|
|
||||||
|
public:
|
||||||
|
STDStream(const unsigned int, T*, T*, T*, int);
|
||||||
|
~STDStream() = default;
|
||||||
|
|
||||||
|
virtual void copy() override;
|
||||||
|
virtual void add() override;
|
||||||
|
virtual void mul() override;
|
||||||
|
virtual void triad() override;
|
||||||
|
virtual T dot() override;
|
||||||
|
|
||||||
|
virtual void init_arrays(T initA, T initB, T initC) override;
|
||||||
|
virtual void read_arrays(std::vector<T>& a, std::vector<T>& b, std::vector<T>& c) override;
|
||||||
|
};
|
||||||
10
main.cpp
10
main.cpp
@ -21,6 +21,8 @@
|
|||||||
|
|
||||||
#if defined(CUDA)
|
#if defined(CUDA)
|
||||||
#include "CUDAStream.h"
|
#include "CUDAStream.h"
|
||||||
|
#elif defined(STD)
|
||||||
|
#include "STDStream.h"
|
||||||
#elif defined(HIP)
|
#elif defined(HIP)
|
||||||
#include "HIPStream.h"
|
#include "HIPStream.h"
|
||||||
#elif defined(HC)
|
#elif defined(HC)
|
||||||
@ -162,6 +164,10 @@ void run()
|
|||||||
// Use the Kokkos implementation
|
// Use the Kokkos implementation
|
||||||
stream = new KokkosStream<T>(ARRAY_SIZE, deviceIndex);
|
stream = new KokkosStream<T>(ARRAY_SIZE, deviceIndex);
|
||||||
|
|
||||||
|
#elif defined(STD)
|
||||||
|
// Use the STD implementation
|
||||||
|
stream = new STDStream<T>(ARRAY_SIZE, a.data(), b.data(), c.data(), deviceIndex);
|
||||||
|
|
||||||
#elif defined(ACC)
|
#elif defined(ACC)
|
||||||
// Use the OpenACC implementation
|
// Use the OpenACC implementation
|
||||||
stream = new ACCStream<T>(ARRAY_SIZE, a.data(), b.data(), c.data(), deviceIndex);
|
stream = new ACCStream<T>(ARRAY_SIZE, a.data(), b.data(), c.data(), deviceIndex);
|
||||||
@ -363,6 +369,10 @@ void run_triad()
|
|||||||
// Use the OpenACC implementation
|
// Use the OpenACC implementation
|
||||||
stream = new ACCStream<T>(ARRAY_SIZE, a.data(), b.data(), c.data(), deviceIndex);
|
stream = new ACCStream<T>(ARRAY_SIZE, a.data(), b.data(), c.data(), deviceIndex);
|
||||||
|
|
||||||
|
#elif defined(STD)
|
||||||
|
// Use the STD implementation
|
||||||
|
stream = new STDStream<T>(ARRAY_SIZE, a.data(), b.data(), c.data(), deviceIndex);
|
||||||
|
|
||||||
#elif defined(SYCL)
|
#elif defined(SYCL)
|
||||||
// Use the SYCL implementation
|
// Use the SYCL implementation
|
||||||
stream = new SYCLStream<T>(ARRAY_SIZE, deviceIndex);
|
stream = new SYCLStream<T>(ARRAY_SIZE, deviceIndex);
|
||||||
|
|||||||
Loading…
Reference in New Issue
Block a user