Add NVIDIA HPC SDK C++ parallel STL implementation
This commit adds an implementation using the C++ parallel STL. The Makefile uses the NVIDIA HPC SDK `nvc++` compiler with the `-stdpar` flag. Tested using the NVIDIA HPC SDK 20.9.
This commit is contained in:
parent
5182342403
commit
0855805ce2
1
.gitignore
vendored
1
.gitignore
vendored
@ -5,6 +5,7 @@ omp-stream
|
||||
acc-stream
|
||||
raja-stream
|
||||
kokkos-stream
|
||||
std-stream
|
||||
sycl-stream
|
||||
hip-stream
|
||||
|
||||
|
||||
@ -4,6 +4,7 @@ All notable changes to this project will be documented in this file.
|
||||
## [Unreleased]
|
||||
|
||||
### Added
|
||||
- New implementation using the C++ parallel STL.
|
||||
- Compiler options for OpenMP and OpenACC GNU offloading to NVIDIA and AMD.
|
||||
- Compiler options for Arm Clang added to OpenMP and Kokkos.
|
||||
- Kokkos 3 build system (No code changes made).
|
||||
|
||||
@ -15,6 +15,7 @@ Currently implemented are:
|
||||
- CUDA
|
||||
- OpenACC
|
||||
- OpenMP 3 and 4.5
|
||||
- C++ Parallel STL
|
||||
- Kokkos
|
||||
- RAJA
|
||||
- SYCL
|
||||
|
||||
14
STD.make
Normal file
14
STD.make
Normal file
@ -0,0 +1,14 @@
|
||||
# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
|
||||
#
|
||||
# For full license terms please see the LICENSE file distributed with this
|
||||
# source code
|
||||
|
||||
# Compiler driver and flags for the C++ parallel STL build.
# -DSTD selects the STDStream implementation in main.cpp; -stdpar is the
# nvc++ switch that enables the parallel standard algorithms.
STD_CXX = nvc++
CXXFLAGS = -O3 -std=c++17 -stdpar -DSTD

# Build the benchmark from the driver and the STD implementation.
# EXTRA_FLAGS may be supplied on the command line to append further options.
std-stream: main.cpp STDStream.cpp
	$(STD_CXX) $(CXXFLAGS) $^ $(EXTRA_FLAGS) -o $@

# Remove the built executable.
.PHONY: clean
clean:
	rm -f std-stream
|
||||
89
STDStream.cpp
Normal file
89
STDStream.cpp
Normal file
@ -0,0 +1,89 @@
|
||||
// Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
|
||||
//
|
||||
// For full license terms please see the LICENSE file distributed with this
|
||||
// source code
|
||||
|
||||
#include "STDStream.h"
|
||||
|
||||
#include <algorithm>
|
||||
#include <execution>
|
||||
#include <numeric>
|
||||
|
||||
// Execution policy handed to every algorithm call in this file.
// The standard library offers three candidates:
//   std::execution::seq
//   std::execution::par
//   std::execution::par_unseq
// The policy is a compile-time constant with internal linkage so that it can
// neither be modified at run time nor clash with a same-named symbol defined
// in another translation unit.
static constexpr auto exe_policy = std::execution::par_unseq;
|
||||
|
||||
template <class T>
|
||||
STDStream<T>::STDStream(const unsigned int ARRAY_SIZE, T *a, T *b, T *c, int device)
|
||||
noexcept : array_size{ARRAY_SIZE}, a{a}, b{b}, c{c}
|
||||
{
|
||||
}
|
||||
|
||||
template <class T>
|
||||
void STDStream<T>::init_arrays(T initA, T initB, T initC)
|
||||
{
|
||||
std::fill(exe_policy, a, a+array_size, initA);
|
||||
std::fill(exe_policy, b, b+array_size, initB);
|
||||
std::fill(exe_policy, c, c+array_size, initC);
|
||||
}
|
||||
|
||||
template <class T>
|
||||
void STDStream<T>::read_arrays(std::vector<T>& h_a, std::vector<T>& h_b, std::vector<T>& h_c)
|
||||
{
|
||||
std::copy(exe_policy, a, a+array_size, h_a.data());
|
||||
std::copy(exe_policy, b, b+array_size, h_b.data());
|
||||
std::copy(exe_policy, c, c+array_size, h_c.data());
|
||||
}
|
||||
|
||||
template <class T>
|
||||
void STDStream<T>::copy()
|
||||
{
|
||||
// c[i] = a[i]
|
||||
std::copy(exe_policy, a, a+array_size, c) ;
|
||||
}
|
||||
|
||||
template <class T>
|
||||
void STDStream<T>::mul()
|
||||
{
|
||||
// b[i] = scalar * c[i];
|
||||
std::transform(exe_policy, c, c+array_size, b, [](T ci){ return startScalar*ci; });
|
||||
}
|
||||
|
||||
template <class T>
|
||||
void STDStream<T>::add()
|
||||
{
|
||||
// c[i] = a[i] + b[i];
|
||||
std::transform(exe_policy, a, a+array_size, b, c, std::plus<T>());
|
||||
}
|
||||
|
||||
template <class T>
|
||||
void STDStream<T>::triad()
|
||||
{
|
||||
// a[i] = b[i] + scalar * c[i];
|
||||
std::transform(exe_policy, b, b+array_size, c, a, [](T bi, T ci){ return bi+startScalar*ci; });
|
||||
}
|
||||
|
||||
template <class T>
|
||||
T STDStream<T>::dot()
|
||||
{
|
||||
// sum = 0; sum += a[i]*b[i]; return sum;
|
||||
return std::transform_reduce(exe_policy, a, a+array_size, b, 0.0);
|
||||
}
|
||||
|
||||
// Prints a fixed notice to standard output: this implementation cannot
// enumerate devices.
void listDevices(void)
{
  static const char notice[] = "Listing devices is not supported by the Parallel STL";
  std::cout << notice;
  std::cout << std::endl;
}
|
||||
|
||||
// Returns a fixed placeholder string; the device index argument is ignored.
std::string getDeviceName(const int)
{
  static const char placeholder[] = "Device name unavailable";
  return placeholder;
}
|
||||
|
||||
// Returns a fixed placeholder string; the device index argument is ignored.
std::string getDeviceDriver(const int)
{
  static const char placeholder[] = "Device driver unavailable";
  return placeholder;
}
|
||||
// Explicit instantiations: the member templates are defined in this file, so
// the two element types that are provided are instantiated here.
template class STDStream<double>;
template class STDStream<float>;
|
||||
38
STDStream.h
Normal file
38
STDStream.h
Normal file
@ -0,0 +1,38 @@
|
||||
// Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
|
||||
//
|
||||
// For full license terms please see the LICENSE file distributed with this
|
||||
// source code
|
||||
|
||||
#pragma once
|
||||
|
||||
#include <iostream>
|
||||
#include <stdexcept>
|
||||
#include "Stream.h"
|
||||
|
||||
#define IMPLEMENTATION_STRING "STD"
|
||||
|
||||
template <class T>
|
||||
class STDStream : public Stream<T>
|
||||
{
|
||||
protected:
|
||||
// Size of arrays
|
||||
unsigned int array_size;
|
||||
|
||||
// Device side pointers
|
||||
T *a;
|
||||
T *b;
|
||||
T *c;
|
||||
|
||||
public:
|
||||
STDStream(const unsigned int, T*, T*, T*, int);
|
||||
~STDStream() = default;
|
||||
|
||||
virtual void copy() override;
|
||||
virtual void add() override;
|
||||
virtual void mul() override;
|
||||
virtual void triad() override;
|
||||
virtual T dot() override;
|
||||
|
||||
virtual void init_arrays(T initA, T initB, T initC) override;
|
||||
virtual void read_arrays(std::vector<T>& a, std::vector<T>& b, std::vector<T>& c) override;
|
||||
};
|
||||
10
main.cpp
10
main.cpp
@ -21,6 +21,8 @@
|
||||
|
||||
#if defined(CUDA)
|
||||
#include "CUDAStream.h"
|
||||
#elif defined(STD)
|
||||
#include "STDStream.h"
|
||||
#elif defined(HIP)
|
||||
#include "HIPStream.h"
|
||||
#elif defined(HC)
|
||||
@ -162,6 +164,10 @@ void run()
|
||||
// Use the Kokkos implementation
|
||||
stream = new KokkosStream<T>(ARRAY_SIZE, deviceIndex);
|
||||
|
||||
#elif defined(STD)
|
||||
// Use the STD implementation
|
||||
stream = new STDStream<T>(ARRAY_SIZE, a.data(), b.data(), c.data(), deviceIndex);
|
||||
|
||||
#elif defined(ACC)
|
||||
// Use the OpenACC implementation
|
||||
stream = new ACCStream<T>(ARRAY_SIZE, a.data(), b.data(), c.data(), deviceIndex);
|
||||
@ -363,6 +369,10 @@ void run_triad()
|
||||
// Use the OpenACC implementation
|
||||
stream = new ACCStream<T>(ARRAY_SIZE, a.data(), b.data(), c.data(), deviceIndex);
|
||||
|
||||
#elif defined(STD)
|
||||
// Use the STD implementation
|
||||
stream = new STDStream<T>(ARRAY_SIZE, a.data(), b.data(), c.data(), deviceIndex);
|
||||
|
||||
#elif defined(SYCL)
|
||||
// Use the SYCL implementation
|
||||
stream = new SYCLStream<T>(ARRAY_SIZE, deviceIndex);
|
||||
|
||||
Loading…
Reference in New Issue
Block a user