Merge pull request #84 from gonzalobg/cxx_parallel_stl

Add NVIDIA HPC SDK C++ parallel STL implementation
2020-12-03 14:15:45 +00:00 · 2020-12-03 14:15:45 +00:00 · f271d5563d
commit f271d5563d
parent 5182342403 0855805ce2
7 changed files with 154 additions and 0 deletions
--- a/.gitignore
+++ b/.gitignore
@ -5,6 +5,7 @@ omp-stream
 acc-stream
 raja-stream
 kokkos-stream
 std-stream
 sycl-stream
 hip-stream
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@ -4,6 +4,7 @@ All notable changes to this project will be documented in this file.
 ## [Unreleased]
 ### Added
 - New implementation using the C++ parallel STL.
 - Compiler options for OpenMP and OpenACC GNU offloading to NVIDIA and AMD.
 - Compiler options for Arm Clang added to OpenMP and Kokkos.
 - Kokkos 3 build system (No code changes made).
--- a/README.md
+++ b/README.md
@ -15,6 +15,7 @@ Currently implemented are:
  - CUDA
  - OpenACC
  - OpenMP 3 and 4.5
  - C++ Parallel STL
  - Kokkos
  - RAJA
  - SYCL
--- a/STD.make
+++ b/STD.make
@ -0,0 +1,14 @@
 # Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
 #
 # For full license terms please see the LICENSE file distributed with this
 # source code
 # Compiler flags: -O3 optimisation, C++17 language level, -stdpar (the
 # NVIDIA HPC SDK switch for C++ standard parallelism; see the nvc++
 # documentation), and -DSTD, which defines the STD macro that main.cpp tests
 # to select the STDStream implementation.
 CXXFLAGS=-O3 -std=c++17 -stdpar -DSTD
 # Compiler driver used to build this target.
 STD_CXX=nvc++
 # Compiles and links both sources in a single step; EXTRA_FLAGS is expanded
 # after the source files so additional options can be supplied when invoking make.
 std-stream: main.cpp STDStream.cpp
 	$(STD_CXX) $(CXXFLAGS) $^ $(EXTRA_FLAGS) -o $@
 # 'clean' names an action rather than a file, so it is declared phony.
 .PHONY: clean
 # Removes the executable produced by the std-stream rule.
 clean:
 	rm -f std-stream
--- a/STDStream.cpp
+++ b/STDStream.cpp
@ -0,0 +1,89 @@
 // Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
 //
 // For full license terms please see the LICENSE file distributed with this
 // source code
 #include "STDStream.h"
 #include <algorithm>
 #include <execution>
 #include <numeric>
 // Execution policy handed to every parallel algorithm in this file.
 // The standard library offers three choices:
 //   std::execution::seq
 //   std::execution::par
 //   std::execution::par_unseq
 // The selection is a compile-time constant with internal linkage, so it can
 // neither be reassigned at run time nor collide with a symbol of the same
 // name in another translation unit.
 static constexpr auto exe_policy = std::execution::par_unseq;
 // Builds a stream over three caller-provided arrays.
 //
 // Only the element count and the three pointers are recorded; nothing is
 // allocated or copied here.
 //
 //   ARRAY_SIZE  number of elements in each array
 //   a, b, c     the arrays the kernels read and write
 //   device      not used by this constructor
 template <class T>
 STDStream<T>::STDStream(const unsigned int ARRAY_SIZE, T *a, T *b, T *c, int device) noexcept
   : array_size(ARRAY_SIZE),
     a(a),
     b(b),
     c(c)
 {
   // Intentionally empty: all state is set by the member initialisers above.
 }
 // Sets every element of a to initA, of b to initB and of c to initC.
 template <class T>
 void STDStream<T>::init_arrays(T initA, T initB, T initC)
 {
   // All three arrays hold the same number of elements.
   const unsigned int count = this->array_size;

   std::fill(exe_policy, this->a, this->a + count, initA);
   std::fill(exe_policy, this->b, this->b + count, initB);
   std::fill(exe_policy, this->c, this->c + count, initC);
 }
 // Copies the current contents of a, b and c into h_a, h_b and h_c.
 //
 // Each destination vector must already hold at least array_size elements;
 // the vectors are written through data() and are never resized here.
 template <class T>
 void STDStream<T>::read_arrays(std::vector<T>& h_a, std::vector<T>& h_b, std::vector<T>& h_c)
 {
   // The stream works on caller-provided storage (main.cpp constructs it from
   // a.data(), b.data() and c.data()), so a destination vector may be the very
   // buffer this stream wraps. std::copy requires that source and destination
   // do not overlap, and in that case the data is already in place, so the
   // self-copy is skipped.
   if (h_a.data() != a)
   {
     std::copy(exe_policy, a, a+array_size, h_a.data());
   }
   if (h_b.data() != b)
   {
     std::copy(exe_policy, b, b+array_size, h_b.data());
   }
   if (h_c.data() != c)
   {
     std::copy(exe_policy, c, c+array_size, h_c.data());
   }
 }
 // Copy kernel: c[i] = a[i] for every element.
 template <class T>
 void STDStream<T>::copy()
 {
   T* const src_begin = a;
   T* const src_end = src_begin + array_size;
   std::copy(exe_policy, src_begin, src_end, c);
 }
 // Scale kernel: b[i] = startScalar * c[i] for every element.
 template <class T>
 void STDStream<T>::mul()
 {
   // Capture-less element operation, kept separate so the call below reads
   // as "transform c into b using scale".
   const auto scale = [](T ci) { return startScalar * ci; };

   T* const src_end = c + array_size;
   std::transform(exe_policy, c, src_end, b, scale);
 }
 // Add kernel: c[i] = a[i] + b[i] for every element.
 template <class T>
 void STDStream<T>::add()
 {
   T* const lhs_end = a + array_size;
   std::transform(exe_policy, a, lhs_end, b, c, std::plus<T>{});
 }
 // Triad kernel: a[i] = b[i] + startScalar * c[i] for every element.
 template <class T>
 void STDStream<T>::triad()
 {
   // Capture-less element operation combining one value from b with one from c.
   const auto scaled_sum = [](T bi, T ci) { return bi + startScalar * ci; };

   T* const lhs_end = b + array_size;
   std::transform(exe_policy, b, lhs_end, c, a, scaled_sum);
 }
 // Dot kernel: returns the sum over all elements of a[i] * b[i].
 template <class T>
 T STDStream<T>::dot()
 {
   T* const lhs_end = a + array_size;

   // The seed is the double literal 0.0, so the reduction yields a double
   // whatever T is; the result is converted to T on return.
   const auto sum = std::transform_reduce(exe_policy, a, lhs_end, b, 0.0);
   return sum;
 }
 // Prints a fixed notice to standard output instead of a device list.
 void listDevices(void)
 {
   static const char notice[] = "Listing devices is not supported by the Parallel STL";
   std::cout << notice << std::endl;
 }
 // Returns a fixed placeholder; the device index argument is ignored.
 std::string getDeviceName(const int)
 {
   const std::string placeholder{"Device name unavailable"};
   return placeholder;
 }
 // Returns a fixed placeholder; the device index argument is ignored.
 std::string getDeviceDriver(const int)
 {
   const std::string placeholder{"Device driver unavailable"};
   return placeholder;
 }
 // The member functions of STDStream are defined in this source file rather
 // than in the header, so the class is explicitly instantiated here for the
 // two element types provided: float and double.
 template class STDStream<float>;
 template class STDStream<double>;
--- a/STDStream.h
+++ b/STDStream.h
@ -0,0 +1,38 @@
 // Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
 //
 // For full license terms please see the LICENSE file distributed with this
 // source code
 #pragma once
 #include <iostream>
 #include <stdexcept>
 #include "Stream.h"
 #define IMPLEMENTATION_STRING "STD"
 // Stream implementation whose kernels are expressed with C++ standard
 // library algorithms (see STDStream.cpp for the definitions).
 //
 // The class does not own any memory: it records three caller-provided arrays
 // and their common length, and the defaulted destructor releases nothing.
 template <class T>
 class STDStream : public Stream<T>
 {
   protected:
     // Number of elements in each of the three arrays
     unsigned int array_size;
     // Caller-provided arrays the kernels operate on (not owned by this class)
     T *a;
     T *b;
     T *c;
   public:
     // Arguments: element count, the arrays a, b and c, and a device index.
     // Declared noexcept to agree with the out-of-line definition in
     // STDStream.cpp; every declaration of a function must carry a compatible
     // exception specification.
     STDStream(const unsigned int, T*, T*, T*, int) noexcept;
     ~STDStream() = default;
     // c[i] = a[i]
     void copy() override;
     // c[i] = a[i] + b[i]
     void add() override;
     // b[i] = scalar * c[i]
     void mul() override;
     // a[i] = b[i] + scalar * c[i]
     void triad() override;
     // Returns the sum of a[i] * b[i] over all elements
     T dot() override;
     // Fills a, b and c with initA, initB and initC respectively
     void init_arrays(T initA, T initB, T initC) override;
     // Copies the contents of the three arrays into the given vectors
     void read_arrays(std::vector<T>& a, std::vector<T>& b, std::vector<T>& c) override;
 };
--- a/main.cpp
+++ b/main.cpp
@ -21,6 +21,8 @@
 #if defined(CUDA)
 #include "CUDAStream.h"
 #elif defined(STD)
 #include "STDStream.h"
 #elif defined(HIP)
 #include "HIPStream.h"
 #elif defined(HC)
@ -162,6 +164,10 @@ void run()
  // Use the Kokkos implementation
  stream = new KokkosStream<T>(ARRAY_SIZE, deviceIndex);
 #elif defined(STD)
  // Use the STD implementation
  stream = new STDStream<T>(ARRAY_SIZE, a.data(), b.data(), c.data(), deviceIndex);
 #elif defined(ACC)
  // Use the OpenACC implementation
  stream = new ACCStream<T>(ARRAY_SIZE, a.data(), b.data(), c.data(), deviceIndex);
@ -363,6 +369,10 @@ void run_triad()
  // Use the OpenACC implementation
  stream = new ACCStream<T>(ARRAY_SIZE, a.data(), b.data(), c.data(), deviceIndex);
 #elif defined(STD)
  // Use the STD implementation
  stream = new STDStream<T>(ARRAY_SIZE, a.data(), b.data(), c.data(), deviceIndex);
 #elif defined(SYCL)
  // Use the SYCL implementation
  stream = new SYCLStream<T>(ARRAY_SIZE, deviceIndex);