Add NVIDIA HPC SDK C++ parallel STL implementation

This commit adds an implementation using the C++ parallel STL. The Makefile uses the NVIDIA HPC SDK `nvc++` compiler with the `-stdpar` flag. Tested using the NVIDIA HPC SDK 20.9.
2020-11-17 03:05:17 -08:00 · 2020-11-17 03:05:17 -08:00 · 0855805ce2
commit 0855805ce2
parent 5182342403
7 changed files with 154 additions and 0 deletions
--- a/.gitignore
+++ b/.gitignore
@ -5,6 +5,7 @@ omp-stream
 acc-stream
 raja-stream
 kokkos-stream
+std-stream
 sycl-stream
 hip-stream

--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@ -4,6 +4,7 @@ All notable changes to this project will be documented in this file.
 ## [Unreleased]

 ### Added
+- New implementation using the C++ parallel STL.
 - Compiler options for OpenMP and OpenACC GNU offloading to NVIDIA and AMD.
 - Compiler options for Arm Clang added to OpenMP and Kokkos.
 - Kokkos 3 build system (No code changes made).
--- a/README.md
+++ b/README.md
@ -15,6 +15,7 @@ Currently implemented are:
  - CUDA
  - OpenACC
  - OpenMP 3 and 4.5
+  - C++ Parallel STL
  - Kokkos
  - RAJA
  - SYCL
--- a/STD.make
+++ b/STD.make
@ -0,0 +1,14 @@
+# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
+#
+# For full license terms please see the LICENSE file distributed with this
+# source code
+
+# Compiler driver used for this implementation
+STD_CXX = nvc++
+# -stdpar is passed together with C++17 and -O3; -DSTD defines the STD macro
+# that main.cpp tests to select this implementation
+CXXFLAGS = -O3 -std=c++17 -stdpar -DSTD
+
+# Build the binary from both sources; EXTRA_FLAGS is appended after them
+std-stream: main.cpp STDStream.cpp
+	$(STD_CXX) $(CXXFLAGS) $^ $(EXTRA_FLAGS) -o $@
+
+# Remove the built binary
+.PHONY: clean
+clean:
+	rm -f std-stream
--- a/STDStream.cpp
+++ b/STDStream.cpp
@ -0,0 +1,89 @@
+// Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
+//
+// For full license terms please see the LICENSE file distributed with this
+// source code
+
+#include "STDStream.h"
+
+#include <algorithm>
+#include <execution>
+#include <numeric>
+
+// Execution policy handed to every standard algorithm call in this file.
+// The other two policies can be selected instead:
+// static constexpr auto exe_policy = std::execution::seq;
+// static constexpr auto exe_policy = std::execution::par;
+// Declared static constexpr so the policy is an immutable, file-local
+// constant rather than a mutable global with external linkage.
+static constexpr auto exe_policy = std::execution::par_unseq;
+
+// Records the element count and the three array pointers supplied by the
+// caller; nothing is allocated or copied here. The device index is accepted
+// but not used by this implementation.
+template <class T>
+STDStream<T>::STDStream(const unsigned int ARRAY_SIZE, T *a, T *b, T *c, int device) noexcept
+  : array_size{ARRAY_SIZE},
+    a{a},
+    b{b},
+    c{c}
+{
+}
+
+// Sets every element of a, b and c to initA, initB and initC respectively.
+template <class T>
+void STDStream<T>::init_arrays(T initA, T initB, T initC)
+{
+  T *const a_end = a + array_size;
+  T *const b_end = b + array_size;
+  T *const c_end = c + array_size;
+
+  std::fill(exe_policy, a, a_end, initA);
+  std::fill(exe_policy, b, b_end, initB);
+  std::fill(exe_policy, c, c_end, initC);
+}
+
+// Copies array_size elements from each of a, b and c into h_a, h_b and h_c.
+// The destination vectors are not resized here, so each one must already
+// hold at least array_size elements.
+template <class T>
+void STDStream<T>::read_arrays(std::vector<T>& h_a, std::vector<T>& h_b, std::vector<T>& h_c)
+{
+  T *const out_a = h_a.data();
+  T *const out_b = h_b.data();
+  T *const out_c = h_c.data();
+
+  std::copy(exe_policy, a, a + array_size, out_a);
+  std::copy(exe_policy, b, b + array_size, out_b);
+  std::copy(exe_policy, c, c + array_size, out_c);
+}
+
+// Copy kernel: c[i] = a[i]
+template <class T>
+void STDStream<T>::copy()
+{
+  T *const src_end = a + array_size;
+  std::copy(exe_policy, a, src_end, c);
+}
+
+// Mul kernel: b[i] = startScalar * c[i]
+// (startScalar is defined outside this file.)
+template <class T>
+void STDStream<T>::mul()
+{
+  const auto scale = [](T ci) { return startScalar * ci; };
+  std::transform(exe_policy, c, c + array_size, b, scale);
+}
+
+// Add kernel: c[i] = a[i] + b[i]
+template <class T>
+void STDStream<T>::add()
+{
+  const auto sum = [](T ai, T bi) { return ai + bi; };
+  std::transform(exe_policy, a, a + array_size, b, c, sum);
+}
+
+// Triad kernel: a[i] = b[i] + startScalar * c[i]
+// (startScalar is defined outside this file.)
+template <class T>
+void STDStream<T>::triad()
+{
+  const auto scaled_sum = [](T bi, T ci) { return bi + startScalar * ci; };
+  std::transform(exe_policy, b, b + array_size, c, a, scaled_sum);
+}
+
+// Dot kernel: returns the sum over all i of a[i] * b[i].
+// The initial value is T{} so the reduction is carried out in T. A 0.0
+// literal would make transform_reduce accumulate in and return double even
+// when T is float, and the result would only be narrowed on return.
+template <class T>
+T STDStream<T>::dot()
+{
+  return std::transform_reduce(exe_policy, a, a+array_size, b, T{});
+}
+
+// Prints a fixed notice to standard output; no devices are enumerated.
+void listDevices(void)
+{
+  const char *const notice = "Listing devices is not supported by the Parallel STL";
+  std::cout << notice << std::endl;
+}
+
+// Returns a fixed placeholder string; the device index argument is ignored.
+std::string getDeviceName(const int)
+{
+  return "Device name unavailable";
+}
+
+// Returns a fixed placeholder string; the device index argument is ignored.
+std::string getDeviceDriver(const int)
+{
+  return "Device driver unavailable";
+}
+// Explicitly instantiate the class template for double and float.
+template class STDStream<double>;
+template class STDStream<float>;
--- a/STDStream.h
+++ b/STDStream.h
@ -0,0 +1,38 @@
+// Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
+//
+// For full license terms please see the LICENSE file distributed with this
+// source code
+
+#pragma once
+
+#include <iostream>
+#include <stdexcept>
+#include "Stream.h"
+
+#define IMPLEMENTATION_STRING "STD"
+
+// Stream implementation whose kernels are written with C++17 standard
+// library algorithms (definitions are in STDStream.cpp).
+template <class T>
+class STDStream : public Stream<T>
+{
+  protected:
+    // Number of elements in each array
+    unsigned int array_size;
+
+    // Array pointers passed to the constructor; this class declares no
+    // allocation or release for them
+    T *a;
+    T *b;
+    T *c;
+
+  public:
+    // Arguments, in order: element count, pointers for a, b and c, and a
+    // device index. Declared noexcept so the declaration carries the same
+    // exception specification as the out-of-class definition in
+    // STDStream.cpp; a mismatch between the two is ill-formed.
+    STDStream(const unsigned int, T*, T*, T*, int) noexcept;
+    ~STDStream() = default;
+
+    // Kernels
+    virtual void copy() override;   // c[i] = a[i]
+    virtual void add() override;    // c[i] = a[i] + b[i]
+    virtual void mul() override;    // b[i] = scalar * c[i]
+    virtual void triad() override;  // a[i] = b[i] + scalar * c[i]
+    virtual T dot() override;       // sum of a[i] * b[i]
+
+    // Fill a, b and c with the given initial values
+    virtual void init_arrays(T initA, T initB, T initC) override;
+    // Copy the contents of a, b and c into the supplied vectors
+    virtual void read_arrays(std::vector<T>& a, std::vector<T>& b, std::vector<T>& c) override;
+};
--- a/main.cpp
+++ b/main.cpp
@ -21,6 +21,8 @@

 #if defined(CUDA)
 #include "CUDAStream.h"
+#elif defined(STD)
+#include "STDStream.h"
 #elif defined(HIP)
 #include "HIPStream.h"
 #elif defined(HC)
@ -162,6 +164,10 @@ void run()
  // Use the Kokkos implementation
  stream = new KokkosStream<T>(ARRAY_SIZE, deviceIndex);

+#elif defined(STD)
+  // Use the STD implementation
+  stream = new STDStream<T>(ARRAY_SIZE, a.data(), b.data(), c.data(), deviceIndex);
+
 #elif defined(ACC)
  // Use the OpenACC implementation
  stream = new ACCStream<T>(ARRAY_SIZE, a.data(), b.data(), c.data(), deviceIndex);
@ -363,6 +369,10 @@ void run_triad()
  // Use the OpenACC implementation
  stream = new ACCStream<T>(ARRAY_SIZE, a.data(), b.data(), c.data(), deviceIndex);

+#elif defined(STD)
+  // Use the STD implementation
+  stream = new STDStream<T>(ARRAY_SIZE, a.data(), b.data(), c.data(), deviceIndex);
+
 #elif defined(SYCL)
  // Use the SYCL implementation
  stream = new SYCLStream<T>(ARRAY_SIZE, deviceIndex);