From 0855805ce2ac86fa1c47082db96d5db48abede40 Mon Sep 17 00:00:00 2001 From: Gonzalo Brito Gadeschi Date: Tue, 17 Nov 2020 03:05:17 -0800 Subject: [PATCH] Add NVIDIA HPC SDK C++ parallel STL implementation This commit adds an implementation using the C++ parallel STL. The Makefile uses the NVIDIA HPC SDK `nvc++` compiler with the `-stdpar` flag. Tested using the NVIDIA HPC SDK 20.9. --- .gitignore | 1 + CHANGELOG.md | 1 + README.md | 1 + STD.make | 14 ++++++++ STDStream.cpp | 89 +++++++++++++++++++++++++++++++++++++++++++++++++++ STDStream.h | 38 ++++++++++++++++++++++ main.cpp | 10 ++++++ 7 files changed, 154 insertions(+) create mode 100644 STD.make create mode 100644 STDStream.cpp create mode 100644 STDStream.h diff --git a/.gitignore b/.gitignore index 90da5e2..6a98d24 100644 --- a/.gitignore +++ b/.gitignore @@ -5,6 +5,7 @@ omp-stream acc-stream raja-stream kokkos-stream +std-stream sycl-stream hip-stream diff --git a/CHANGELOG.md b/CHANGELOG.md index 75402f2..077580e 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -4,6 +4,7 @@ All notable changes to this project will be documented in this file. ## [Unreleased] ### Added +- New implementation using the C++ parallel STL. - Compiler options for OpenMP and OpenACC GNU offloading to NVIDIA and AMD. - Compiler options for Arm Clang added to OpenMP and Kokkos. - Kokkos 3 build system (No code changes made). diff --git a/README.md b/README.md index e719368..6177b02 100644 --- a/README.md +++ b/README.md @@ -15,6 +15,7 @@ Currently implemented are: - CUDA - OpenACC - OpenMP 3 and 4.5 + - C++ Parallel STL - Kokkos - RAJA - SYCL diff --git a/STD.make b/STD.make new file mode 100644 index 0000000..3225a08 --- /dev/null +++ b/STD.make @@ -0,0 +1,14 @@ +# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. 
+#
+# For full license terms please see the LICENSE file distributed with this
+# source code
+
+CXXFLAGS=-O3 -std=c++17 -stdpar -DSTD
+STD_CXX=nvc++
+
+std-stream: main.cpp STDStream.cpp
+	$(STD_CXX) $(CXXFLAGS) $^ $(EXTRA_FLAGS) -o $@
+
+.PHONY: clean
+clean:
+	rm -f std-stream
diff --git a/STDStream.cpp b/STDStream.cpp
new file mode 100644
index 0000000..15ee68f
--- /dev/null
+++ b/STDStream.cpp
@@ -0,0 +1,103 @@
+// Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
+//
+// For full license terms please see the LICENSE file distributed with this
+// source code
+
+#include "STDStream.h"
+
+#include <algorithm>
+#include <execution>
+#include <numeric>
+
+// Execution policy handed to every standard algorithm call in this file.
+// There are three execution policies:
+// auto exe_policy = std::execution::seq;
+// auto exe_policy = std::execution::par;
+auto exe_policy = std::execution::par_unseq;
+
+// Records the element count and the three caller-supplied buffers; nothing is
+// allocated or copied. The `device` argument is accepted but not used.
+template <class T>
+STDStream<T>::STDStream(const unsigned int ARRAY_SIZE, T *a, T *b, T *c, int device)
+  noexcept : array_size{ARRAY_SIZE}, a{a}, b{b}, c{c}
+{
+}
+
+// Sets every element of a, b and c to initA, initB and initC respectively.
+template <class T>
+void STDStream<T>::init_arrays(T initA, T initB, T initC)
+{
+  std::fill(exe_policy, a, a+array_size, initA);
+  std::fill(exe_policy, b, b+array_size, initB);
+  std::fill(exe_policy, c, c+array_size, initC);
+}
+
+// Copies array_size elements of a, b and c into h_a, h_b and h_c.
+// std::copy requires that source and destination ranges do not overlap.
+// main.cpp builds this stream from the .data() pointers of its own vectors,
+// so a vector passed in here may be the very buffer it would be copied from
+// (assumed; the call site is not part of this patch). That copy is skipped.
+template <class T>
+void STDStream<T>::read_arrays(std::vector<T>& h_a, std::vector<T>& h_b, std::vector<T>& h_c)
+{
+  if (h_a.data() != a) std::copy(exe_policy, a, a+array_size, h_a.data());
+  if (h_b.data() != b) std::copy(exe_policy, b, b+array_size, h_b.data());
+  if (h_c.data() != c) std::copy(exe_policy, c, c+array_size, h_c.data());
+}
+
+template <class T>
+void STDStream<T>::copy()
+{
+  // c[i] = a[i]
+  std::copy(exe_policy, a, a+array_size, c);
+}
+
+template <class T>
+void STDStream<T>::mul()
+{
+  // b[i] = scalar * c[i];
+  // startScalar is not declared in this file (presumably Stream.h provides it).
+  std::transform(exe_policy, c, c+array_size, b, [](T ci){ return startScalar*ci; });
+}
+
+template <class T>
+void STDStream<T>::add()
+{
+  // c[i] = a[i] + b[i];
+  std::transform(exe_policy, a, a+array_size, b, c, std::plus<>());
+}
+
+template <class T>
+void STDStream<T>::triad()
+{
+  // a[i] = b[i] + scalar * c[i];
+  std::transform(exe_policy, b, b+array_size, c, a, [](T bi, T ci){ return bi+startScalar*ci; });
+}
+
+template <class T>
+T STDStream<T>::dot()
+{
+  // sum = 0; sum += a[i]*b[i]; return sum;
+  // The 0.0 seed is a double, so the reduction accumulates in double and the
+  // result is converted to T on return.
+  return std::transform_reduce(exe_policy, a, a+array_size, b, 0.0);
+}
+
+// Device queries: this implementation only reports fixed placeholder text.
+void listDevices(void)
+{
+  std::cout << "Listing devices is not supported by the Parallel STL" << std::endl;
+}
+
+std::string getDeviceName(const int)
+{
+  return std::string("Device name unavailable");
+}
+
+std::string getDeviceDriver(const int)
+{
+  return std::string("Device driver unavailable");
+}
+// Explicitly instantiate the template for float and double.
+template class STDStream<float>;
+template class STDStream<double>;
diff --git a/STDStream.h b/STDStream.h
new file mode 100644
index 0000000..2088501
--- /dev/null
+++ b/STDStream.h
@@ -0,0 +1,42 @@
+// Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
+//
+// For full license terms please see the LICENSE file distributed with this
+// source code
+
+#pragma once
+
+#include <iostream>
+#include <stdexcept>
+#include "Stream.h"
+
+#define IMPLEMENTATION_STRING "STD"
+
+// Stream implementation whose kernels are C++ standard algorithms run with
+// an execution policy (see STDStream.cpp). It works on caller-supplied buffers.
+template <class T>
+class STDStream : public Stream<T>
+{
+  protected:
+    // Size of arrays
+    unsigned int array_size;
+
+    // Pointers to the three arrays handed to the constructor
+    T *a;
+    T *b;
+    T *c;
+
+  public:
+    // Arguments: array size, buffers a, b, c, device index (unused).
+    // noexcept must match the out-of-line definition in STDStream.cpp.
+    STDStream(const unsigned int, T*, T*, T*, int) noexcept;
+    ~STDStream() = default;
+
+    virtual void copy() override;
+    virtual void add() override;
+    virtual void mul() override;
+    virtual void triad() override;
+    virtual T dot() override;
+
+    virtual void init_arrays(T initA, T initB, T initC) override;
+    virtual void read_arrays(std::vector<T>& a, std::vector<T>& b, std::vector<T>& c) override;
+};
diff --git a/main.cpp b/main.cpp
index f006f8c..d1031cf 100644
--- a/main.cpp
+++ b/main.cpp
@@ -21,6 +21,8 @@
 
 #if defined(CUDA)
 #include "CUDAStream.h"
+#elif defined(STD)
+#include "STDStream.h"
 #elif defined(HIP)
 #include "HIPStream.h"
 #elif defined(HC)
@@ -162,6 +164,10 @@ void run()
   // Use the Kokkos implementation
   stream = new KokkosStream<T>(ARRAY_SIZE, deviceIndex);
 
+#elif defined(STD)
+  // Use the STD implementation
+  stream = new STDStream<T>(ARRAY_SIZE, a.data(), b.data(), c.data(), deviceIndex);
+
 #elif defined(ACC)
   // Use the OpenACC implementation
   stream = new ACCStream<T>(ARRAY_SIZE, a.data(), b.data(), c.data(), deviceIndex);
@@ -363,6 +369,10 @@ void run_triad()
   // Use the OpenACC implementation
   stream = new ACCStream<T>(ARRAY_SIZE, a.data(), b.data(), c.data(), deviceIndex);
 
+#elif defined(STD)
+  // Use the STD implementation
+  stream = new STDStream<T>(ARRAY_SIZE, a.data(), b.data(), c.data(), deviceIndex);
+
 #elif defined(SYCL)
   // Use the SYCL implementation
   stream = new SYCLStream<T>(ARRAY_SIZE, deviceIndex);