From 7c28a6386b1905e154815afcfc0f26e1458aaa3d Mon Sep 17 00:00:00 2001 From: Matthew Martineau Date: Thu, 5 May 2016 17:22:29 +0100 Subject: [PATCH 1/7] Added the Kokkos and RAJA implementations --- main.cpp | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) diff --git a/main.cpp b/main.cpp index 96f4e5c..1a92ebc 100644 --- a/main.cpp +++ b/main.cpp @@ -22,9 +22,12 @@ #include "CUDAStream.h" #elif defined(OCL) #include "OCLStream.h" +#elif defined(RAJA) +#include "RAJAStream.hpp" +#elif defined(KOKKOS) +#include "KOKKOSStream.hpp" #endif - unsigned int ARRAY_SIZE = 52428800; unsigned int num_times = 10; unsigned int deviceIndex = 0; @@ -80,6 +83,14 @@ void run() // Use the OpenCL implementation stream = new OCLStream(ARRAY_SIZE, deviceIndex); +#elif defined(RAJA) + // Use the RAJA implementation + stream = new RAJAStream(ARRAY_SIZE, deviceIndex); + +#elif defined(KOKKOS) + // Use the Kokkos implementation + stream = new KOKKOSStream(ARRAY_SIZE, deviceIndex); + #endif stream->write_arrays(a, b, c); From 45381da0b2cc473ad6b562394c422a3176b21f83 Mon Sep 17 00:00:00 2001 From: Matthew Martineau Date: Fri, 6 May 2016 10:46:35 +0100 Subject: [PATCH 2/7] Initial commit of in progress developments of RAJA and KOKKOS stream --- KOKKOSStream.cpp | 123 +++++++++++++++++++++++++++++++++++++++++++++++ KOKKOSStream.hpp | 53 ++++++++++++++++++++ RAJAStream.cpp | 105 ++++++++++++++++++++++++++++++++++++++++ RAJAStream.hpp | 58 ++++++++++++++++++++++ 4 files changed, 339 insertions(+) create mode 100644 KOKKOSStream.cpp create mode 100644 KOKKOSStream.hpp create mode 100644 RAJAStream.cpp create mode 100644 RAJAStream.hpp diff --git a/KOKKOSStream.cpp b/KOKKOSStream.cpp new file mode 100644 index 0000000..c4d548b --- /dev/null +++ b/KOKKOSStream.cpp @@ -0,0 +1,123 @@ +// Copyright (c) 2015-16 Tom Deakin, Simon McIntosh-Smith, +// University of Bristol HPC +// +// For full license terms please see the LICENSE file distributed with this +// source code + + +#include "KOKKOSStream.hpp" + +using Kokkos::parallel_for; + +template +KOKKOSStream::KOKKOSStream( + const unsigned int ARRAY_SIZE, const int device_index) + : array_size(ARRAY_SIZE) +{ + Kokkos::initialize(); + + new(d_a) Kokkos::View("d_a", ARRAY_SIZE); + new(d_b) Kokkos::View("d_b", ARRAY_SIZE); + new(d_c) Kokkos::View("d_c", ARRAY_SIZE); + new(hm_a) Kokkos::View::HostMirror(); + new(hm_b) Kokkos::View::HostMirror(); + new(hm_c) Kokkos::View::HostMirror(); + hm_a = Kokkos::create_mirror_view(d_a); + hm_b = Kokkos::create_mirror_view(d_b); + hm_c = Kokkos::create_mirror_view(d_c); +} + +template +KOKKOSStream::~KOKKOSStream() +{ + Kokkos::finalize(); +} + +template +void KOKKOSStream::write_arrays( + const std::vector& a, const std::vector& b, const std::vector& c) +{ + for(int ii = 0; ii < array_size; ++ii) + { + hm_a(ii) = a[ii]; + hm_b(ii) = b[ii]; + hm_c(ii) = c[ii]; + } + Kokkos::deep_copy(hm_a, d_a); + Kokkos::deep_copy(hm_b, d_b); + Kokkos::deep_copy(hm_c, d_c); +} + +template +void KOKKOSStream::read_arrays( + std::vector& a, std::vector& b, std::vector& c) +{ + Kokkos::deep_copy(d_a, hm_a); + Kokkos::deep_copy(d_a, hm_b); + Kokkos::deep_copy(d_a, hm_c); + for(int ii = 0; ii < array_size; ++ii) + { + a[ii] = hm_a(ii); + b[ii] = hm_b(ii); + c[ii] = hm_c(ii); + } +} + +template +void KOKKOSStream::copy() +{ + Kokkos::parallel_for(array_size, KOKKOS_LAMBDA (const int index) + { + d_c[index] = d_a[index]; + }); +} + +template +void KOKKOSStream::mul() +{ + const T scalar = 3.0; + parallel_for(array_size, KOKKOS_LAMBDA (const int index) + { + d_b[index] = scalar*d_c[index]; + }); +} + +template +void KOKKOSStream::add() +{ + parallel_for(array_size, KOKKOS_LAMBDA (const int index) + { + d_c[index] = d_a[index] + d_b[index]; + }); +} + +template +void KOKKOSStream::triad() +{ + const T scalar = 3.0; + parallel_for(array_size, KOKKOS_LAMBDA (const int index) + { + d_a[index] = d_b[index] + scalar*d_c[index]; + }); +} + +void listDevices(void) +{ + std::cout << "This is not the device you are looking for."; +} + + +std::string getDeviceName(const int device) +{ + return "Kokkos"; +} + + +std::string getDeviceDriver(const int device) +{ + return "Kokkos"; +} + +template class KOKKOSStream; +template class KOKKOSStream; + diff --git a/KOKKOSStream.hpp b/KOKKOSStream.hpp new file mode 100644 index 0000000..632ca20 --- /dev/null +++ b/KOKKOSStream.hpp @@ -0,0 +1,53 @@ +// Copyright (c) 2015-16 Tom Deakin, Simon McIntosh-Smith, +// University of Bristol HPC +// +// For full license terms please see the LICENSE file distributed with this +// source code + +#pragma once + +#include +#include + +#include +#include +#include + +#include "Stream.h" + +#define IMPLEMENTATION_STRING "KOKKOS" + +#define DEVICE Kokkos::OpenMP + + +template +class KOKKOSStream : public Stream +{ + protected: + // Size of arrays + unsigned int array_size; + + // Device side pointers to arrays + Kokkos::View d_a; + Kokkos::View d_b; + Kokkos::View d_c; + Kokkos::View::HostMirror hm_a; + Kokkos::View::HostMirror hm_b; + Kokkos::View::HostMirror hm_c; + + public: + + KOKKOSStream(const unsigned int, const int); + ~KOKKOSStream(); + + virtual void copy() override; + virtual void add() override; + virtual void mul() override; + virtual void triad() override; + + virtual void write_arrays( + const std::vector& a, const std::vector& b, const std::vector& c) override; + virtual void read_arrays( + std::vector& a, std::vector& b, std::vector& c) override; +}; + diff --git a/RAJAStream.cpp b/RAJAStream.cpp new file mode 100644 index 0000000..5b1c980 --- /dev/null +++ b/RAJAStream.cpp @@ -0,0 +1,105 @@ + +// Copyright (c) 2015-16 Tom Deakin, Simon McIntosh-Smith, +// University of Bristol HPC +// +// For full license terms please see the LICENSE file distributed with this +// source code + +#include "RAJAStream.hpp" + +using RAJA::forall; +using RAJA::RangeSegment; + +template +RAJAStream::RAJAStream(const unsigned int ARRAY_SIZE, const int device_index) + : array_size(ARRAY_SIZE) +{ + RangeSegment seg(0, ARRAY_SIZE); + index_set.push_back(seg); + d_a = new T[ARRAY_SIZE]; + d_b = new T[ARRAY_SIZE]; + d_c = new T[ARRAY_SIZE]; +} + +template +RAJAStream::~RAJAStream() +{ + delete[] d_a; + delete[] d_b; + delete[] d_c; +} + +template +void RAJAStream::write_arrays(const std::vector& a, const std::vector& b, const std::vector& c) +{ + std::copy(a.begin(), a.end(), d_a); + std::copy(b.begin(), b.end(), d_b); + std::copy(c.begin(), c.end(), d_c); +} + +template +void RAJAStream::read_arrays(std::vector& a, std::vector& b, std::vector& c) +{ + std::copy(d_a, d_a + array_size - 1, a.data()); + std::copy(d_b, d_b + array_size - 1, b.data()); + std::copy(d_c, d_c + array_size - 1, c.data()); +} + +template +void RAJAStream::copy() +{ + forall(index_set, [=] RAJA_DEVICE (int index) + { + d_c[index] = d_a[index]; + }); +} + +template +void RAJAStream::mul() +{ + const T scalar = 3.0; + forall(index_set, [=] RAJA_DEVICE (int index) + { + d_b[index] = scalar*d_c[index]; + }); +} + +template +void RAJAStream::add() +{ + forall(index_set, [=] RAJA_DEVICE (int index) + { + d_c[index] = d_a[index] + d_b[index]; + }); +} + +template +void RAJAStream::triad() +{ + const T scalar = 3.0; + forall(index_set, [=] RAJA_DEVICE (int index) + { + d_a[index] = d_b[index] + scalar*d_c[index]; + }); +} + +void listDevices(void) +{ + std::cout << "This is not the device you are looking for."; +} + + +std::string getDeviceName(const int device) +{ + return "RAJA"; +} + + +std::string getDeviceDriver(const int device) +{ + return "RAJA"; +} + +template class RAJAStream; +template class RAJAStream; + diff --git a/RAJAStream.hpp b/RAJAStream.hpp new file mode 100644 index 0000000..a41c60e --- /dev/null +++ b/RAJAStream.hpp @@ -0,0 +1,58 @@ +// Copyright (c) 2015-16 Tom Deakin, Simon McIntosh-Smith, +// University of Bristol HPC +// +// For full license terms please see the LICENSE file distributed with this +// source code + +#pragma once + +#include +#include +#include "RAJA/RAJA.hxx" + +#include "Stream.h" + +#define IMPLEMENTATION_STRING "RAJA" + +#ifdef RAJA_USE_CUDA +const size_t block_size = 128; +typedef RAJA::IndexSet::ExecPolicy< + RAJA::seq_segit, + RAJA::cuda_exec_async> policy; +#else +typedef RAJA::IndexSet::ExecPolicy< + RAJA::seq_segit, + RAJA::omp_parallel_for_exec> policy; +#endif + +template +class RAJAStream : public Stream +{ + protected: + // Size of arrays + unsigned int array_size; + + // Contains iteration space + RAJA::IndexSet index_set; + + // Device side pointers to arrays + T* d_a; + T* d_b; + T* d_c; + + public: + + RAJAStream(const unsigned int, const int); + ~RAJAStream(); + + virtual void copy() override; + virtual void add() override; + virtual void mul() override; + virtual void triad() override; + + virtual void write_arrays( + const std::vector& a, const std::vector& b, const std::vector& c) override; + virtual void read_arrays( + std::vector& a, std::vector& b, std::vector& c) override; +}; + From 3b266b826610abbc8dd3ea3f94bf076a30a758ef Mon Sep 17 00:00:00 2001 From: Matthew Martineau Date: Fri, 6 May 2016 10:51:35 +0100 Subject: [PATCH 3/7] Fix for namespace collision with #define RAJA --- main.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/main.cpp b/main.cpp index 5d3a559..f67fdd6 100644 --- a/main.cpp +++ b/main.cpp @@ -22,7 +22,7 @@ #include "CUDAStream.h" #elif defined(OCL) #include "OCLStream.h" -#elif defined(RAJA) +#elif defined(USE_RAJA) #include "RAJAStream.hpp" #elif defined(KOKKOS) #include "KOKKOSStream.hpp" @@ -89,7 +89,7 @@ void run() // Use the OpenCL implementation stream = new OCLStream(ARRAY_SIZE, deviceIndex); -#elif defined(RAJA) +#elif defined(USE_RAJA) // Use the RAJA implementation stream = new RAJAStream(ARRAY_SIZE, deviceIndex); From 1a60f130eba160091f5cd5915caeca9ed4e536f9 Mon Sep 17 00:00:00 2001 From: Matthew Martineau Date: Fri, 6 May 2016 13:17:04 +0100 Subject: [PATCH 4/7] Fixed memory management for GPU, now working with OpenMP and CUDA --- RAJAStream.cpp | 44 +++++++++++++++++++++++++++++++++++--------- RAJAStream.hpp | 12 ++++++------ 2 files changed, 41 insertions(+), 15 deletions(-) diff --git a/RAJAStream.cpp b/RAJAStream.cpp index 5b1c980..eb98d54 100644 --- a/RAJAStream.cpp +++ b/RAJAStream.cpp @@ -16,21 +16,36 @@ RAJAStream::RAJAStream(const unsigned int ARRAY_SIZE, const int device_index) { RangeSegment seg(0, ARRAY_SIZE); index_set.push_back(seg); + +#ifdef RAJA_TARGET_CPU d_a = new T[ARRAY_SIZE]; d_b = new T[ARRAY_SIZE]; d_c = new T[ARRAY_SIZE]; +#else + cudaMallocManaged((void**)&d_a, sizeof(T)*ARRAY_SIZE, cudaMemAttachGlobal); + cudaMallocManaged((void**)&d_b, sizeof(T)*ARRAY_SIZE, cudaMemAttachGlobal); + cudaMallocManaged((void**)&d_c, sizeof(T)*ARRAY_SIZE, cudaMemAttachGlobal); + cudaDeviceSynchronize(); +#endif } template RAJAStream::~RAJAStream() { +#ifdef RAJA_TARGET_CPU delete[] d_a; delete[] d_b; delete[] d_c; +#else + cudaFree(d_a); + cudaFree(d_b); + cudaFree(d_c); +#endif } template -void RAJAStream::write_arrays(const std::vector& a, const std::vector& b, const std::vector& c) +void RAJAStream::write_arrays( + const std::vector& a, const std::vector& b, const std::vector& c) { std::copy(a.begin(), a.end(), d_a); std::copy(b.begin(), b.end(), d_b); @@ -38,48 +53,59 @@ void RAJAStream::write_arrays(const std::vector& a, const std::vector& } template -void RAJAStream::read_arrays(std::vector& a, std::vector& b, std::vector& c) +void RAJAStream::read_arrays( + std::vector& a, std::vector& b, std::vector& c) { - std::copy(d_a, d_a + array_size - 1, a.data()); - std::copy(d_b, d_b + array_size - 1, b.data()); - std::copy(d_c, d_c + array_size - 1, c.data()); + std::copy(d_a, d_a + array_size, a.data()); + std::copy(d_b, d_b + array_size, b.data()); + std::copy(d_c, d_c + array_size, c.data()); } template void RAJAStream::copy() { + T* a = d_a; + T* c = d_c; forall(index_set, [=] RAJA_DEVICE (int index) { - d_c[index] = d_a[index]; + c[index] = a[index]; }); } template void RAJAStream::mul() { + T* b = d_b; + T* c = d_c; const T scalar = 3.0; forall(index_set, [=] RAJA_DEVICE (int index) { - d_b[index] = scalar*d_c[index]; + b[index] = scalar*c[index]; }); } template void RAJAStream::add() { + T* a = d_a; + T* b = d_b; + T* c = d_c; forall(index_set, [=] RAJA_DEVICE (int index) { - d_c[index] = d_a[index] + d_b[index]; + c[index] = a[index] + b[index]; }); } template void RAJAStream::triad() { + T* a = d_a; + T* b = d_b; + T* c = d_c; const T scalar = 3.0; forall(index_set, [=] RAJA_DEVICE (int index) { - d_a[index] = d_b[index] + scalar*d_c[index]; + a[index] = b[index] + scalar*c[index]; }); } diff --git a/RAJAStream.hpp b/RAJAStream.hpp index a41c60e..454e20e 100644 --- a/RAJAStream.hpp +++ b/RAJAStream.hpp @@ -14,15 +14,15 @@ #define IMPLEMENTATION_STRING "RAJA" -#ifdef RAJA_USE_CUDA -const size_t block_size = 128; -typedef RAJA::IndexSet::ExecPolicy< - RAJA::seq_segit, - RAJA::cuda_exec_async> policy; -#else +#ifdef RAJA_TARGET_CPU typedef RAJA::IndexSet::ExecPolicy< RAJA::seq_segit, RAJA::omp_parallel_for_exec> policy; +#else +const size_t block_size = 128; +typedef RAJA::IndexSet::ExecPolicy< + RAJA::seq_segit, + RAJA::cuda_exec> policy; #endif template From 894829cb05143c908923a831e9f0d13072ef640a Mon Sep 17 00:00:00 2001 From: Matthew Martineau Date: Fri, 6 May 2016 21:02:44 +0100 Subject: [PATCH 5/7] Adjusted the Kokkos implementation to fix view initialisation, and store local copies of views for lambda scoping --- KOKKOSStream.cpp | 74 +++++++++++++++++++++++++++++------------------- KOKKOSStream.hpp | 19 +++++++------ main.cpp | 2 ++ 3 files changed, 58 insertions(+), 37 deletions(-) diff --git a/KOKKOSStream.cpp b/KOKKOSStream.cpp index c4d548b..3834081 100644 --- a/KOKKOSStream.cpp +++ b/KOKKOSStream.cpp @@ -7,7 +7,7 @@ #include "KOKKOSStream.hpp" -using Kokkos::parallel_for; +using namespace Kokkos; template KOKKOSStream::KOKKOSStream( @@ -16,21 +16,21 @@ KOKKOSStream::KOKKOSStream( { Kokkos::initialize(); - new(d_a) Kokkos::View("d_a", ARRAY_SIZE); - new(d_b) Kokkos::View("d_b", ARRAY_SIZE); - new(d_c) Kokkos::View("d_c", ARRAY_SIZE); - new(hm_a) Kokkos::View::HostMirror(); - new(hm_b) Kokkos::View::HostMirror(); - new(hm_c) Kokkos::View::HostMirror(); - hm_a = Kokkos::create_mirror_view(d_a); - hm_b = Kokkos::create_mirror_view(d_b); - hm_c = Kokkos::create_mirror_view(d_c); + d_a = new View("d_a", ARRAY_SIZE); + d_b = new View("d_b", ARRAY_SIZE); + d_c = new View("d_c", ARRAY_SIZE); + hm_a = new View::HostMirror(); + hm_b = new View::HostMirror(); + hm_c = new View::HostMirror(); + *hm_a = create_mirror_view(*d_a); + *hm_b = create_mirror_view(*d_b); + *hm_c = create_mirror_view(*d_c); } template KOKKOSStream::~KOKKOSStream() { - Kokkos::finalize(); + finalize(); } template @@ -39,65 +39,81 @@ void KOKKOSStream::write_arrays( { for(int ii = 0; ii < array_size; ++ii) { - hm_a(ii) = a[ii]; - hm_b(ii) = b[ii]; - hm_c(ii) = c[ii]; + (*hm_a)(ii) = a[ii]; + (*hm_b)(ii) = b[ii]; + (*hm_c)(ii) = c[ii]; } - Kokkos::deep_copy(hm_a, d_a); - Kokkos::deep_copy(hm_b, d_b); - Kokkos::deep_copy(hm_c, d_c); + deep_copy(*hm_a, *d_a); + deep_copy(*hm_b, *d_b); + deep_copy(*hm_c, *d_c); } template void KOKKOSStream::read_arrays( std::vector& a, std::vector& b, std::vector& c) { - Kokkos::deep_copy(d_a, hm_a); - Kokkos::deep_copy(d_a, hm_b); - Kokkos::deep_copy(d_a, hm_c); + deep_copy(*d_a, *hm_a); + deep_copy(*d_b, *hm_b); + deep_copy(*d_c, *hm_c); for(int ii = 0; ii < array_size; ++ii) { - a[ii] = hm_a(ii); - b[ii] = hm_b(ii); - c[ii] = hm_c(ii); + a[ii] = (*hm_a)(ii); + b[ii] = (*hm_b)(ii); + c[ii] = (*hm_c)(ii); } } template void KOKKOSStream::copy() { - Kokkos::parallel_for(array_size, KOKKOS_LAMBDA (const int index) + View a(*d_a); + View b(*d_b); + View c(*d_c); + + parallel_for(array_size, KOKKOS_LAMBDA (const int index) { - d_c[index] = d_a[index]; + c[index] = a[index]; }); } template void KOKKOSStream::mul() { + View a(*d_a); + View b(*d_b); + View c(*d_c); + const T scalar = 3.0; parallel_for(array_size, KOKKOS_LAMBDA (const int index) { - d_b[index] = scalar*d_c[index]; + b[index] = scalar*c[index]; }); } template void KOKKOSStream::add() { + View a(*d_a); + View b(*d_b); + View c(*d_c); + parallel_for(array_size, KOKKOS_LAMBDA (const int index) { - d_c[index] = d_a[index] + d_b[index]; + c[index] = a[index] + b[index]; }); } template void KOKKOSStream::triad() { + View a(*d_a); + View b(*d_b); + View c(*d_c); + const T scalar = 3.0; parallel_for(array_size, KOKKOS_LAMBDA (const int index) { - d_a[index] = d_b[index] + scalar*d_c[index]; + a[index] = b[index] + scalar*c[index]; }); } @@ -118,6 +134,6 @@ std::string getDeviceDriver(const int device) return "Kokkos"; } -template class KOKKOSStream; +//template class KOKKOSStream; template class KOKKOSStream; diff --git a/KOKKOSStream.hpp b/KOKKOSStream.hpp index 632ca20..d2b9665 100644 --- a/KOKKOSStream.hpp +++ b/KOKKOSStream.hpp @@ -17,8 +17,11 @@ #define IMPLEMENTATION_STRING "KOKKOS" -#define DEVICE Kokkos::OpenMP - +#ifdef KOKKOS_TARGET_CPU + #define DEVICE Kokkos::OpenMP +#else + #define DEVICE Kokkos::Cuda +#endif template class KOKKOSStream : public Stream @@ -28,12 +31,12 @@ class KOKKOSStream : public Stream unsigned int array_size; // Device side pointers to arrays - Kokkos::View d_a; - Kokkos::View d_b; - Kokkos::View d_c; - Kokkos::View::HostMirror hm_a; - Kokkos::View::HostMirror hm_b; - Kokkos::View::HostMirror hm_c; + Kokkos::View* d_a; + Kokkos::View* d_b; + Kokkos::View* d_c; + Kokkos::View::HostMirror* hm_a; + Kokkos::View::HostMirror* hm_b; + Kokkos::View::HostMirror* hm_c; public: diff --git a/main.cpp b/main.cpp index 4794f9b..007ab7f 100644 --- a/main.cpp +++ b/main.cpp @@ -59,9 +59,11 @@ int main(int argc, char *argv[]) // TODO: Fix SYCL to allow multiple template specializations #ifndef SYCL +#ifndef KOKKOS if (use_float) run(); else +#endif #endif run(); From 0f0454ec295b3c9fe2ada150534fb5d2e332ccce Mon Sep 17 00:00:00 2001 From: Matthew Martineau Date: Fri, 6 May 2016 21:05:20 +0100 Subject: [PATCH 6/7] Added CUDA device syncs to force proper timing --- KOKKOSStream.cpp | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/KOKKOSStream.cpp b/KOKKOSStream.cpp index 3834081..0c3f44c 100644 --- a/KOKKOSStream.cpp +++ b/KOKKOSStream.cpp @@ -74,6 +74,7 @@ void KOKKOSStream::copy() { c[index] = a[index]; }); + cudaDeviceSynchronize(); } template @@ -88,6 +89,7 @@ void KOKKOSStream::mul() { b[index] = scalar*c[index]; }); + cudaDeviceSynchronize(); } template @@ -101,6 +103,8 @@ void KOKKOSStream::add() { c[index] = a[index] + b[index]; }); + + cudaDeviceSynchronize(); } template @@ -115,6 +119,8 @@ void KOKKOSStream::triad() { a[index] = b[index] + scalar*c[index]; }); + + cudaDeviceSynchronize(); } void listDevices(void) From 6e9b85bb26221114c18f53606aed5ae6fd387c0a Mon Sep 17 00:00:00 2001 From: Matthew Martineau Date: Fri, 6 May 2016 21:08:23 +0100 Subject: [PATCH 7/7] Fixed deep copy ordering, which was reversed --- KOKKOSStream.cpp | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/KOKKOSStream.cpp b/KOKKOSStream.cpp index 0c3f44c..d93b6d7 100644 --- a/KOKKOSStream.cpp +++ b/KOKKOSStream.cpp @@ -43,18 +43,18 @@ void KOKKOSStream::write_arrays( (*hm_b)(ii) = b[ii]; (*hm_c)(ii) = c[ii]; } - deep_copy(*hm_a, *d_a); - deep_copy(*hm_b, *d_b); - deep_copy(*hm_c, *d_c); + deep_copy(*d_a, *hm_a); + deep_copy(*d_b, *hm_b); + deep_copy(*d_c, *hm_c); } template void KOKKOSStream::read_arrays( std::vector& a, std::vector& b, std::vector& c) { - deep_copy(*d_a, *hm_a); - deep_copy(*d_b, *hm_b); - deep_copy(*d_c, *hm_c); + deep_copy(*hm_a, *d_a); + deep_copy(*hm_b, *d_b); + deep_copy(*hm_c, *d_c); for(int ii = 0; ii < array_size; ++ii) { a[ii] = (*hm_a)(ii);