diff --git a/KOKKOSStream.cpp b/KOKKOSStream.cpp new file mode 100644 index 0000000..d93b6d7 --- /dev/null +++ b/KOKKOSStream.cpp @@ -0,0 +1,145 @@ +// Copyright (c) 2015-16 Tom Deakin, Simon McIntosh-Smith, +// University of Bristol HPC +// +// For full license terms please see the LICENSE file distributed with this +// source code + + +#include "KOKKOSStream.hpp" + +using namespace Kokkos; + +template +KOKKOSStream::KOKKOSStream( + const unsigned int ARRAY_SIZE, const int device_index) + : array_size(ARRAY_SIZE) +{ + Kokkos::initialize(); + + d_a = new View("d_a", ARRAY_SIZE); + d_b = new View("d_b", ARRAY_SIZE); + d_c = new View("d_c", ARRAY_SIZE); + hm_a = new View::HostMirror(); + hm_b = new View::HostMirror(); + hm_c = new View::HostMirror(); + *hm_a = create_mirror_view(*d_a); + *hm_b = create_mirror_view(*d_b); + *hm_c = create_mirror_view(*d_c); +} + +template +KOKKOSStream::~KOKKOSStream() +{ + finalize(); +} + +template +void KOKKOSStream::write_arrays( + const std::vector& a, const std::vector& b, const std::vector& c) +{ + for(int ii = 0; ii < array_size; ++ii) + { + (*hm_a)(ii) = a[ii]; + (*hm_b)(ii) = b[ii]; + (*hm_c)(ii) = c[ii]; + } + deep_copy(*d_a, *hm_a); + deep_copy(*d_b, *hm_b); + deep_copy(*d_c, *hm_c); +} + +template +void KOKKOSStream::read_arrays( + std::vector& a, std::vector& b, std::vector& c) +{ + deep_copy(*hm_a, *d_a); + deep_copy(*hm_b, *d_b); + deep_copy(*hm_c, *d_c); + for(int ii = 0; ii < array_size; ++ii) + { + a[ii] = (*hm_a)(ii); + b[ii] = (*hm_b)(ii); + c[ii] = (*hm_c)(ii); + } +} + +template +void KOKKOSStream::copy() +{ + View a(*d_a); + View b(*d_b); + View c(*d_c); + + parallel_for(array_size, KOKKOS_LAMBDA (const int index) + { + c[index] = a[index]; + }); + cudaDeviceSynchronize(); +} + +template +void KOKKOSStream::mul() +{ + View a(*d_a); + View b(*d_b); + View c(*d_c); + + const T scalar = 3.0; + parallel_for(array_size, KOKKOS_LAMBDA (const int index) + { + b[index] = scalar*c[index]; + }); + cudaDeviceSynchronize(); +} + +template +void KOKKOSStream::add() +{ + View a(*d_a); + View b(*d_b); + View c(*d_c); + + parallel_for(array_size, KOKKOS_LAMBDA (const int index) + { + c[index] = a[index] + b[index]; + }); + + cudaDeviceSynchronize(); +} + +template +void KOKKOSStream::triad() +{ + View a(*d_a); + View b(*d_b); + View c(*d_c); + + const T scalar = 3.0; + parallel_for(array_size, KOKKOS_LAMBDA (const int index) + { + a[index] = b[index] + scalar*c[index]; + }); + + cudaDeviceSynchronize(); +} + +void listDevices(void) +{ + std::cout << "This is not the device you are looking for."; +} + + +std::string getDeviceName(const int device) +{ + return "Kokkos"; +} + + +std::string getDeviceDriver(const int device) +{ + return "Kokkos"; +} + +//template class KOKKOSStream; +template class KOKKOSStream; + diff --git a/KOKKOSStream.hpp b/KOKKOSStream.hpp new file mode 100644 index 0000000..d2b9665 --- /dev/null +++ b/KOKKOSStream.hpp @@ -0,0 +1,56 @@ +// Copyright (c) 2015-16 Tom Deakin, Simon McIntosh-Smith, +// University of Bristol HPC +// +// For full license terms please see the LICENSE file distributed with this +// source code + +#pragma once + +#include +#include + +#include +#include +#include + +#include "Stream.h" + +#define IMPLEMENTATION_STRING "KOKKOS" + +#ifdef KOKKOS_TARGET_CPU + #define DEVICE Kokkos::OpenMP +#else + #define DEVICE Kokkos::Cuda +#endif + +template +class KOKKOSStream : public Stream +{ + protected: + // Size of arrays + unsigned int array_size; + + // Device side pointers to arrays + Kokkos::View* d_a; + Kokkos::View* d_b; + Kokkos::View* d_c; + Kokkos::View::HostMirror* hm_a; + Kokkos::View::HostMirror* hm_b; + Kokkos::View::HostMirror* hm_c; + + public: + + KOKKOSStream(const unsigned int, const int); + ~KOKKOSStream(); + + virtual void copy() override; + virtual void add() override; + virtual void mul() override; + virtual void triad() override; + + virtual void write_arrays( + const std::vector& a, const std::vector& b, const std::vector& c) override; + virtual void read_arrays( + std::vector& a, std::vector& b, std::vector& c) override; +}; + diff --git a/RAJAStream.cpp b/RAJAStream.cpp new file mode 100644 index 0000000..eb98d54 --- /dev/null +++ b/RAJAStream.cpp @@ -0,0 +1,131 @@ + +// Copyright (c) 2015-16 Tom Deakin, Simon McIntosh-Smith, +// University of Bristol HPC +// +// For full license terms please see the LICENSE file distributed with this +// source code + +#include "RAJAStream.hpp" + +using RAJA::forall; +using RAJA::RangeSegment; + +template +RAJAStream::RAJAStream(const unsigned int ARRAY_SIZE, const int device_index) + : array_size(ARRAY_SIZE) +{ + RangeSegment seg(0, ARRAY_SIZE); + index_set.push_back(seg); + +#ifdef RAJA_TARGET_CPU + d_a = new T[ARRAY_SIZE]; + d_b = new T[ARRAY_SIZE]; + d_c = new T[ARRAY_SIZE]; +#else + cudaMallocManaged((void**)&d_a, sizeof(T)*ARRAY_SIZE, cudaMemAttachGlobal); + cudaMallocManaged((void**)&d_b, sizeof(T)*ARRAY_SIZE, cudaMemAttachGlobal); + cudaMallocManaged((void**)&d_c, sizeof(T)*ARRAY_SIZE, cudaMemAttachGlobal); + cudaDeviceSynchronize(); +#endif +} + +template +RAJAStream::~RAJAStream() +{ +#ifdef RAJA_TARGET_CPU + delete[] d_a; + delete[] d_b; + delete[] d_c; +#else + cudaFree(d_a); + cudaFree(d_b); + cudaFree(d_c); +#endif +} + +template +void RAJAStream::write_arrays( + const std::vector& a, const std::vector& b, const std::vector& c) +{ + std::copy(a.begin(), a.end(), d_a); + std::copy(b.begin(), b.end(), d_b); + std::copy(c.begin(), c.end(), d_c); +} + +template +void RAJAStream::read_arrays( + std::vector& a, std::vector& b, std::vector& c) +{ + std::copy(d_a, d_a + array_size, a.data()); + std::copy(d_b, d_b + array_size, b.data()); + std::copy(d_c, d_c + array_size, c.data()); +} + +template +void RAJAStream::copy() +{ + T* a = d_a; + T* c = d_c; + forall(index_set, [=] RAJA_DEVICE (int index) + { + c[index] = a[index]; + }); +} + +template +void RAJAStream::mul() +{ + T* b = d_b; + T* c = d_c; + const T scalar = 3.0; + forall(index_set, [=] RAJA_DEVICE (int index) + { + b[index] = scalar*c[index]; + }); +} + +template +void RAJAStream::add() +{ + T* a = d_a; + T* b = d_b; + T* c = d_c; + forall(index_set, [=] RAJA_DEVICE (int index) + { + c[index] = a[index] + b[index]; + }); +} + +template +void RAJAStream::triad() +{ + T* a = d_a; + T* b = d_b; + T* c = d_c; + const T scalar = 3.0; + forall(index_set, [=] RAJA_DEVICE (int index) + { + a[index] = b[index] + scalar*c[index]; + }); +} + +void listDevices(void) +{ + std::cout << "This is not the device you are looking for."; +} + + +std::string getDeviceName(const int device) +{ + return "RAJA"; +} + + +std::string getDeviceDriver(const int device) +{ + return "RAJA"; +} + +template class RAJAStream; +template class RAJAStream; + diff --git a/RAJAStream.hpp b/RAJAStream.hpp new file mode 100644 index 0000000..454e20e --- /dev/null +++ b/RAJAStream.hpp @@ -0,0 +1,58 @@ +// Copyright (c) 2015-16 Tom Deakin, Simon McIntosh-Smith, +// University of Bristol HPC +// +// For full license terms please see the LICENSE file distributed with this +// source code + +#pragma once + +#include +#include +#include "RAJA/RAJA.hxx" + +#include "Stream.h" + +#define IMPLEMENTATION_STRING "RAJA" + +#ifdef RAJA_TARGET_CPU +typedef RAJA::IndexSet::ExecPolicy< + RAJA::seq_segit, + RAJA::omp_parallel_for_exec> policy; +#else +const size_t block_size = 128; +typedef RAJA::IndexSet::ExecPolicy< + RAJA::seq_segit, + RAJA::cuda_exec> policy; +#endif + +template +class RAJAStream : public Stream +{ + protected: + // Size of arrays + unsigned int array_size; + + // Contains iteration space + RAJA::IndexSet index_set; + + // Device side pointers to arrays + T* d_a; + T* d_b; + T* d_c; + + public: + + RAJAStream(const unsigned int, const int); + ~RAJAStream(); + + virtual void copy() override; + virtual void add() override; + virtual void mul() override; + virtual void triad() override; + + virtual void write_arrays( + const std::vector& a, const std::vector& b, const std::vector& c) override; + virtual void read_arrays( + std::vector& a, std::vector& b, std::vector& c) override; +}; + diff --git a/main.cpp b/main.cpp index e0911d2..007ab7f 100644 --- a/main.cpp +++ b/main.cpp @@ -22,6 +22,10 @@ #include "CUDAStream.h" #elif defined(OCL) #include "OCLStream.h" +#elif defined(USE_RAJA) +#include "RAJAStream.hpp" +#elif defined(KOKKOS) +#include "KOKKOSStream.hpp" #elif defined(ACC) #include "ACCStream.h" #elif defined(SYCL) @@ -30,7 +34,6 @@ #include "OMP3Stream.h" #endif - unsigned int ARRAY_SIZE = 52428800; unsigned int num_times = 10; unsigned int deviceIndex = 0; @@ -56,9 +59,11 @@ int main(int argc, char *argv[]) // TODO: Fix SYCL to allow multiple template specializations #ifndef SYCL +#ifndef KOKKOS if (use_float) run(); else +#endif #endif run(); @@ -89,6 +94,14 @@ void run() // Use the OpenCL implementation stream = new OCLStream(ARRAY_SIZE, deviceIndex); +#elif defined(USE_RAJA) + // Use the RAJA implementation + stream = new RAJAStream(ARRAY_SIZE, deviceIndex); + +#elif defined(KOKKOS) + // Use the Kokkos implementation + stream = new KOKKOSStream(ARRAY_SIZE, deviceIndex); + #elif defined(ACC) // Use the OpenACC implementation stream = new ACCStream(ARRAY_SIZE, a.data(), b.data(), c.data(), deviceIndex); @@ -101,7 +114,6 @@ void run() // Use the "reference" OpenMP 3 implementation stream = new OMP3Stream(ARRAY_SIZE, a.data(), b.data(), c.data()); - #endif stream->write_arrays(a, b, c);