diff --git a/ACCStream.cpp b/ACCStream.cpp
index 450854f..0e591a8 100644
--- a/ACCStream.cpp
+++ b/ACCStream.cpp
@@ -36,13 +36,19 @@ ACCStream<T>::~ACCStream()
 }
 
 template <class T>
-void ACCStream<T>::write_arrays(const std::vector<T>& h_a, const std::vector<T>& h_b, const std::vector<T>& h_c)
+void ACCStream<T>::init_arrays(T initA, T initB, T initC)
 {
-  T *a = this->a;
-  T *b = this->b;
-  T *c = this->c;
-  #pragma acc update device(a[0:array_size], b[0:array_size], c[0:array_size])
-  {}
+  unsigned int array_size = this->array_size;
+  T * restrict a = this->a;
+  T * restrict b = this->b;
+  T * restrict c = this->c;
+  #pragma acc kernels present(a[0:array_size], b[0:array_size], c[0:array_size]) wait
+  for (int i = 0; i < array_size; i++)
+  {
+    a[i] = initA;
+    b[i] = initB;
+    c[i] = initC;
+  }
 }
 
 template <class T>
diff --git a/ACCStream.h b/ACCStream.h
index 09559e2..8f13ed7 100644
--- a/ACCStream.h
+++ b/ACCStream.h
@@ -37,7 +37,7 @@ class ACCStream : public Stream<T>
   virtual void triad() override;
   virtual T dot() override;
 
-  virtual void write_arrays(const std::vector<T>& a, const std::vector<T>& b, const std::vector<T>& c) override;
+  virtual void init_arrays(T initA, T initB, T initC) override;
   virtual void read_arrays(std::vector<T>& a, std::vector<T>& b, std::vector<T>& c) override;
 
diff --git a/CUDAStream.cu b/CUDAStream.cu
index 5d5a510..043c8c7 100644
--- a/CUDAStream.cu
+++ b/CUDAStream.cu
@@ -79,15 +79,22 @@ CUDAStream<T>::~CUDAStream()
   check_error();
 }
 
-template <class T>
-void CUDAStream<T>::write_arrays(const std::vector<T>& a, const std::vector<T>& b, const std::vector<T>& c)
+
+template <typename T>
+__global__ void init_kernel(T * a, T * b, T * c, T initA, T initB, T initC)
 {
-  // Copy host memory to device
-  cudaMemcpy(d_a, a.data(), a.size()*sizeof(T), cudaMemcpyHostToDevice);
+  const int i = blockDim.x * blockIdx.x + threadIdx.x;
+  a[i] = initA;
+  b[i] = initB;
+  c[i] = initC;
+}
+
+template <class T>
+void CUDAStream<T>::init_arrays(T initA, T initB, T initC)
+{
+  init_kernel<<<array_size/TBSIZE, TBSIZE>>>(d_a, d_b, d_c, initA, initB, initC);
   check_error();
-  cudaMemcpy(d_b, b.data(), b.size()*sizeof(T), cudaMemcpyHostToDevice);
-  check_error();
-  cudaMemcpy(d_c, c.data(), c.size()*sizeof(T), cudaMemcpyHostToDevice);
+  cudaDeviceSynchronize();
   check_error();
 }
 
diff --git a/CUDAStream.h b/CUDAStream.h
index 8fcd6e5..0a0236b 100644
--- a/CUDAStream.h
+++ b/CUDAStream.h
@@ -46,7 +46,7 @@ class CUDAStream : public Stream<T>
   virtual void triad() override;
   virtual T dot() override;
 
-  virtual void write_arrays(const std::vector<T>& a, const std::vector<T>& b, const std::vector<T>& c) override;
+  virtual void init_arrays(T initA, T initB, T initC) override;
   virtual void read_arrays(std::vector<T>& a, std::vector<T>& b, std::vector<T>& c) override;
 
 };
diff --git a/HIPStream.cu b/HIPStream.cu
index 34ecfb6..8c02348 100644
--- a/HIPStream.cu
+++ b/HIPStream.cu
@@ -74,15 +74,21 @@ HIPStream<T>::~HIPStream()
   check_error();
 }
 
-template <class T>
-void HIPStream<T>::write_arrays(const std::vector<T>& a, const std::vector<T>& b, const std::vector<T>& c)
+template <typename T>
+__global__ void init_kernel(hipLaunchParm lp, T * a, T * b, T * c, T initA, T initB, T initC)
 {
-  // Copy host memory to device
-  hipMemcpy(d_a, a.data(), a.size()*sizeof(T), hipMemcpyHostToDevice);
+  const int i = hipBlockDim_x * hipBlockIdx_x + hipThreadIdx_x;
+  a[i] = initA;
+  b[i] = initB;
+  c[i] = initC;
+}
+
+template <class T>
+void HIPStream<T>::init_arrays(T initA, T initB, T initC)
+{
+  hipLaunchKernel(HIP_KERNEL_NAME(init_kernel), dim3(array_size/TBSIZE), dim3(TBSIZE), 0, 0, d_a, d_b, d_c, initA, initB, initC);
   check_error();
-  hipMemcpy(d_b, b.data(), b.size()*sizeof(T), hipMemcpyHostToDevice);
-  check_error();
-  hipMemcpy(d_c, c.data(), c.size()*sizeof(T), hipMemcpyHostToDevice);
+  hipDeviceSynchronize();
   check_error();
 }
 
diff --git a/HIPStream.h b/HIPStream.h
index 9015e35..392080a 100644
--- a/HIPStream.h
+++ b/HIPStream.h
@@ -37,7 +37,7 @@ class HIPStream : public Stream<T>
   virtual void mul() override;
   virtual void triad() override;
 
-  virtual void write_arrays(const std::vector<T>& a, const std::vector<T>& b, const std::vector<T>& c) override;
+  virtual void init_arrays(T initA, T initB, T initC) override;
   virtual void read_arrays(std::vector<T>& a, std::vector<T>& b, std::vector<T>& c) override;
 
 };
diff --git a/KOKKOSStream.cpp b/KOKKOSStream.cpp
index b522028..9391a13 100644
--- a/KOKKOSStream.cpp
+++ b/KOKKOSStream.cpp
@@ -34,18 +34,18 @@ KOKKOSStream<T>::~KOKKOSStream()
 }
 
 template <class T>
-void KOKKOSStream<T>::write_arrays(
-    const std::vector<T>& a, const std::vector<T>& b, const std::vector<T>& c)
+void KOKKOSStream<T>::init_arrays(T initA, T initB, T initC)
 {
-  for(int ii = 0; ii < array_size; ++ii)
+  View<double*, DEVICE> a(*d_a);
+  View<double*, DEVICE> b(*d_b);
+  View<double*, DEVICE> c(*d_c);
+  parallel_for(array_size, KOKKOS_LAMBDA (const int index)
   {
-    (*hm_a)(ii) = a[ii];
-    (*hm_b)(ii) = b[ii];
-    (*hm_c)(ii) = c[ii];
-  }
-  deep_copy(*d_a, *hm_a);
-  deep_copy(*d_b, *hm_b);
-  deep_copy(*d_c, *hm_c);
+    a[index] = initA;
+    b[index] = initB;
+    c[index] = initC;
+  });
+  Kokkos::fence();
 }
 
 template <class T>
diff --git a/KOKKOSStream.hpp b/KOKKOSStream.hpp
index 013deff..b230f18 100644
--- a/KOKKOSStream.hpp
+++ b/KOKKOSStream.hpp
@@ -49,8 +49,7 @@ class KOKKOSStream : public Stream<T>
   virtual void triad() override;
   virtual T dot() override;
 
-  virtual void write_arrays(
-      const std::vector<T>& a, const std::vector<T>& b, const std::vector<T>& c) override;
+  virtual void init_arrays(T initA, T initB, T initC) override;
   virtual void read_arrays(
       std::vector<T>& a, std::vector<T>& b, std::vector<T>& c) override;
 };
diff --git a/OCLStream.cpp b/OCLStream.cpp
index 8e2bf5e..7bc5a78 100644
--- a/OCLStream.cpp
+++ b/OCLStream.cpp
@@ -16,6 +16,18 @@ std::string kernels{R"CLC(
 
   constant TYPE scalar = startScalar;
 
+  kernel void init(
+    global TYPE * restrict a,
+    global TYPE * restrict b,
+    global TYPE * restrict c,
+    TYPE initA, TYPE initB, TYPE initC)
+  {
+    const size_t i = get_global_id(0);
+    a[i] = initA;
+    b[i] = initB;
+    c[i] = initC;
+  }
+
   kernel void copy(
     global const TYPE * restrict a,
     global TYPE * restrict c)
@@ -140,6 +152,7 @@ OCLStream<T>::OCLStream(const unsigned int ARRAY_SIZE, const int device_index)
   }
 
   // Create kernels
+  init_kernel = new cl::KernelFunctor<cl::Buffer, cl::Buffer, cl::Buffer, T, T, T>(program, "init");
   copy_kernel = new cl::KernelFunctor<cl::Buffer, cl::Buffer>(program, "copy");
   mul_kernel = new cl::KernelFunctor<cl::Buffer, cl::Buffer>(program, "mul");
   add_kernel = new cl::KernelFunctor<cl::Buffer, cl::Buffer, cl::Buffer>(program, "add");
@@ -168,6 +181,7 @@ OCLStream<T>::OCLStream(const unsigned int ARRAY_SIZE, const int device_index)
 template <class T>
 OCLStream<T>::~OCLStream()
 {
+  delete init_kernel;
   delete copy_kernel;
   delete mul_kernel;
   delete add_kernel;
@@ -231,11 +245,13 @@ T OCLStream<T>::dot()
 }
 
 template <class T>
-void OCLStream<T>::write_arrays(const std::vector<T>& a, const std::vector<T>& b, const std::vector<T>& c)
+void OCLStream<T>::init_arrays(T initA, T initB, T initC)
 {
-  cl::copy(queue, a.begin(), a.end(), d_a);
-  cl::copy(queue, b.begin(), b.end(), d_b);
-  cl::copy(queue, c.begin(), c.end(), d_c);
+  (*init_kernel)(
+    cl::EnqueueArgs(queue, cl::NDRange(array_size)),
+    d_a, d_b, d_c, initA, initB, initC
+  );
+  queue.finish();
 }
 
 template <class T>
diff --git a/OCLStream.h b/OCLStream.h
index ab10a7b..fbeff9a 100644
--- a/OCLStream.h
+++ b/OCLStream.h
@@ -42,6 +42,7 @@ class OCLStream : public Stream<T>
   cl::Context context;
   cl::CommandQueue queue;
 
+  cl::KernelFunctor<cl::Buffer, cl::Buffer, cl::Buffer, T, T, T> *init_kernel;
   cl::KernelFunctor<cl::Buffer, cl::Buffer> *copy_kernel;
   cl::KernelFunctor<cl::Buffer, cl::Buffer> * mul_kernel;
   cl::KernelFunctor<cl::Buffer, cl::Buffer, cl::Buffer> *add_kernel;
@@ -63,7 +64,7 @@ class OCLStream : public Stream<T>
   virtual void triad() override;
   virtual T dot() override;
 
-  virtual void write_arrays(const std::vector<T>& a, const std::vector<T>& b, const std::vector<T>& c) override;
+  virtual void init_arrays(T initA, T initB, T initC) override;
   virtual void read_arrays(std::vector<T>& a, std::vector<T>& b, std::vector<T>& c) override;
 
 };
diff --git a/OMP3Stream.cpp b/OMP3Stream.cpp
index 046e145..6334b65 100644
--- a/OMP3Stream.cpp
+++ b/OMP3Stream.cpp
@@ -26,14 +26,14 @@ OMP3Stream<T>::~OMP3Stream()
 }
 
 template <class T>
-void OMP3Stream<T>::write_arrays(const std::vector<T>& h_a, const std::vector<T>& h_b, const std::vector<T>& h_c)
+void OMP3Stream<T>::init_arrays(T initA, T initB, T initC)
 {
   #pragma omp parallel for
   for (int i = 0; i < array_size; i++)
   {
-    a[i] = h_a[i];
-    b[i] = h_b[i];
-    c[i] = h_c[i];
+    a[i] = initA;
+    b[i] = initB;
+    c[i] = initC;
   }
 }
 
diff --git a/OMP3Stream.h b/OMP3Stream.h
index edad55e..b6ae1c9 100644
--- a/OMP3Stream.h
+++ b/OMP3Stream.h
@@ -35,7 +35,7 @@ class OMP3Stream : public Stream<T>
   virtual void triad() override;
   virtual T dot() override;
 
-  virtual void write_arrays(const std::vector<T>& a, const std::vector<T>& b, const std::vector<T>& c) override;
+  virtual void init_arrays(T initA, T initB, T initC) override;
   virtual void read_arrays(std::vector<T>& a, std::vector<T>& b, std::vector<T>& c) override;
 
 };
diff --git a/OMP45Stream.cpp b/OMP45Stream.cpp
index c45885e..3ba2d40 100644
--- a/OMP45Stream.cpp
+++ b/OMP45Stream.cpp
@@ -35,13 +35,19 @@ OMP45Stream<T>::~OMP45Stream()
 }
 
 template <class T>
-void OMP45Stream<T>::write_arrays(const std::vector<T>& h_a, const std::vector<T>& h_b, const std::vector<T>& h_c)
+void OMP45Stream<T>::init_arrays(T initA, T initB, T initC)
 {
+  unsigned int array_size = this->array_size;
   T *a = this->a;
   T *b = this->b;
   T *c = this->c;
-  #pragma omp target update to(a[0:array_size], b[0:array_size], c[0:array_size])
-  {}
+  #pragma omp target teams distribute parallel for simd map(to: a[0:array_size], b[0:array_size], c[0:array_size])
+  for (int i = 0; i < array_size; i++)
+  {
+    a[i] = initA;
+    b[i] = initB;
+    c[i] = initC;
+  }
 }
 
 template <class T>
diff --git a/OMP45Stream.h b/OMP45Stream.h
index a1febb7..e99fdeb 100644
--- a/OMP45Stream.h
+++ b/OMP45Stream.h
@@ -38,7 +38,7 @@ class OMP45Stream : public Stream<T>
   virtual void triad() override;
   virtual T dot() override;
 
-  virtual void write_arrays(const std::vector<T>& a, const std::vector<T>& b, const std::vector<T>& c) override;
+  virtual void init_arrays(T initA, T initB, T initC) override;
   virtual void read_arrays(std::vector<T>& a, std::vector<T>& b, std::vector<T>& c) override;
 
diff --git a/RAJAStream.cpp b/RAJAStream.cpp
index a0329d6..240f160 100644
--- a/RAJAStream.cpp
+++ b/RAJAStream.cpp
@@ -21,12 +21,6 @@ RAJAStream<T>::RAJAStream(const unsigned int ARRAY_SIZE, const int device_index)
   d_a = new T[ARRAY_SIZE];
   d_b = new T[ARRAY_SIZE];
   d_c = new T[ARRAY_SIZE];
-  forall<policy>(index_set, [=] RAJA_DEVICE (int index)
-  {
-    d_a[index] = 0.0;
-    d_b[index] = 0.0;
-    d_c[index] = 0.0;
-  });
 #else
   cudaMallocManaged((void**)&d_a, sizeof(T)*ARRAY_SIZE, cudaMemAttachGlobal);
   cudaMallocManaged((void**)&d_b, sizeof(T)*ARRAY_SIZE, cudaMemAttachGlobal);
@@ -50,12 +44,17 @@ RAJAStream<T>::~RAJAStream()
 }
 
 template <class T>
-void RAJAStream<T>::write_arrays(
-    const std::vector<T>& a, const std::vector<T>& b, const std::vector<T>& c)
+void RAJAStream<T>::init_arrays(T initA, T initB, T initC)
 {
-  std::copy(a.begin(), a.end(), d_a);
-  std::copy(b.begin(), b.end(), d_b);
-  std::copy(c.begin(), c.end(), d_c);
+  T* a = d_a;
+  T* b = d_b;
+  T* c = d_c;
+  forall<policy>(index_set, [=] RAJA_DEVICE (int index)
+  {
+    a[index] = initA;
+    b[index] = initB;
+    c[index] = initC;
+  });
 }
 
 template <class T>
diff --git a/RAJAStream.hpp b/RAJAStream.hpp
index 768314a..b5cb586 100644
--- a/RAJAStream.hpp
+++ b/RAJAStream.hpp
@@ -53,8 +53,7 @@ class RAJAStream : public Stream<T>
   virtual void triad() override;
   virtual T dot() override;
 
-  virtual void write_arrays(
-      const std::vector<T>& a, const std::vector<T>& b, const std::vector<T>& c) override;
+  virtual void init_arrays(T initA, T initB, T initC) override;
   virtual void read_arrays(
       std::vector<T>& a, std::vector<T>& b, std::vector<T>& c) override;
 };
diff --git a/SYCLStream.cpp b/SYCLStream.cpp
index 60a79a5..6d2cc3f 100644
--- a/SYCLStream.cpp
+++ b/SYCLStream.cpp
@@ -20,6 +20,7 @@ program * p;
 /* Forward declaration of SYCL kernels */
 namespace kernels
 {
+  class init;
   class copy;
   class mul;
   class add;
@@ -60,6 +61,7 @@ SYCLStream<T>::SYCLStream(const unsigned int ARRAY_SIZE, const int device_index)
 
   /* Pre-build the kernels */
   p = new program(queue->get_context());
+  p->build_from_kernel_name<kernels::init>();
   p->build_from_kernel_name<kernels::copy>();
   p->build_from_kernel_name<kernels::mul>();
   p->build_from_kernel_name<kernels::add>();
@@ -201,17 +203,23 @@ T SYCLStream<T>::dot()
 }
 
 template <class T>
-void SYCLStream<T>::write_arrays(const std::vector<T>& a, const std::vector<T>& b, const std::vector<T>& c)
+void SYCLStream<T>::init_arrays(T initA, T initB, T initC)
 {
-  auto _a = d_a->template get_access<access::mode::write>();
-  auto _b = d_b->template get_access<access::mode::write>();
-  auto _c = d_c->template get_access<access::mode::write>();
-  for (int i = 0; i < array_size; i++)
+  queue->submit([&](handler &cgh)
   {
-    _a[i] = a[i];
-    _b[i] = b[i];
-    _c[i] = c[i];
-  }
+    auto ka = d_a->template get_access<access::mode::write>(cgh);
+    auto kb = d_b->template get_access<access::mode::write>(cgh);
+    auto kc = d_c->template get_access<access::mode::write>(cgh);
+    cgh.parallel_for<kernels::init>(p->get_kernel<kernels::init>(),
+      range<1>{array_size}, [=](item<1> item)
+    {
+      auto id = item.get();
+      ka[id[0]] = initA;
+      kb[id[0]] = initB;
+      kc[id[0]] = initC;
+    });
+  });
+  queue->wait();
 }
 
 template <class T>
diff --git a/SYCLStream.h b/SYCLStream.h
index 6f7205b..f3c8d25 100644
--- a/SYCLStream.h
+++ b/SYCLStream.h
@@ -44,7 +44,7 @@ class SYCLStream : public Stream<T>
   virtual void triad() override;
   virtual T dot() override;
 
-  virtual void write_arrays(const std::vector<T>& a, const std::vector<T>& b, const std::vector<T>& c) override;
+  virtual void init_arrays(T initA, T initB, T initC) override;
   virtual void read_arrays(std::vector<T>& a, std::vector<T>& b, std::vector<T>& c) override;
 
 };
diff --git a/Stream.h b/Stream.h
index e9d17b1..ff00a54 100644
--- a/Stream.h
+++ b/Stream.h
@@ -32,7 +32,7 @@ class Stream
   virtual T dot() = 0;
 
   // Copy memory between host and device
-  virtual void write_arrays(const std::vector<T>& a, const std::vector<T>& b, const std::vector<T>& c) = 0;
+  virtual void init_arrays(T initA, T initB, T initC) = 0;
   virtual void read_arrays(std::vector<T>& a, std::vector<T>& b, std::vector<T>& c) = 0;
 
 };
diff --git a/main.cpp b/main.cpp
index f717d88..16e3241 100644
--- a/main.cpp
+++ b/main.cpp
@@ -84,9 +84,9 @@ void run()
     std::cout << "Precision: double" << std::endl;
 
   // Create host vectors
-  std::vector<T> a(ARRAY_SIZE, startA);
-  std::vector<T> b(ARRAY_SIZE, startB);
-  std::vector<T> c(ARRAY_SIZE, startC);
+  std::vector<T> a(ARRAY_SIZE);
+  std::vector<T> b(ARRAY_SIZE);
+  std::vector<T> c(ARRAY_SIZE);
 
   std::streamsize ss = std::cout.precision();
   std::cout << std::setprecision(1) << std::fixed << "Array size: " << ARRAY_SIZE*sizeof(T)*1.0E-6 << " MB"
@@ -138,7 +138,7 @@ void run()
 
 #endif
 
-  stream->write_arrays(a, b, c);
+  stream->init_arrays(startA, startB, startC);
 
   // List of times
   std::vector<std::vector<double>> timings(5);