diff --git a/ACCStream.cpp b/ACCStream.cpp index bd49663..ccc942a 100644 --- a/ACCStream.cpp +++ b/ACCStream.cpp @@ -36,13 +36,19 @@ ACCStream::~ACCStream() } template -void ACCStream::write_arrays(const std::vector& h_a, const std::vector& h_b, const std::vector& h_c) +void ACCStream::init_arrays(T initA, T initB, T initC) { - T *a = this->a; - T *b = this->b; - T *c = this->c; - #pragma acc update device(a[0:array_size], b[0:array_size], c[0:array_size]) - {} + unsigned int array_size = this->array_size; + T * restrict a = this->a; + T * restrict b = this->b; + T * restrict c = this->c; + #pragma acc kernels present(a[0:array_size], b[0:array_size], c[0:array_size]) wait + for (int i = 0; i < array_size; i++) + { + a[i] = initA; + b[i] = initB; + c[i] = initC; + } } template diff --git a/ACCStream.h b/ACCStream.h index 48fea55..54f947b 100644 --- a/ACCStream.h +++ b/ACCStream.h @@ -36,7 +36,7 @@ class ACCStream : public Stream virtual void mul() override; virtual void triad() override; - virtual void write_arrays(const std::vector& a, const std::vector& b, const std::vector& c) override; + virtual void init_arrays(T initA, T initB, T initC) override; virtual void read_arrays(std::vector& a, std::vector& b, std::vector& c) override; diff --git a/CUDAStream.cu b/CUDAStream.cu index ff2ec41..08026d9 100644 --- a/CUDAStream.cu +++ b/CUDAStream.cu @@ -74,15 +74,22 @@ CUDAStream::~CUDAStream() check_error(); } -template -void CUDAStream::write_arrays(const std::vector& a, const std::vector& b, const std::vector& c) + +template +__global__ void init_kernel(T * a, T * b, T * c, T initA, T initB, T initC) { - // Copy host memory to device - cudaMemcpy(d_a, a.data(), a.size()*sizeof(T), cudaMemcpyHostToDevice); + const int i = blockDim.x * blockIdx.x + threadIdx.x; + a[i] = initA; + b[i] = initB; + c[i] = initC; +} + +template +void CUDAStream::init_arrays(T initA, T initB, T initC) +{ + init_kernel<<>>(d_a, d_b, d_c, initA, initB, initC); check_error(); - cudaMemcpy(d_b, b.data(), b.size()*sizeof(T), cudaMemcpyHostToDevice); - check_error(); - cudaMemcpy(d_c, c.data(), c.size()*sizeof(T), cudaMemcpyHostToDevice); + cudaDeviceSynchronize(); check_error(); } diff --git a/CUDAStream.h b/CUDAStream.h index 6904a86..912721e 100644 --- a/CUDAStream.h +++ b/CUDAStream.h @@ -37,7 +37,7 @@ class CUDAStream : public Stream virtual void mul() override; virtual void triad() override; - virtual void write_arrays(const std::vector& a, const std::vector& b, const std::vector& c) override; + virtual void init_arrays(T initA, T initB, T initC) override; virtual void read_arrays(std::vector& a, std::vector& b, std::vector& c) override; }; diff --git a/HIPStream.cu b/HIPStream.cu index 34ecfb6..8c02348 100644 --- a/HIPStream.cu +++ b/HIPStream.cu @@ -74,15 +74,21 @@ HIPStream::~HIPStream() check_error(); } -template -void HIPStream::write_arrays(const std::vector& a, const std::vector& b, const std::vector& c) +template +__global__ void init_kernel(hipLaunchParm lp, T * a, T * b, T * c, T initA, T initB, T initC) { - // Copy host memory to device - hipMemcpy(d_a, a.data(), a.size()*sizeof(T), hipMemcpyHostToDevice); + const int i = hipBlockDim_x * hipBlockIdx_x + hipThreadIdx_x; + a[i] = initA; + b[i] = initB; + c[i] = initC; +} + +template +void HIPStream::init_arrays(T initA, T initB, T initC) +{ + hipLaunchKernel(HIP_KERNEL_NAME(init_kernel), dim3(array_size/TBSIZE), dim3(TBSIZE), 0, 0, d_a, d_b, d_c, initA, initB, initC); check_error(); - hipMemcpy(d_b, b.data(), b.size()*sizeof(T), hipMemcpyHostToDevice); - check_error(); - hipMemcpy(d_c, c.data(), c.size()*sizeof(T), hipMemcpyHostToDevice); + hipDeviceSynchronize(); check_error(); } diff --git a/HIPStream.h b/HIPStream.h index 9015e35..392080a 100644 --- a/HIPStream.h +++ b/HIPStream.h @@ -37,7 +37,7 @@ class HIPStream : public Stream virtual void mul() override; virtual void triad() override; - virtual void write_arrays(const std::vector& a, const std::vector& b, const std::vector& c) override; + virtual void init_arrays(T initA, T initB, T initC) override; virtual void read_arrays(std::vector& a, std::vector& b, std::vector& c) override; }; diff --git a/KOKKOSStream.cpp b/KOKKOSStream.cpp index 94ac7ee..72b1ee5 100644 --- a/KOKKOSStream.cpp +++ b/KOKKOSStream.cpp @@ -34,18 +34,18 @@ KOKKOSStream::~KOKKOSStream() } template -void KOKKOSStream::write_arrays( - const std::vector& a, const std::vector& b, const std::vector& c) +void KOKKOSStream::init_arrays(T initA, T initB, T initC) { - for(int ii = 0; ii < array_size; ++ii) + View a(*d_a); + View b(*d_b); + View c(*d_c); + parallel_for(array_size, KOKKOS_LAMBDA (const int index) { - (*hm_a)(ii) = a[ii]; - (*hm_b)(ii) = b[ii]; - (*hm_c)(ii) = c[ii]; - } - deep_copy(*d_a, *hm_a); - deep_copy(*d_b, *hm_b); - deep_copy(*d_c, *hm_c); + a[index] = initA; + b[index] - initB; + c[index] = initC; + }); + Kokkos::fence(); } template diff --git a/KOKKOSStream.hpp b/KOKKOSStream.hpp index d2b9665..ff7cfeb 100644 --- a/KOKKOSStream.hpp +++ b/KOKKOSStream.hpp @@ -48,8 +48,7 @@ class KOKKOSStream : public Stream virtual void mul() override; virtual void triad() override; - virtual void write_arrays( - const std::vector& a, const std::vector& b, const std::vector& c) override; + virtual void init_arrays(T initA, T initB, T initC) override; virtual void read_arrays( std::vector& a, std::vector& b, std::vector& c) override; }; diff --git a/OCLStream.cpp b/OCLStream.cpp index 2a1e5ee..c7e09a8 100644 --- a/OCLStream.cpp +++ b/OCLStream.cpp @@ -16,6 +16,18 @@ std::string kernels{R"CLC( constant TYPE scalar = startScalar; + kernel void init( + global TYPE * restrict a, + global TYPE * restrict b, + global TYPE * restrict c, + TYPE initA, TYPE initB, TYPE initC) + { + const size_t i = get_global_id(0); + a[i] = initA; + b[i] = initB; + c[i] = initC; + } + kernel void copy( global const TYPE * restrict a, global TYPE * restrict c) @@ -101,6 +113,7 @@ OCLStream::OCLStream(const unsigned int ARRAY_SIZE, const int device_index) } // Create kernels + init_kernel = new cl::KernelFunctor(program, "init"); copy_kernel = new cl::KernelFunctor(program, "copy"); mul_kernel = new cl::KernelFunctor(program, "mul"); add_kernel = new cl::KernelFunctor(program, "add"); @@ -120,12 +133,12 @@ OCLStream::OCLStream(const unsigned int ARRAY_SIZE, const int device_index) d_a = cl::Buffer(context, CL_MEM_READ_WRITE, sizeof(T) * ARRAY_SIZE); d_b = cl::Buffer(context, CL_MEM_READ_WRITE, sizeof(T) * ARRAY_SIZE); d_c = cl::Buffer(context, CL_MEM_READ_WRITE, sizeof(T) * ARRAY_SIZE); - } template OCLStream::~OCLStream() { + delete init_kernel; delete copy_kernel; delete mul_kernel; delete add_kernel; @@ -173,11 +186,13 @@ void OCLStream::triad() } template -void OCLStream::write_arrays(const std::vector& a, const std::vector& b, const std::vector& c) +void OCLStream::init_arrays(T initA, T initB, T initC) { - cl::copy(queue, a.begin(), a.end(), d_a); - cl::copy(queue, b.begin(), b.end(), d_b); - cl::copy(queue, c.begin(), c.end(), d_c); + (*init_kernel)( + cl::EnqueueArgs(queue, cl::NDRange(array_size)), + d_a, d_b, d_c, initA, initB, initC + ); + queue.finish(); } template diff --git a/OCLStream.h b/OCLStream.h index 54abaa3..845e144 100644 --- a/OCLStream.h +++ b/OCLStream.h @@ -38,6 +38,7 @@ class OCLStream : public Stream cl::Context context; cl::CommandQueue queue; + cl::KernelFunctor *init_kernel; cl::KernelFunctor *copy_kernel; cl::KernelFunctor * mul_kernel; cl::KernelFunctor *add_kernel; @@ -53,7 +54,7 @@ class OCLStream : public Stream virtual void mul() override; virtual void triad() override; - virtual void write_arrays(const std::vector& a, const std::vector& b, const std::vector& c) override; + virtual void init_arrays(T initA, T initB, T initC) override; virtual void read_arrays(std::vector& a, std::vector& b, std::vector& c) override; }; diff --git a/OMP3Stream.cpp b/OMP3Stream.cpp index f578c7c..b5e1bc2 100644 --- a/OMP3Stream.cpp +++ b/OMP3Stream.cpp @@ -26,14 +26,14 @@ OMP3Stream::~OMP3Stream() template -void OMP3Stream::write_arrays(const std::vector& h_a, const std::vector& h_b, const std::vector& h_c) +void OMP3Stream::init_arrays(T initA, T initB, T initC) { #pragma omp parallel for for (int i = 0; i < array_size; i++) { - a[i] = h_a[i]; - b[i] = h_b[i]; - c[i] = h_c[i]; + a[i] = initA; + b[i] = initB; + c[i] = initC; } } diff --git a/OMP3Stream.h b/OMP3Stream.h index 0f14300..1dadc95 100644 --- a/OMP3Stream.h +++ b/OMP3Stream.h @@ -34,7 +34,7 @@ class OMP3Stream : public Stream virtual void mul() override; virtual void triad() override; - virtual void write_arrays(const std::vector& a, const std::vector& b, const std::vector& c) override; + virtual void init_arrays(T initA, T initB, T initC) override; virtual void read_arrays(std::vector& a, std::vector& b, std::vector& c) override; }; diff --git a/OMP45Stream.cpp b/OMP45Stream.cpp index 8f684e2..1722662 100644 --- a/OMP45Stream.cpp +++ b/OMP45Stream.cpp @@ -35,13 +35,19 @@ OMP45Stream::~OMP45Stream() } template -void OMP45Stream::write_arrays(const std::vector& h_a, const std::vector& h_b, const std::vector& h_c) +void OMP45Stream::init_arrays(T initA, T initB, T initC) { + unsigned int array_size = this->array_size; T *a = this->a; T *b = this->b; T *c = this->c; - #pragma omp target update to(a[0:array_size], b[0:array_size], c[0:array_size]) - {} + #pragma omp target teams distribute parallel for simd map(to: a[0:array_size], b[0:array_size], c[0:array_size]) + for (int i = 0; i < array_size; i++) + { + a[i] = initA; + b[i] = initB; + c[i] = initC; + } } template diff --git a/OMP45Stream.h b/OMP45Stream.h index bd812a1..d2a5aaf 100644 --- a/OMP45Stream.h +++ b/OMP45Stream.h @@ -37,7 +37,7 @@ class OMP45Stream : public Stream virtual void mul() override; virtual void triad() override; - virtual void write_arrays(const std::vector& a, const std::vector& b, const std::vector& c) override; + virtual void init_arrays(T initA, T initB, T initC) override; virtual void read_arrays(std::vector& a, std::vector& b, std::vector& c) override; diff --git a/RAJAStream.cpp b/RAJAStream.cpp index 33687a1..d872987 100644 --- a/RAJAStream.cpp +++ b/RAJAStream.cpp @@ -21,12 +21,6 @@ RAJAStream::RAJAStream(const unsigned int ARRAY_SIZE, const int device_index) d_a = new T[ARRAY_SIZE]; d_b = new T[ARRAY_SIZE]; d_c = new T[ARRAY_SIZE]; - forall(index_set, [=] RAJA_DEVICE (int index) - { - d_a[index] = 0.0; - d_b[index] = 0.0; - d_c[index] = 0.0; - }); #else cudaMallocManaged((void**)&d_a, sizeof(T)*ARRAY_SIZE, cudaMemAttachGlobal); cudaMallocManaged((void**)&d_b, sizeof(T)*ARRAY_SIZE, cudaMemAttachGlobal); @@ -50,12 +44,17 @@ RAJAStream::~RAJAStream() } template -void RAJAStream::write_arrays( - const std::vector& a, const std::vector& b, const std::vector& c) +void RAJAStream::init_arrays(T initA, T initB, T initC) { - std::copy(a.begin(), a.end(), d_a); - std::copy(b.begin(), b.end(), d_b); - std::copy(c.begin(), c.end(), d_c); + T* a = d_a; + T* b = d_b; + T* c = d_c; + forall(index_set, [=] RAJA_DEVICE (int index) + { + a[index] = initA; + b[index] = initB; + c[index] = initC; + }); } template diff --git a/RAJAStream.hpp b/RAJAStream.hpp index 454e20e..8ffa5be 100644 --- a/RAJAStream.hpp +++ b/RAJAStream.hpp @@ -50,8 +50,7 @@ class RAJAStream : public Stream virtual void mul() override; virtual void triad() override; - virtual void write_arrays( - const std::vector& a, const std::vector& b, const std::vector& c) override; + virtual void init_arrays(T initA, T initB, T initC) override; virtual void read_arrays( std::vector& a, std::vector& b, std::vector& c) override; }; diff --git a/SYCLStream.cpp b/SYCLStream.cpp index 12e96b4..919a657 100644 --- a/SYCLStream.cpp +++ b/SYCLStream.cpp @@ -20,6 +20,7 @@ program * p; /* Forward declaration of SYCL kernels */ namespace kernels { + class init; class copy; class mul; class add; @@ -46,6 +47,7 @@ SYCLStream::SYCLStream(const unsigned int ARRAY_SIZE, const int device_index) /* Pre-build the kernels */ p = new program(queue->get_context()); + p->build_from_kernel_name(); p->build_from_kernel_name(); p->build_from_kernel_name(); p->build_from_kernel_name(); @@ -142,17 +144,23 @@ void SYCLStream::triad() } template -void SYCLStream::write_arrays(const std::vector& a, const std::vector& b, const std::vector& c) +void SYCLStream::init_arrays(T initA, T initB, T initC) { - auto _a = d_a->template get_access(); - auto _b = d_b->template get_access(); - auto _c = d_c->template get_access(); - for (int i = 0; i < array_size; i++) + queue->submit([&](handler &cgh) { - _a[i] = a[i]; - _b[i] = b[i]; - _c[i] = c[i]; - } + auto ka = d_a->template get_access(cgh); + auto kb = d_b->template get_access(cgh); + auto kc = d_c->template get_access(cgh); + cgh.parallel_for(p->get_kernel(), + range<1>{array_size}, [=](item<1> item) + { + auto id = item.get(); + ka[id[0]] = initA; + kb[id[0]] = initB; + kc[id[0]] = initC; + }); + }); + queue->wait(); } template diff --git a/SYCLStream.h b/SYCLStream.h index 8bc515d..4bd21d8 100644 --- a/SYCLStream.h +++ b/SYCLStream.h @@ -38,7 +38,7 @@ class SYCLStream : public Stream virtual void mul() override; virtual void triad() override; - virtual void write_arrays(const std::vector& a, const std::vector& b, const std::vector& c) override; + virtual void init_arrays(T initA, T initB, T initC) override; virtual void read_arrays(std::vector& a, std::vector& b, std::vector& c) override; }; diff --git a/Stream.h b/Stream.h index 631e305..44b4d8b 100644 --- a/Stream.h +++ b/Stream.h @@ -31,7 +31,7 @@ class Stream virtual void triad() = 0; // Copy memory between host and device - virtual void write_arrays(const std::vector& a, const std::vector& b, const std::vector& c) = 0; + virtual void init_arrays(T initA, T initB, T initC) = 0; virtual void read_arrays(std::vector& a, std::vector& b, std::vector& c) = 0; }; diff --git a/main.cpp b/main.cpp index 6a15aa7..cb6241f 100644 --- a/main.cpp +++ b/main.cpp @@ -84,9 +84,9 @@ void run() std::cout << "Precision: double" << std::endl; // Create host vectors - std::vector a(ARRAY_SIZE, startA); - std::vector b(ARRAY_SIZE, startB); - std::vector c(ARRAY_SIZE, startC); + std::vector a(ARRAY_SIZE); + std::vector b(ARRAY_SIZE); + std::vector c(ARRAY_SIZE); std::streamsize ss = std::cout.precision(); std::cout << std::setprecision(1) << std::fixed << "Array size: " << ARRAY_SIZE*sizeof(T)*1.0E-6 << " MB" @@ -135,7 +135,7 @@ void run() #endif - stream->write_arrays(a, b, c); + stream->init_arrays(startA, startB, startC); // List of times std::vector> timings(4);