Merge remote-tracking branch 'origin/init-arrays' into devel

2016-11-04 09:17:54 +00:00 · 2016-11-04 09:17:54 +00:00 · d42bcd4675
commit d42bcd4675
parent 6f9512e5b5 7f4761ae52
20 changed files with 124 additions and 77 deletions
--- a/ACCStream.cpp
+++ b/ACCStream.cpp
@ -36,13 +36,19 @@ ACCStream<T>::~ACCStream()
 }

 template <class T>
-void ACCStream<T>::write_arrays(const std::vector<T>& h_a, const std::vector<T>& h_b, const std::vector<T>& h_c)
+void ACCStream<T>::init_arrays(T initA, T initB, T initC)
 {
-  T *a = this->a;
-  T *b = this->b;
-  T *c = this->c;
-  #pragma acc update device(a[0:array_size], b[0:array_size], c[0:array_size])
-  {}
+  unsigned int array_size = this->array_size;
+  T * restrict a = this->a;
+  T * restrict b = this->b;
+  T * restrict c = this->c;
+  #pragma acc kernels present(a[0:array_size], b[0:array_size], c[0:array_size]) wait
+  for (int i = 0; i < array_size; i++)
+  {
+    a[i] = initA;
+    b[i] = initB;
+    c[i] = initC;
+  }
 }

 template <class T>
--- a/ACCStream.h
+++ b/ACCStream.h
@ -37,7 +37,7 @@ class ACCStream : public Stream<T>
    virtual void triad() override;
    virtual T dot() override;

-    virtual void write_arrays(const std::vector<T>& a, const std::vector<T>& b, const std::vector<T>& c) override;
+    virtual void init_arrays(T initA, T initB, T initC) override;
    virtual void read_arrays(std::vector<T>& a, std::vector<T>& b, std::vector<T>& c) override;


--- a/CUDAStream.cu
+++ b/CUDAStream.cu
@ -79,15 +79,22 @@ CUDAStream<T>::~CUDAStream()
  check_error();
 }

-template <class T>
-void CUDAStream<T>::write_arrays(const std::vector<T>& a, const std::vector<T>& b, const std::vector<T>& c)
+
+template <typename T>
+__global__ void init_kernel(T * a, T * b, T * c, T initA, T initB, T initC)
 {
-  // Copy host memory to device
-  cudaMemcpy(d_a, a.data(), a.size()*sizeof(T), cudaMemcpyHostToDevice);
+  const int i = blockDim.x * blockIdx.x + threadIdx.x;
+  a[i] = initA;
+  b[i] = initB;
+  c[i] = initC;
+}
+
+template <class T>
+void CUDAStream<T>::init_arrays(T initA, T initB, T initC)
+{
+  init_kernel<<<array_size/TBSIZE, TBSIZE>>>(d_a, d_b, d_c, initA, initB, initC);
  check_error();
-  cudaMemcpy(d_b, b.data(), b.size()*sizeof(T), cudaMemcpyHostToDevice);
-  check_error();
-  cudaMemcpy(d_c, c.data(), c.size()*sizeof(T), cudaMemcpyHostToDevice);
+  cudaDeviceSynchronize();
  check_error();
 }

--- a/CUDAStream.h
+++ b/CUDAStream.h
@ -46,7 +46,7 @@ class CUDAStream : public Stream<T>
    virtual void triad() override;
    virtual T dot() override;

-    virtual void write_arrays(const std::vector<T>& a, const std::vector<T>& b, const std::vector<T>& c) override;
+    virtual void init_arrays(T initA, T initB, T initC) override;
    virtual void read_arrays(std::vector<T>& a, std::vector<T>& b, std::vector<T>& c) override;

 };
--- a/HIPStream.cu
+++ b/HIPStream.cu
@ -74,15 +74,21 @@ HIPStream<T>::~HIPStream()
  check_error();
 }

-template <class T>
-void HIPStream<T>::write_arrays(const std::vector<T>& a, const std::vector<T>& b, const std::vector<T>& c)
+template <typename T>
+__global__ void init_kernel(hipLaunchParm lp, T * a, T * b, T * c, T initA, T initB, T initC)
 {
-  // Copy host memory to device
-  hipMemcpy(d_a, a.data(), a.size()*sizeof(T), hipMemcpyHostToDevice);
+  const int i = hipBlockDim_x * hipBlockIdx_x + hipThreadIdx_x;
+  a[i] = initA;
+  b[i] = initB;
+  c[i] = initC;
+}
+
+template <class T>
+void HIPStream<T>::init_arrays(T initA, T initB, T initC)
+{
+  hipLaunchKernel(HIP_KERNEL_NAME(init_kernel), dim3(array_size/TBSIZE), dim3(TBSIZE), 0, 0, d_a, d_b, d_c, initA, initB, initC);
  check_error();
-  hipMemcpy(d_b, b.data(), b.size()*sizeof(T), hipMemcpyHostToDevice);
-  check_error();
-  hipMemcpy(d_c, c.data(), c.size()*sizeof(T), hipMemcpyHostToDevice);
+  hipDeviceSynchronize();
  check_error();
 }

--- a/HIPStream.h
+++ b/HIPStream.h
@ -37,7 +37,7 @@ class HIPStream : public Stream<T>
    virtual void mul() override;
    virtual void triad() override;

-    virtual void write_arrays(const std::vector<T>& a, const std::vector<T>& b, const std::vector<T>& c) override;
+    virtual void init_arrays(T initA, T initB, T initC) override;
    virtual void read_arrays(std::vector<T>& a, std::vector<T>& b, std::vector<T>& c) override;

 };
--- a/KOKKOSStream.cpp
+++ b/KOKKOSStream.cpp
@ -34,18 +34,18 @@ KOKKOSStream<T>::~KOKKOSStream()
 }

 template <class T>
-void KOKKOSStream<T>::write_arrays(
-        const std::vector<T>& a, const std::vector<T>& b, const std::vector<T>& c)
+void KOKKOSStream<T>::init_arrays(T initA, T initB, T initC)
 {
-  for(int ii = 0; ii < array_size; ++ii)
+  View<double*, DEVICE> a(*d_a);
+  View<double*, DEVICE> b(*d_b);
+  View<double*, DEVICE> c(*d_c);
+  parallel_for(array_size, KOKKOS_LAMBDA (const int index)
  {
-    (*hm_a)(ii) = a[ii];
-    (*hm_b)(ii) = b[ii];
-    (*hm_c)(ii) = c[ii];
-  }
-  deep_copy(*d_a, *hm_a);
-  deep_copy(*d_b, *hm_b);
-  deep_copy(*d_c, *hm_c);
+    a[index] = initA;
+    b[index] - initB;
+    c[index] = initC;
+  });
+  Kokkos::fence();
 }

 template <class T>
--- a/KOKKOSStream.hpp
+++ b/KOKKOSStream.hpp
@ -49,8 +49,7 @@ class KOKKOSStream : public Stream<T>
    virtual void triad() override;
    virtual T dot() override;

-    virtual void write_arrays(
-            const std::vector<T>& a, const std::vector<T>& b, const std::vector<T>& c) override;
+    virtual void init_arrays(T initA, T initB, T initC) override;
    virtual void read_arrays(
            std::vector<T>& a, std::vector<T>& b, std::vector<T>& c) override;
 };
--- a/OCLStream.cpp
+++ b/OCLStream.cpp
@ -16,6 +16,18 @@ std::string kernels{R"CLC(

  constant TYPE scalar = startScalar;

+  kernel void init(
+    global TYPE * restrict a,
+    global TYPE * restrict b,
+    global TYPE * restrict c,
+    TYPE initA, TYPE initB, TYPE initC)
+  {
+    const size_t i = get_global_id(0);
+    a[i] = initA;
+    b[i] = initB;
+    c[i] = initC;
+  }
+
  kernel void copy(
    global const TYPE * restrict a,
    global TYPE * restrict c)
@ -140,6 +152,7 @@ OCLStream<T>::OCLStream(const unsigned int ARRAY_SIZE, const int device_index)
  }

  // Create kernels
+  init_kernel = new cl::KernelFunctor<cl::Buffer, cl::Buffer, cl::Buffer, T, T, T>(program, "init");
  copy_kernel = new cl::KernelFunctor<cl::Buffer, cl::Buffer>(program, "copy");
  mul_kernel = new cl::KernelFunctor<cl::Buffer, cl::Buffer>(program, "mul");
  add_kernel = new cl::KernelFunctor<cl::Buffer, cl::Buffer, cl::Buffer>(program, "add");
@ -168,6 +181,7 @@ OCLStream<T>::OCLStream(const unsigned int ARRAY_SIZE, const int device_index)
 template <class T>
 OCLStream<T>::~OCLStream()
 {
+  delete init_kernel;
  delete copy_kernel;
  delete mul_kernel;
  delete add_kernel;
@ -231,11 +245,13 @@ T OCLStream<T>::dot()
 }

 template <class T>
-void OCLStream<T>::write_arrays(const std::vector<T>& a, const std::vector<T>& b, const std::vector<T>& c)
+void OCLStream<T>::init_arrays(T initA, T initB, T initC)
 {
-  cl::copy(queue, a.begin(), a.end(), d_a);
-  cl::copy(queue, b.begin(), b.end(), d_b);
-  cl::copy(queue, c.begin(), c.end(), d_c);
+  (*init_kernel)(
+    cl::EnqueueArgs(queue, cl::NDRange(array_size)),
+    d_a, d_b, d_c, initA, initB, initC
+  );
+  queue.finish();
 }

 template <class T>
--- a/OCLStream.h
+++ b/OCLStream.h
@ -42,6 +42,7 @@ class OCLStream : public Stream<T>
    cl::Context context;
    cl::CommandQueue queue;

+    cl::KernelFunctor<cl::Buffer, cl::Buffer, cl::Buffer, T, T, T> *init_kernel;
    cl::KernelFunctor<cl::Buffer, cl::Buffer> *copy_kernel;
    cl::KernelFunctor<cl::Buffer, cl::Buffer> * mul_kernel;
    cl::KernelFunctor<cl::Buffer, cl::Buffer, cl::Buffer> *add_kernel;
@ -63,7 +64,7 @@ class OCLStream : public Stream<T>
    virtual void triad() override;
    virtual T dot() override;

-    virtual void write_arrays(const std::vector<T>& a, const std::vector<T>& b, const std::vector<T>& c) override;
+    virtual void init_arrays(T initA, T initB, T initC) override;
    virtual void read_arrays(std::vector<T>& a, std::vector<T>& b, std::vector<T>& c) override;

 };
--- a/OMP3Stream.cpp
+++ b/OMP3Stream.cpp
@ -26,14 +26,14 @@ OMP3Stream<T>::~OMP3Stream()


 template <class T>
-void OMP3Stream<T>::write_arrays(const std::vector<T>& h_a, const std::vector<T>& h_b, const std::vector<T>& h_c)
+void OMP3Stream<T>::init_arrays(T initA, T initB, T initC)
 {
  #pragma omp parallel for
  for (int i = 0; i < array_size; i++)
  {
-    a[i] = h_a[i];
-    b[i] = h_b[i];
-    c[i] = h_c[i];
+    a[i] = initA;
+    b[i] = initB;
+    c[i] = initC;
  }
 }

--- a/OMP3Stream.h
+++ b/OMP3Stream.h
@ -35,7 +35,7 @@ class OMP3Stream : public Stream<T>
    virtual void triad() override;
    virtual T dot() override;

-    virtual void write_arrays(const std::vector<T>& a, const std::vector<T>& b, const std::vector<T>& c) override;
+    virtual void init_arrays(T initA, T initB, T initC) override;
    virtual void read_arrays(std::vector<T>& a, std::vector<T>& b, std::vector<T>& c) override;

 };
--- a/OMP45Stream.cpp
+++ b/OMP45Stream.cpp
@ -35,13 +35,19 @@ OMP45Stream<T>::~OMP45Stream()
 }

 template <class T>
-void OMP45Stream<T>::write_arrays(const std::vector<T>& h_a, const std::vector<T>& h_b, const std::vector<T>& h_c)
+void OMP45Stream<T>::init_arrays(T initA, T initB, T initC)
 {
+  unsigned int array_size = this->array_size;
  T *a = this->a;
  T *b = this->b;
  T *c = this->c;
-  #pragma omp target update to(a[0:array_size], b[0:array_size], c[0:array_size])
-  {}
+  #pragma omp target teams distribute parallel for simd map(to: a[0:array_size], b[0:array_size], c[0:array_size])
+  for (int i = 0; i < array_size; i++)
+  {
+    a[i] = initA;
+    b[i] = initB;
+    c[i] = initC;
+  }
 }

 template <class T>
--- a/OMP45Stream.h
+++ b/OMP45Stream.h
@ -38,7 +38,7 @@ class OMP45Stream : public Stream<T>
    virtual void triad() override;
    virtual T dot() override;

-    virtual void write_arrays(const std::vector<T>& a, const std::vector<T>& b, const std::vector<T>& c) override;
+    virtual void init_arrays(T initA, T initB, T initC) override;
    virtual void read_arrays(std::vector<T>& a, std::vector<T>& b, std::vector<T>& c) override;


--- a/RAJAStream.cpp
+++ b/RAJAStream.cpp
@ -21,12 +21,6 @@ RAJAStream<T>::RAJAStream(const unsigned int ARRAY_SIZE, const int device_index)
  d_a = new T[ARRAY_SIZE];
  d_b = new T[ARRAY_SIZE];
  d_c = new T[ARRAY_SIZE];
-  forall<policy>(index_set, [=] RAJA_DEVICE (int index)
-  {
-    d_a[index] = 0.0;
-    d_b[index] = 0.0;
-    d_c[index] = 0.0;
-  });
 #else
  cudaMallocManaged((void**)&d_a, sizeof(T)*ARRAY_SIZE, cudaMemAttachGlobal);
  cudaMallocManaged((void**)&d_b, sizeof(T)*ARRAY_SIZE, cudaMemAttachGlobal);
@ -50,12 +44,17 @@ RAJAStream<T>::~RAJAStream()
 }

 template <class T>
-void RAJAStream<T>::write_arrays(
-        const std::vector<T>& a, const std::vector<T>& b, const std::vector<T>& c)
+void RAJAStream<T>::init_arrays(T initA, T initB, T initC)
 {
-  std::copy(a.begin(), a.end(), d_a);
-  std::copy(b.begin(), b.end(), d_b);
-  std::copy(c.begin(), c.end(), d_c);
+  T* a = d_a;
+  T* b = d_b;
+  T* c = d_c;
+  forall<policy>(index_set, [=] RAJA_DEVICE (int index)
+  {
+    a[index] = initA;
+    b[index] = initB;
+    c[index] = initC;
+  });
 }

 template <class T>
--- a/RAJAStream.hpp
+++ b/RAJAStream.hpp
@ -53,8 +53,7 @@ class RAJAStream : public Stream<T>
    virtual void triad() override;
    virtual T dot() override;

-    virtual void write_arrays(
-            const std::vector<T>& a, const std::vector<T>& b, const std::vector<T>& c) override;
+    virtual void init_arrays(T initA, T initB, T initC) override;
    virtual void read_arrays(
            std::vector<T>& a, std::vector<T>& b, std::vector<T>& c) override;
 };
--- a/SYCLStream.cpp
+++ b/SYCLStream.cpp
@ -20,6 +20,7 @@ program * p;

 /* Forward declaration of SYCL kernels */
 namespace kernels {
+  class init;
  class copy;
  class mul;
  class add;
@ -60,6 +61,7 @@ SYCLStream<T>::SYCLStream(const unsigned int ARRAY_SIZE, const int device_index)

  /* Pre-build the kernels */
  p = new program(queue->get_context());
+  p->build_from_kernel_name<kernels::init>();
  p->build_from_kernel_name<kernels::copy>();
  p->build_from_kernel_name<kernels::mul>();
  p->build_from_kernel_name<kernels::add>();
@ -201,17 +203,23 @@ T SYCLStream<T>::dot()
 }

 template <class T>
-void SYCLStream<T>::write_arrays(const std::vector<T>& a, const std::vector<T>& b, const std::vector<T>& c)
+void SYCLStream<T>::init_arrays(T initA, T initB, T initC)
 {
-  auto _a = d_a->template get_access<access::mode::write, access::target::host_buffer>();
-  auto _b = d_b->template get_access<access::mode::write, access::target::host_buffer>();
-  auto _c = d_c->template get_access<access::mode::write, access::target::host_buffer>();
-  for (int i = 0; i < array_size; i++)
+  queue->submit([&](handler &cgh)
  {
-    _a[i] = a[i];
-    _b[i] = b[i];
-    _c[i] = c[i];
-  }
+    auto ka = d_a->template get_access<access::mode::write>(cgh);
+    auto kb = d_b->template get_access<access::mode::write>(cgh);
+    auto kc = d_c->template get_access<access::mode::write>(cgh);
+    cgh.parallel_for<kernels::init>(p->get_kernel<kernels::init>(),
+          range<1>{array_size}, [=](item<1> item)
+    {
+      auto id = item.get();
+      ka[id[0]] = initA;
+      kb[id[0]] = initB;
+      kc[id[0]] = initC;
+    });
+  });
+  queue->wait();
 }

 template <class T>
--- a/SYCLStream.h
+++ b/SYCLStream.h
@ -44,7 +44,7 @@ class SYCLStream : public Stream<T>
    virtual void triad() override;
    virtual T    dot() override;

-    virtual void write_arrays(const std::vector<T>& a, const std::vector<T>& b, const std::vector<T>& c) override;
+    virtual void init_arrays(T initA, T initB, T initC) override;
    virtual void read_arrays(std::vector<T>& a, std::vector<T>& b, std::vector<T>& c) override;

 };
--- a/Stream.h
+++ b/Stream.h
@ -32,7 +32,7 @@ class Stream
    virtual T dot() = 0;

    // Copy memory between host and device
-    virtual void write_arrays(const std::vector<T>& a, const std::vector<T>& b, const std::vector<T>& c) = 0;
+    virtual void init_arrays(T initA, T initB, T initC) = 0;
    virtual void read_arrays(std::vector<T>& a, std::vector<T>& b, std::vector<T>& c) = 0;

 };
--- a/main.cpp
+++ b/main.cpp
@ -84,9 +84,9 @@ void run()
    std::cout << "Precision: double" << std::endl;

  // Create host vectors
-  std::vector<T> a(ARRAY_SIZE, startA);
-  std::vector<T> b(ARRAY_SIZE, startB);
-  std::vector<T> c(ARRAY_SIZE, startC);
+  std::vector<T> a(ARRAY_SIZE);
+  std::vector<T> b(ARRAY_SIZE);
+  std::vector<T> c(ARRAY_SIZE);
  std::streamsize ss = std::cout.precision();
  std::cout << std::setprecision(1) << std::fixed
    << "Array size: " << ARRAY_SIZE*sizeof(T)*1.0E-6 << " MB"
@ -138,7 +138,7 @@ void run()

 #endif

-  stream->write_arrays(a, b, c);
+  stream->init_arrays(startA, startB, startC);

  // List of times
  std::vector<std::vector<double>> timings(5);