Merge remote-tracking branch 'origin/init-arrays' into devel

This commit is contained in:
Tom Deakin 2016-11-04 09:17:54 +00:00
commit d42bcd4675
20 changed files with 124 additions and 77 deletions

View File

@ -36,13 +36,19 @@ ACCStream<T>::~ACCStream()
} }
template <class T> template <class T>
void ACCStream<T>::write_arrays(const std::vector<T>& h_a, const std::vector<T>& h_b, const std::vector<T>& h_c) void ACCStream<T>::init_arrays(T initA, T initB, T initC)
{ {
T *a = this->a; unsigned int array_size = this->array_size;
T *b = this->b; T * restrict a = this->a;
T *c = this->c; T * restrict b = this->b;
#pragma acc update device(a[0:array_size], b[0:array_size], c[0:array_size]) T * restrict c = this->c;
{} #pragma acc kernels present(a[0:array_size], b[0:array_size], c[0:array_size]) wait
for (int i = 0; i < array_size; i++)
{
a[i] = initA;
b[i] = initB;
c[i] = initC;
}
} }
template <class T> template <class T>

View File

@ -37,7 +37,7 @@ class ACCStream : public Stream<T>
virtual void triad() override; virtual void triad() override;
virtual T dot() override; virtual T dot() override;
virtual void write_arrays(const std::vector<T>& a, const std::vector<T>& b, const std::vector<T>& c) override; virtual void init_arrays(T initA, T initB, T initC) override;
virtual void read_arrays(std::vector<T>& a, std::vector<T>& b, std::vector<T>& c) override; virtual void read_arrays(std::vector<T>& a, std::vector<T>& b, std::vector<T>& c) override;

View File

@ -79,15 +79,22 @@ CUDAStream<T>::~CUDAStream()
check_error(); check_error();
} }
template <class T>
void CUDAStream<T>::write_arrays(const std::vector<T>& a, const std::vector<T>& b, const std::vector<T>& c) template <typename T>
__global__ void init_kernel(T * a, T * b, T * c, T initA, T initB, T initC)
{ {
// Copy host memory to device const int i = blockDim.x * blockIdx.x + threadIdx.x;
cudaMemcpy(d_a, a.data(), a.size()*sizeof(T), cudaMemcpyHostToDevice); a[i] = initA;
b[i] = initB;
c[i] = initC;
}
template <class T>
void CUDAStream<T>::init_arrays(T initA, T initB, T initC)
{
init_kernel<<<array_size/TBSIZE, TBSIZE>>>(d_a, d_b, d_c, initA, initB, initC);
check_error(); check_error();
cudaMemcpy(d_b, b.data(), b.size()*sizeof(T), cudaMemcpyHostToDevice); cudaDeviceSynchronize();
check_error();
cudaMemcpy(d_c, c.data(), c.size()*sizeof(T), cudaMemcpyHostToDevice);
check_error(); check_error();
} }

View File

@ -46,7 +46,7 @@ class CUDAStream : public Stream<T>
virtual void triad() override; virtual void triad() override;
virtual T dot() override; virtual T dot() override;
virtual void write_arrays(const std::vector<T>& a, const std::vector<T>& b, const std::vector<T>& c) override; virtual void init_arrays(T initA, T initB, T initC) override;
virtual void read_arrays(std::vector<T>& a, std::vector<T>& b, std::vector<T>& c) override; virtual void read_arrays(std::vector<T>& a, std::vector<T>& b, std::vector<T>& c) override;
}; };

View File

@ -74,15 +74,21 @@ HIPStream<T>::~HIPStream()
check_error(); check_error();
} }
template <class T> template <typename T>
void HIPStream<T>::write_arrays(const std::vector<T>& a, const std::vector<T>& b, const std::vector<T>& c) __global__ void init_kernel(hipLaunchParm lp, T * a, T * b, T * c, T initA, T initB, T initC)
{ {
// Copy host memory to device const int i = hipBlockDim_x * hipBlockIdx_x + hipThreadIdx_x;
hipMemcpy(d_a, a.data(), a.size()*sizeof(T), hipMemcpyHostToDevice); a[i] = initA;
b[i] = initB;
c[i] = initC;
}
template <class T>
void HIPStream<T>::init_arrays(T initA, T initB, T initC)
{
hipLaunchKernel(HIP_KERNEL_NAME(init_kernel), dim3(array_size/TBSIZE), dim3(TBSIZE), 0, 0, d_a, d_b, d_c, initA, initB, initC);
check_error(); check_error();
hipMemcpy(d_b, b.data(), b.size()*sizeof(T), hipMemcpyHostToDevice); hipDeviceSynchronize();
check_error();
hipMemcpy(d_c, c.data(), c.size()*sizeof(T), hipMemcpyHostToDevice);
check_error(); check_error();
} }

View File

@ -37,7 +37,7 @@ class HIPStream : public Stream<T>
virtual void mul() override; virtual void mul() override;
virtual void triad() override; virtual void triad() override;
virtual void write_arrays(const std::vector<T>& a, const std::vector<T>& b, const std::vector<T>& c) override; virtual void init_arrays(T initA, T initB, T initC) override;
virtual void read_arrays(std::vector<T>& a, std::vector<T>& b, std::vector<T>& c) override; virtual void read_arrays(std::vector<T>& a, std::vector<T>& b, std::vector<T>& c) override;
}; };

View File

@ -34,18 +34,18 @@ KOKKOSStream<T>::~KOKKOSStream()
} }
template <class T> template <class T>
void KOKKOSStream<T>::write_arrays( void KOKKOSStream<T>::init_arrays(T initA, T initB, T initC)
const std::vector<T>& a, const std::vector<T>& b, const std::vector<T>& c)
{ {
for(int ii = 0; ii < array_size; ++ii) View<double*, DEVICE> a(*d_a);
View<double*, DEVICE> b(*d_b);
View<double*, DEVICE> c(*d_c);
parallel_for(array_size, KOKKOS_LAMBDA (const int index)
{ {
(*hm_a)(ii) = a[ii]; a[index] = initA;
(*hm_b)(ii) = b[ii]; b[index] - initB;
(*hm_c)(ii) = c[ii]; c[index] = initC;
} });
deep_copy(*d_a, *hm_a); Kokkos::fence();
deep_copy(*d_b, *hm_b);
deep_copy(*d_c, *hm_c);
} }
template <class T> template <class T>

View File

@ -49,8 +49,7 @@ class KOKKOSStream : public Stream<T>
virtual void triad() override; virtual void triad() override;
virtual T dot() override; virtual T dot() override;
virtual void write_arrays( virtual void init_arrays(T initA, T initB, T initC) override;
const std::vector<T>& a, const std::vector<T>& b, const std::vector<T>& c) override;
virtual void read_arrays( virtual void read_arrays(
std::vector<T>& a, std::vector<T>& b, std::vector<T>& c) override; std::vector<T>& a, std::vector<T>& b, std::vector<T>& c) override;
}; };

View File

@ -16,6 +16,18 @@ std::string kernels{R"CLC(
constant TYPE scalar = startScalar; constant TYPE scalar = startScalar;
kernel void init(
global TYPE * restrict a,
global TYPE * restrict b,
global TYPE * restrict c,
TYPE initA, TYPE initB, TYPE initC)
{
const size_t i = get_global_id(0);
a[i] = initA;
b[i] = initB;
c[i] = initC;
}
kernel void copy( kernel void copy(
global const TYPE * restrict a, global const TYPE * restrict a,
global TYPE * restrict c) global TYPE * restrict c)
@ -140,6 +152,7 @@ OCLStream<T>::OCLStream(const unsigned int ARRAY_SIZE, const int device_index)
} }
// Create kernels // Create kernels
init_kernel = new cl::KernelFunctor<cl::Buffer, cl::Buffer, cl::Buffer, T, T, T>(program, "init");
copy_kernel = new cl::KernelFunctor<cl::Buffer, cl::Buffer>(program, "copy"); copy_kernel = new cl::KernelFunctor<cl::Buffer, cl::Buffer>(program, "copy");
mul_kernel = new cl::KernelFunctor<cl::Buffer, cl::Buffer>(program, "mul"); mul_kernel = new cl::KernelFunctor<cl::Buffer, cl::Buffer>(program, "mul");
add_kernel = new cl::KernelFunctor<cl::Buffer, cl::Buffer, cl::Buffer>(program, "add"); add_kernel = new cl::KernelFunctor<cl::Buffer, cl::Buffer, cl::Buffer>(program, "add");
@ -168,6 +181,7 @@ OCLStream<T>::OCLStream(const unsigned int ARRAY_SIZE, const int device_index)
template <class T> template <class T>
OCLStream<T>::~OCLStream() OCLStream<T>::~OCLStream()
{ {
delete init_kernel;
delete copy_kernel; delete copy_kernel;
delete mul_kernel; delete mul_kernel;
delete add_kernel; delete add_kernel;
@ -231,11 +245,13 @@ T OCLStream<T>::dot()
} }
template <class T> template <class T>
void OCLStream<T>::write_arrays(const std::vector<T>& a, const std::vector<T>& b, const std::vector<T>& c) void OCLStream<T>::init_arrays(T initA, T initB, T initC)
{ {
cl::copy(queue, a.begin(), a.end(), d_a); (*init_kernel)(
cl::copy(queue, b.begin(), b.end(), d_b); cl::EnqueueArgs(queue, cl::NDRange(array_size)),
cl::copy(queue, c.begin(), c.end(), d_c); d_a, d_b, d_c, initA, initB, initC
);
queue.finish();
} }
template <class T> template <class T>

View File

@ -42,6 +42,7 @@ class OCLStream : public Stream<T>
cl::Context context; cl::Context context;
cl::CommandQueue queue; cl::CommandQueue queue;
cl::KernelFunctor<cl::Buffer, cl::Buffer, cl::Buffer, T, T, T> *init_kernel;
cl::KernelFunctor<cl::Buffer, cl::Buffer> *copy_kernel; cl::KernelFunctor<cl::Buffer, cl::Buffer> *copy_kernel;
cl::KernelFunctor<cl::Buffer, cl::Buffer> * mul_kernel; cl::KernelFunctor<cl::Buffer, cl::Buffer> * mul_kernel;
cl::KernelFunctor<cl::Buffer, cl::Buffer, cl::Buffer> *add_kernel; cl::KernelFunctor<cl::Buffer, cl::Buffer, cl::Buffer> *add_kernel;
@ -63,7 +64,7 @@ class OCLStream : public Stream<T>
virtual void triad() override; virtual void triad() override;
virtual T dot() override; virtual T dot() override;
virtual void write_arrays(const std::vector<T>& a, const std::vector<T>& b, const std::vector<T>& c) override; virtual void init_arrays(T initA, T initB, T initC) override;
virtual void read_arrays(std::vector<T>& a, std::vector<T>& b, std::vector<T>& c) override; virtual void read_arrays(std::vector<T>& a, std::vector<T>& b, std::vector<T>& c) override;
}; };

View File

@ -26,14 +26,14 @@ OMP3Stream<T>::~OMP3Stream()
template <class T> template <class T>
void OMP3Stream<T>::write_arrays(const std::vector<T>& h_a, const std::vector<T>& h_b, const std::vector<T>& h_c) void OMP3Stream<T>::init_arrays(T initA, T initB, T initC)
{ {
#pragma omp parallel for #pragma omp parallel for
for (int i = 0; i < array_size; i++) for (int i = 0; i < array_size; i++)
{ {
a[i] = h_a[i]; a[i] = initA;
b[i] = h_b[i]; b[i] = initB;
c[i] = h_c[i]; c[i] = initC;
} }
} }

View File

@ -35,7 +35,7 @@ class OMP3Stream : public Stream<T>
virtual void triad() override; virtual void triad() override;
virtual T dot() override; virtual T dot() override;
virtual void write_arrays(const std::vector<T>& a, const std::vector<T>& b, const std::vector<T>& c) override; virtual void init_arrays(T initA, T initB, T initC) override;
virtual void read_arrays(std::vector<T>& a, std::vector<T>& b, std::vector<T>& c) override; virtual void read_arrays(std::vector<T>& a, std::vector<T>& b, std::vector<T>& c) override;
}; };

View File

@ -35,13 +35,19 @@ OMP45Stream<T>::~OMP45Stream()
} }
template <class T> template <class T>
void OMP45Stream<T>::write_arrays(const std::vector<T>& h_a, const std::vector<T>& h_b, const std::vector<T>& h_c) void OMP45Stream<T>::init_arrays(T initA, T initB, T initC)
{ {
unsigned int array_size = this->array_size;
T *a = this->a; T *a = this->a;
T *b = this->b; T *b = this->b;
T *c = this->c; T *c = this->c;
#pragma omp target update to(a[0:array_size], b[0:array_size], c[0:array_size]) #pragma omp target teams distribute parallel for simd map(to: a[0:array_size], b[0:array_size], c[0:array_size])
{} for (int i = 0; i < array_size; i++)
{
a[i] = initA;
b[i] = initB;
c[i] = initC;
}
} }
template <class T> template <class T>

View File

@ -38,7 +38,7 @@ class OMP45Stream : public Stream<T>
virtual void triad() override; virtual void triad() override;
virtual T dot() override; virtual T dot() override;
virtual void write_arrays(const std::vector<T>& a, const std::vector<T>& b, const std::vector<T>& c) override; virtual void init_arrays(T initA, T initB, T initC) override;
virtual void read_arrays(std::vector<T>& a, std::vector<T>& b, std::vector<T>& c) override; virtual void read_arrays(std::vector<T>& a, std::vector<T>& b, std::vector<T>& c) override;

View File

@ -21,12 +21,6 @@ RAJAStream<T>::RAJAStream(const unsigned int ARRAY_SIZE, const int device_index)
d_a = new T[ARRAY_SIZE]; d_a = new T[ARRAY_SIZE];
d_b = new T[ARRAY_SIZE]; d_b = new T[ARRAY_SIZE];
d_c = new T[ARRAY_SIZE]; d_c = new T[ARRAY_SIZE];
forall<policy>(index_set, [=] RAJA_DEVICE (int index)
{
d_a[index] = 0.0;
d_b[index] = 0.0;
d_c[index] = 0.0;
});
#else #else
cudaMallocManaged((void**)&d_a, sizeof(T)*ARRAY_SIZE, cudaMemAttachGlobal); cudaMallocManaged((void**)&d_a, sizeof(T)*ARRAY_SIZE, cudaMemAttachGlobal);
cudaMallocManaged((void**)&d_b, sizeof(T)*ARRAY_SIZE, cudaMemAttachGlobal); cudaMallocManaged((void**)&d_b, sizeof(T)*ARRAY_SIZE, cudaMemAttachGlobal);
@ -50,12 +44,17 @@ RAJAStream<T>::~RAJAStream()
} }
template <class T> template <class T>
void RAJAStream<T>::write_arrays( void RAJAStream<T>::init_arrays(T initA, T initB, T initC)
const std::vector<T>& a, const std::vector<T>& b, const std::vector<T>& c)
{ {
std::copy(a.begin(), a.end(), d_a); T* a = d_a;
std::copy(b.begin(), b.end(), d_b); T* b = d_b;
std::copy(c.begin(), c.end(), d_c); T* c = d_c;
forall<policy>(index_set, [=] RAJA_DEVICE (int index)
{
a[index] = initA;
b[index] = initB;
c[index] = initC;
});
} }
template <class T> template <class T>

View File

@ -53,8 +53,7 @@ class RAJAStream : public Stream<T>
virtual void triad() override; virtual void triad() override;
virtual T dot() override; virtual T dot() override;
virtual void write_arrays( virtual void init_arrays(T initA, T initB, T initC) override;
const std::vector<T>& a, const std::vector<T>& b, const std::vector<T>& c) override;
virtual void read_arrays( virtual void read_arrays(
std::vector<T>& a, std::vector<T>& b, std::vector<T>& c) override; std::vector<T>& a, std::vector<T>& b, std::vector<T>& c) override;
}; };

View File

@ -20,6 +20,7 @@ program * p;
/* Forward declaration of SYCL kernels */ /* Forward declaration of SYCL kernels */
namespace kernels { namespace kernels {
class init;
class copy; class copy;
class mul; class mul;
class add; class add;
@ -60,6 +61,7 @@ SYCLStream<T>::SYCLStream(const unsigned int ARRAY_SIZE, const int device_index)
/* Pre-build the kernels */ /* Pre-build the kernels */
p = new program(queue->get_context()); p = new program(queue->get_context());
p->build_from_kernel_name<kernels::init>();
p->build_from_kernel_name<kernels::copy>(); p->build_from_kernel_name<kernels::copy>();
p->build_from_kernel_name<kernels::mul>(); p->build_from_kernel_name<kernels::mul>();
p->build_from_kernel_name<kernels::add>(); p->build_from_kernel_name<kernels::add>();
@ -201,17 +203,23 @@ T SYCLStream<T>::dot()
} }
template <class T> template <class T>
void SYCLStream<T>::write_arrays(const std::vector<T>& a, const std::vector<T>& b, const std::vector<T>& c) void SYCLStream<T>::init_arrays(T initA, T initB, T initC)
{ {
auto _a = d_a->template get_access<access::mode::write, access::target::host_buffer>(); queue->submit([&](handler &cgh)
auto _b = d_b->template get_access<access::mode::write, access::target::host_buffer>();
auto _c = d_c->template get_access<access::mode::write, access::target::host_buffer>();
for (int i = 0; i < array_size; i++)
{ {
_a[i] = a[i]; auto ka = d_a->template get_access<access::mode::write>(cgh);
_b[i] = b[i]; auto kb = d_b->template get_access<access::mode::write>(cgh);
_c[i] = c[i]; auto kc = d_c->template get_access<access::mode::write>(cgh);
} cgh.parallel_for<kernels::init>(p->get_kernel<kernels::init>(),
range<1>{array_size}, [=](item<1> item)
{
auto id = item.get();
ka[id[0]] = initA;
kb[id[0]] = initB;
kc[id[0]] = initC;
});
});
queue->wait();
} }
template <class T> template <class T>

View File

@ -44,7 +44,7 @@ class SYCLStream : public Stream<T>
virtual void triad() override; virtual void triad() override;
virtual T dot() override; virtual T dot() override;
virtual void write_arrays(const std::vector<T>& a, const std::vector<T>& b, const std::vector<T>& c) override; virtual void init_arrays(T initA, T initB, T initC) override;
virtual void read_arrays(std::vector<T>& a, std::vector<T>& b, std::vector<T>& c) override; virtual void read_arrays(std::vector<T>& a, std::vector<T>& b, std::vector<T>& c) override;
}; };

View File

@ -32,7 +32,7 @@ class Stream
virtual T dot() = 0; virtual T dot() = 0;
// Copy memory between host and device // Copy memory between host and device
virtual void write_arrays(const std::vector<T>& a, const std::vector<T>& b, const std::vector<T>& c) = 0; virtual void init_arrays(T initA, T initB, T initC) = 0;
virtual void read_arrays(std::vector<T>& a, std::vector<T>& b, std::vector<T>& c) = 0; virtual void read_arrays(std::vector<T>& a, std::vector<T>& b, std::vector<T>& c) = 0;
}; };

View File

@ -84,9 +84,9 @@ void run()
std::cout << "Precision: double" << std::endl; std::cout << "Precision: double" << std::endl;
// Create host vectors // Create host vectors
std::vector<T> a(ARRAY_SIZE, startA); std::vector<T> a(ARRAY_SIZE);
std::vector<T> b(ARRAY_SIZE, startB); std::vector<T> b(ARRAY_SIZE);
std::vector<T> c(ARRAY_SIZE, startC); std::vector<T> c(ARRAY_SIZE);
std::streamsize ss = std::cout.precision(); std::streamsize ss = std::cout.precision();
std::cout << std::setprecision(1) << std::fixed std::cout << std::setprecision(1) << std::fixed
<< "Array size: " << ARRAY_SIZE*sizeof(T)*1.0E-6 << " MB" << "Array size: " << ARRAY_SIZE*sizeof(T)*1.0E-6 << " MB"
@ -138,7 +138,7 @@ void run()
#endif #endif
stream->write_arrays(a, b, c); stream->init_arrays(startA, startB, startC);
// List of times // List of times
std::vector<std::vector<double>> timings(5); std::vector<std::vector<double>> timings(5);