Merge remote-tracking branch 'origin/init-arrays' into devel
This commit is contained in:
commit
d42bcd4675
@ -36,13 +36,19 @@ ACCStream<T>::~ACCStream()
|
||||
}
|
||||
|
||||
template <class T>
|
||||
void ACCStream<T>::write_arrays(const std::vector<T>& h_a, const std::vector<T>& h_b, const std::vector<T>& h_c)
|
||||
void ACCStream<T>::init_arrays(T initA, T initB, T initC)
|
||||
{
|
||||
T *a = this->a;
|
||||
T *b = this->b;
|
||||
T *c = this->c;
|
||||
#pragma acc update device(a[0:array_size], b[0:array_size], c[0:array_size])
|
||||
{}
|
||||
unsigned int array_size = this->array_size;
|
||||
T * restrict a = this->a;
|
||||
T * restrict b = this->b;
|
||||
T * restrict c = this->c;
|
||||
#pragma acc kernels present(a[0:array_size], b[0:array_size], c[0:array_size]) wait
|
||||
for (int i = 0; i < array_size; i++)
|
||||
{
|
||||
a[i] = initA;
|
||||
b[i] = initB;
|
||||
c[i] = initC;
|
||||
}
|
||||
}
|
||||
|
||||
template <class T>
|
||||
|
||||
@ -37,7 +37,7 @@ class ACCStream : public Stream<T>
|
||||
virtual void triad() override;
|
||||
virtual T dot() override;
|
||||
|
||||
virtual void write_arrays(const std::vector<T>& a, const std::vector<T>& b, const std::vector<T>& c) override;
|
||||
virtual void init_arrays(T initA, T initB, T initC) override;
|
||||
virtual void read_arrays(std::vector<T>& a, std::vector<T>& b, std::vector<T>& c) override;
|
||||
|
||||
|
||||
|
||||
@ -79,15 +79,22 @@ CUDAStream<T>::~CUDAStream()
|
||||
check_error();
|
||||
}
|
||||
|
||||
template <class T>
|
||||
void CUDAStream<T>::write_arrays(const std::vector<T>& a, const std::vector<T>& b, const std::vector<T>& c)
|
||||
|
||||
template <typename T>
|
||||
__global__ void init_kernel(T * a, T * b, T * c, T initA, T initB, T initC)
|
||||
{
|
||||
// Copy host memory to device
|
||||
cudaMemcpy(d_a, a.data(), a.size()*sizeof(T), cudaMemcpyHostToDevice);
|
||||
const int i = blockDim.x * blockIdx.x + threadIdx.x;
|
||||
a[i] = initA;
|
||||
b[i] = initB;
|
||||
c[i] = initC;
|
||||
}
|
||||
|
||||
template <class T>
|
||||
void CUDAStream<T>::init_arrays(T initA, T initB, T initC)
|
||||
{
|
||||
init_kernel<<<array_size/TBSIZE, TBSIZE>>>(d_a, d_b, d_c, initA, initB, initC);
|
||||
check_error();
|
||||
cudaMemcpy(d_b, b.data(), b.size()*sizeof(T), cudaMemcpyHostToDevice);
|
||||
check_error();
|
||||
cudaMemcpy(d_c, c.data(), c.size()*sizeof(T), cudaMemcpyHostToDevice);
|
||||
cudaDeviceSynchronize();
|
||||
check_error();
|
||||
}
|
||||
|
||||
|
||||
@ -46,7 +46,7 @@ class CUDAStream : public Stream<T>
|
||||
virtual void triad() override;
|
||||
virtual T dot() override;
|
||||
|
||||
virtual void write_arrays(const std::vector<T>& a, const std::vector<T>& b, const std::vector<T>& c) override;
|
||||
virtual void init_arrays(T initA, T initB, T initC) override;
|
||||
virtual void read_arrays(std::vector<T>& a, std::vector<T>& b, std::vector<T>& c) override;
|
||||
|
||||
};
|
||||
|
||||
20
HIPStream.cu
20
HIPStream.cu
@ -74,15 +74,21 @@ HIPStream<T>::~HIPStream()
|
||||
check_error();
|
||||
}
|
||||
|
||||
template <class T>
|
||||
void HIPStream<T>::write_arrays(const std::vector<T>& a, const std::vector<T>& b, const std::vector<T>& c)
|
||||
template <typename T>
|
||||
__global__ void init_kernel(hipLaunchParm lp, T * a, T * b, T * c, T initA, T initB, T initC)
|
||||
{
|
||||
// Copy host memory to device
|
||||
hipMemcpy(d_a, a.data(), a.size()*sizeof(T), hipMemcpyHostToDevice);
|
||||
const int i = hipBlockDim_x * hipBlockIdx_x + hipThreadIdx_x;
|
||||
a[i] = initA;
|
||||
b[i] = initB;
|
||||
c[i] = initC;
|
||||
}
|
||||
|
||||
template <class T>
|
||||
void HIPStream<T>::init_arrays(T initA, T initB, T initC)
|
||||
{
|
||||
hipLaunchKernel(HIP_KERNEL_NAME(init_kernel), dim3(array_size/TBSIZE), dim3(TBSIZE), 0, 0, d_a, d_b, d_c, initA, initB, initC);
|
||||
check_error();
|
||||
hipMemcpy(d_b, b.data(), b.size()*sizeof(T), hipMemcpyHostToDevice);
|
||||
check_error();
|
||||
hipMemcpy(d_c, c.data(), c.size()*sizeof(T), hipMemcpyHostToDevice);
|
||||
hipDeviceSynchronize();
|
||||
check_error();
|
||||
}
|
||||
|
||||
|
||||
@ -37,7 +37,7 @@ class HIPStream : public Stream<T>
|
||||
virtual void mul() override;
|
||||
virtual void triad() override;
|
||||
|
||||
virtual void write_arrays(const std::vector<T>& a, const std::vector<T>& b, const std::vector<T>& c) override;
|
||||
virtual void init_arrays(T initA, T initB, T initC) override;
|
||||
virtual void read_arrays(std::vector<T>& a, std::vector<T>& b, std::vector<T>& c) override;
|
||||
|
||||
};
|
||||
|
||||
@ -34,18 +34,18 @@ KOKKOSStream<T>::~KOKKOSStream()
|
||||
}
|
||||
|
||||
template <class T>
|
||||
void KOKKOSStream<T>::write_arrays(
|
||||
const std::vector<T>& a, const std::vector<T>& b, const std::vector<T>& c)
|
||||
void KOKKOSStream<T>::init_arrays(T initA, T initB, T initC)
|
||||
{
|
||||
for(int ii = 0; ii < array_size; ++ii)
|
||||
View<double*, DEVICE> a(*d_a);
|
||||
View<double*, DEVICE> b(*d_b);
|
||||
View<double*, DEVICE> c(*d_c);
|
||||
parallel_for(array_size, KOKKOS_LAMBDA (const int index)
|
||||
{
|
||||
(*hm_a)(ii) = a[ii];
|
||||
(*hm_b)(ii) = b[ii];
|
||||
(*hm_c)(ii) = c[ii];
|
||||
}
|
||||
deep_copy(*d_a, *hm_a);
|
||||
deep_copy(*d_b, *hm_b);
|
||||
deep_copy(*d_c, *hm_c);
|
||||
a[index] = initA;
|
||||
b[index] - initB;
|
||||
c[index] = initC;
|
||||
});
|
||||
Kokkos::fence();
|
||||
}
|
||||
|
||||
template <class T>
|
||||
|
||||
@ -49,8 +49,7 @@ class KOKKOSStream : public Stream<T>
|
||||
virtual void triad() override;
|
||||
virtual T dot() override;
|
||||
|
||||
virtual void write_arrays(
|
||||
const std::vector<T>& a, const std::vector<T>& b, const std::vector<T>& c) override;
|
||||
virtual void init_arrays(T initA, T initB, T initC) override;
|
||||
virtual void read_arrays(
|
||||
std::vector<T>& a, std::vector<T>& b, std::vector<T>& c) override;
|
||||
};
|
||||
|
||||
@ -16,6 +16,18 @@ std::string kernels{R"CLC(
|
||||
|
||||
constant TYPE scalar = startScalar;
|
||||
|
||||
kernel void init(
|
||||
global TYPE * restrict a,
|
||||
global TYPE * restrict b,
|
||||
global TYPE * restrict c,
|
||||
TYPE initA, TYPE initB, TYPE initC)
|
||||
{
|
||||
const size_t i = get_global_id(0);
|
||||
a[i] = initA;
|
||||
b[i] = initB;
|
||||
c[i] = initC;
|
||||
}
|
||||
|
||||
kernel void copy(
|
||||
global const TYPE * restrict a,
|
||||
global TYPE * restrict c)
|
||||
@ -140,6 +152,7 @@ OCLStream<T>::OCLStream(const unsigned int ARRAY_SIZE, const int device_index)
|
||||
}
|
||||
|
||||
// Create kernels
|
||||
init_kernel = new cl::KernelFunctor<cl::Buffer, cl::Buffer, cl::Buffer, T, T, T>(program, "init");
|
||||
copy_kernel = new cl::KernelFunctor<cl::Buffer, cl::Buffer>(program, "copy");
|
||||
mul_kernel = new cl::KernelFunctor<cl::Buffer, cl::Buffer>(program, "mul");
|
||||
add_kernel = new cl::KernelFunctor<cl::Buffer, cl::Buffer, cl::Buffer>(program, "add");
|
||||
@ -168,6 +181,7 @@ OCLStream<T>::OCLStream(const unsigned int ARRAY_SIZE, const int device_index)
|
||||
template <class T>
|
||||
OCLStream<T>::~OCLStream()
|
||||
{
|
||||
delete init_kernel;
|
||||
delete copy_kernel;
|
||||
delete mul_kernel;
|
||||
delete add_kernel;
|
||||
@ -231,11 +245,13 @@ T OCLStream<T>::dot()
|
||||
}
|
||||
|
||||
template <class T>
|
||||
void OCLStream<T>::write_arrays(const std::vector<T>& a, const std::vector<T>& b, const std::vector<T>& c)
|
||||
void OCLStream<T>::init_arrays(T initA, T initB, T initC)
|
||||
{
|
||||
cl::copy(queue, a.begin(), a.end(), d_a);
|
||||
cl::copy(queue, b.begin(), b.end(), d_b);
|
||||
cl::copy(queue, c.begin(), c.end(), d_c);
|
||||
(*init_kernel)(
|
||||
cl::EnqueueArgs(queue, cl::NDRange(array_size)),
|
||||
d_a, d_b, d_c, initA, initB, initC
|
||||
);
|
||||
queue.finish();
|
||||
}
|
||||
|
||||
template <class T>
|
||||
|
||||
@ -42,6 +42,7 @@ class OCLStream : public Stream<T>
|
||||
cl::Context context;
|
||||
cl::CommandQueue queue;
|
||||
|
||||
cl::KernelFunctor<cl::Buffer, cl::Buffer, cl::Buffer, T, T, T> *init_kernel;
|
||||
cl::KernelFunctor<cl::Buffer, cl::Buffer> *copy_kernel;
|
||||
cl::KernelFunctor<cl::Buffer, cl::Buffer> * mul_kernel;
|
||||
cl::KernelFunctor<cl::Buffer, cl::Buffer, cl::Buffer> *add_kernel;
|
||||
@ -63,7 +64,7 @@ class OCLStream : public Stream<T>
|
||||
virtual void triad() override;
|
||||
virtual T dot() override;
|
||||
|
||||
virtual void write_arrays(const std::vector<T>& a, const std::vector<T>& b, const std::vector<T>& c) override;
|
||||
virtual void init_arrays(T initA, T initB, T initC) override;
|
||||
virtual void read_arrays(std::vector<T>& a, std::vector<T>& b, std::vector<T>& c) override;
|
||||
|
||||
};
|
||||
|
||||
@ -26,14 +26,14 @@ OMP3Stream<T>::~OMP3Stream()
|
||||
|
||||
|
||||
template <class T>
|
||||
void OMP3Stream<T>::write_arrays(const std::vector<T>& h_a, const std::vector<T>& h_b, const std::vector<T>& h_c)
|
||||
void OMP3Stream<T>::init_arrays(T initA, T initB, T initC)
|
||||
{
|
||||
#pragma omp parallel for
|
||||
for (int i = 0; i < array_size; i++)
|
||||
{
|
||||
a[i] = h_a[i];
|
||||
b[i] = h_b[i];
|
||||
c[i] = h_c[i];
|
||||
a[i] = initA;
|
||||
b[i] = initB;
|
||||
c[i] = initC;
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@ -35,7 +35,7 @@ class OMP3Stream : public Stream<T>
|
||||
virtual void triad() override;
|
||||
virtual T dot() override;
|
||||
|
||||
virtual void write_arrays(const std::vector<T>& a, const std::vector<T>& b, const std::vector<T>& c) override;
|
||||
virtual void init_arrays(T initA, T initB, T initC) override;
|
||||
virtual void read_arrays(std::vector<T>& a, std::vector<T>& b, std::vector<T>& c) override;
|
||||
|
||||
};
|
||||
|
||||
@ -35,13 +35,19 @@ OMP45Stream<T>::~OMP45Stream()
|
||||
}
|
||||
|
||||
template <class T>
|
||||
void OMP45Stream<T>::write_arrays(const std::vector<T>& h_a, const std::vector<T>& h_b, const std::vector<T>& h_c)
|
||||
void OMP45Stream<T>::init_arrays(T initA, T initB, T initC)
|
||||
{
|
||||
unsigned int array_size = this->array_size;
|
||||
T *a = this->a;
|
||||
T *b = this->b;
|
||||
T *c = this->c;
|
||||
#pragma omp target update to(a[0:array_size], b[0:array_size], c[0:array_size])
|
||||
{}
|
||||
#pragma omp target teams distribute parallel for simd map(to: a[0:array_size], b[0:array_size], c[0:array_size])
|
||||
for (int i = 0; i < array_size; i++)
|
||||
{
|
||||
a[i] = initA;
|
||||
b[i] = initB;
|
||||
c[i] = initC;
|
||||
}
|
||||
}
|
||||
|
||||
template <class T>
|
||||
|
||||
@ -38,7 +38,7 @@ class OMP45Stream : public Stream<T>
|
||||
virtual void triad() override;
|
||||
virtual T dot() override;
|
||||
|
||||
virtual void write_arrays(const std::vector<T>& a, const std::vector<T>& b, const std::vector<T>& c) override;
|
||||
virtual void init_arrays(T initA, T initB, T initC) override;
|
||||
virtual void read_arrays(std::vector<T>& a, std::vector<T>& b, std::vector<T>& c) override;
|
||||
|
||||
|
||||
|
||||
@ -21,12 +21,6 @@ RAJAStream<T>::RAJAStream(const unsigned int ARRAY_SIZE, const int device_index)
|
||||
d_a = new T[ARRAY_SIZE];
|
||||
d_b = new T[ARRAY_SIZE];
|
||||
d_c = new T[ARRAY_SIZE];
|
||||
forall<policy>(index_set, [=] RAJA_DEVICE (int index)
|
||||
{
|
||||
d_a[index] = 0.0;
|
||||
d_b[index] = 0.0;
|
||||
d_c[index] = 0.0;
|
||||
});
|
||||
#else
|
||||
cudaMallocManaged((void**)&d_a, sizeof(T)*ARRAY_SIZE, cudaMemAttachGlobal);
|
||||
cudaMallocManaged((void**)&d_b, sizeof(T)*ARRAY_SIZE, cudaMemAttachGlobal);
|
||||
@ -50,12 +44,17 @@ RAJAStream<T>::~RAJAStream()
|
||||
}
|
||||
|
||||
template <class T>
|
||||
void RAJAStream<T>::write_arrays(
|
||||
const std::vector<T>& a, const std::vector<T>& b, const std::vector<T>& c)
|
||||
void RAJAStream<T>::init_arrays(T initA, T initB, T initC)
|
||||
{
|
||||
std::copy(a.begin(), a.end(), d_a);
|
||||
std::copy(b.begin(), b.end(), d_b);
|
||||
std::copy(c.begin(), c.end(), d_c);
|
||||
T* a = d_a;
|
||||
T* b = d_b;
|
||||
T* c = d_c;
|
||||
forall<policy>(index_set, [=] RAJA_DEVICE (int index)
|
||||
{
|
||||
a[index] = initA;
|
||||
b[index] = initB;
|
||||
c[index] = initC;
|
||||
});
|
||||
}
|
||||
|
||||
template <class T>
|
||||
|
||||
@ -53,8 +53,7 @@ class RAJAStream : public Stream<T>
|
||||
virtual void triad() override;
|
||||
virtual T dot() override;
|
||||
|
||||
virtual void write_arrays(
|
||||
const std::vector<T>& a, const std::vector<T>& b, const std::vector<T>& c) override;
|
||||
virtual void init_arrays(T initA, T initB, T initC) override;
|
||||
virtual void read_arrays(
|
||||
std::vector<T>& a, std::vector<T>& b, std::vector<T>& c) override;
|
||||
};
|
||||
|
||||
@ -20,6 +20,7 @@ program * p;
|
||||
|
||||
/* Forward declaration of SYCL kernels */
|
||||
namespace kernels {
|
||||
class init;
|
||||
class copy;
|
||||
class mul;
|
||||
class add;
|
||||
@ -60,6 +61,7 @@ SYCLStream<T>::SYCLStream(const unsigned int ARRAY_SIZE, const int device_index)
|
||||
|
||||
/* Pre-build the kernels */
|
||||
p = new program(queue->get_context());
|
||||
p->build_from_kernel_name<kernels::init>();
|
||||
p->build_from_kernel_name<kernels::copy>();
|
||||
p->build_from_kernel_name<kernels::mul>();
|
||||
p->build_from_kernel_name<kernels::add>();
|
||||
@ -201,17 +203,23 @@ T SYCLStream<T>::dot()
|
||||
}
|
||||
|
||||
template <class T>
|
||||
void SYCLStream<T>::write_arrays(const std::vector<T>& a, const std::vector<T>& b, const std::vector<T>& c)
|
||||
void SYCLStream<T>::init_arrays(T initA, T initB, T initC)
|
||||
{
|
||||
auto _a = d_a->template get_access<access::mode::write, access::target::host_buffer>();
|
||||
auto _b = d_b->template get_access<access::mode::write, access::target::host_buffer>();
|
||||
auto _c = d_c->template get_access<access::mode::write, access::target::host_buffer>();
|
||||
for (int i = 0; i < array_size; i++)
|
||||
queue->submit([&](handler &cgh)
|
||||
{
|
||||
_a[i] = a[i];
|
||||
_b[i] = b[i];
|
||||
_c[i] = c[i];
|
||||
}
|
||||
auto ka = d_a->template get_access<access::mode::write>(cgh);
|
||||
auto kb = d_b->template get_access<access::mode::write>(cgh);
|
||||
auto kc = d_c->template get_access<access::mode::write>(cgh);
|
||||
cgh.parallel_for<kernels::init>(p->get_kernel<kernels::init>(),
|
||||
range<1>{array_size}, [=](item<1> item)
|
||||
{
|
||||
auto id = item.get();
|
||||
ka[id[0]] = initA;
|
||||
kb[id[0]] = initB;
|
||||
kc[id[0]] = initC;
|
||||
});
|
||||
});
|
||||
queue->wait();
|
||||
}
|
||||
|
||||
template <class T>
|
||||
|
||||
@ -44,7 +44,7 @@ class SYCLStream : public Stream<T>
|
||||
virtual void triad() override;
|
||||
virtual T dot() override;
|
||||
|
||||
virtual void write_arrays(const std::vector<T>& a, const std::vector<T>& b, const std::vector<T>& c) override;
|
||||
virtual void init_arrays(T initA, T initB, T initC) override;
|
||||
virtual void read_arrays(std::vector<T>& a, std::vector<T>& b, std::vector<T>& c) override;
|
||||
|
||||
};
|
||||
|
||||
2
Stream.h
2
Stream.h
@ -32,7 +32,7 @@ class Stream
|
||||
virtual T dot() = 0;
|
||||
|
||||
// Copy memory between host and device
|
||||
virtual void write_arrays(const std::vector<T>& a, const std::vector<T>& b, const std::vector<T>& c) = 0;
|
||||
virtual void init_arrays(T initA, T initB, T initC) = 0;
|
||||
virtual void read_arrays(std::vector<T>& a, std::vector<T>& b, std::vector<T>& c) = 0;
|
||||
|
||||
};
|
||||
|
||||
8
main.cpp
8
main.cpp
@ -84,9 +84,9 @@ void run()
|
||||
std::cout << "Precision: double" << std::endl;
|
||||
|
||||
// Create host vectors
|
||||
std::vector<T> a(ARRAY_SIZE, startA);
|
||||
std::vector<T> b(ARRAY_SIZE, startB);
|
||||
std::vector<T> c(ARRAY_SIZE, startC);
|
||||
std::vector<T> a(ARRAY_SIZE);
|
||||
std::vector<T> b(ARRAY_SIZE);
|
||||
std::vector<T> c(ARRAY_SIZE);
|
||||
std::streamsize ss = std::cout.precision();
|
||||
std::cout << std::setprecision(1) << std::fixed
|
||||
<< "Array size: " << ARRAY_SIZE*sizeof(T)*1.0E-6 << " MB"
|
||||
@ -138,7 +138,7 @@ void run()
|
||||
|
||||
#endif
|
||||
|
||||
stream->write_arrays(a, b, c);
|
||||
stream->init_arrays(startA, startB, startC);
|
||||
|
||||
// List of times
|
||||
std::vector<std::vector<double>> timings(5);
|
||||
|
||||
Loading…
Reference in New Issue
Block a user