diff --git a/OCLStream.cpp b/OCLStream.cpp index a1f3f19..928421f 100644 --- a/OCLStream.cpp +++ b/OCLStream.cpp @@ -54,11 +54,14 @@ std::string kernels{R"CLC( global const TYPE * restrict a, global const TYPE * restrict b, global TYPE * restrict sum, - local TYPE * restrict wg_sum) + local TYPE * restrict wg_sum, + int array_size) { - const size_t i = get_global_id(0); + size_t i = get_global_id(0); const size_t local_i = get_local_id(0); - wg_sum[local_i] = a[i] * b[i]; + wg_sum[local_i] = 0.0; + for (; i < array_size; i += get_global_size(0)) + wg_sum[local_i] += a[i] * b[i]; for (int offset = get_local_size(0) / 2; offset > 0; offset /= 2) { @@ -128,7 +131,7 @@ OCLStream::OCLStream(const unsigned int ARRAY_SIZE, const int device_index) mul_kernel = new cl::KernelFunctor(program, "mul"); add_kernel = new cl::KernelFunctor(program, "add"); triad_kernel = new cl::KernelFunctor(program, "triad"); - dot_kernel = new cl::KernelFunctor(program, "stream_dot"); + dot_kernel = new cl::KernelFunctor(program, "stream_dot"); array_size = ARRAY_SIZE; @@ -144,9 +147,9 @@ OCLStream::OCLStream(const unsigned int ARRAY_SIZE, const int device_index) d_a = cl::Buffer(context, CL_MEM_READ_WRITE, sizeof(T) * ARRAY_SIZE); d_b = cl::Buffer(context, CL_MEM_READ_WRITE, sizeof(T) * ARRAY_SIZE); d_c = cl::Buffer(context, CL_MEM_READ_WRITE, sizeof(T) * ARRAY_SIZE); - d_sum = cl::Buffer(context, CL_MEM_WRITE_ONLY, sizeof(T) * (ARRAY_SIZE/WGSIZE)); + d_sum = cl::Buffer(context, CL_MEM_WRITE_ONLY, sizeof(T) * DOT_NUM_GROUPS); - sums = std::vector(ARRAY_SIZE/WGSIZE); + sums = std::vector(DOT_NUM_GROUPS); } template @@ -202,8 +205,8 @@ template T OCLStream::dot() { (*dot_kernel)( - cl::EnqueueArgs(queue, cl::NDRange(array_size), cl::NDRange(WGSIZE)), - d_a, d_b, d_sum, cl::Local(sizeof(T) * WGSIZE) + cl::EnqueueArgs(queue, cl::NDRange(DOT_NUM_GROUPS*DOT_WGSIZE), cl::NDRange(DOT_WGSIZE)), + d_a, d_b, d_sum, cl::Local(sizeof(T) * DOT_WGSIZE), array_size ); cl::copy(queue, d_sum, sums.begin(), sums.end()); diff --git a/OCLStream.h b/OCLStream.h index 97e3f93..20e5049 100644 --- a/OCLStream.h +++ b/OCLStream.h @@ -21,8 +21,9 @@ #define IMPLEMENTATION_STRING "OpenCL" -// Local work-group size for dot kernel -#define WGSIZE 256 +// NDRange configuration for the dot kernel +#define DOT_WGSIZE 256 +#define DOT_NUM_GROUPS 256 template class OCLStream : public Stream @@ -30,7 +31,7 @@ class OCLStream : public Stream protected: // Size of arrays unsigned int array_size; - + // Host array for partial sums for dot kernel std::vector sums; @@ -49,7 +50,7 @@ class OCLStream : public Stream cl::KernelFunctor * mul_kernel; cl::KernelFunctor *add_kernel; cl::KernelFunctor *triad_kernel; - cl::KernelFunctor *dot_kernel; + cl::KernelFunctor *dot_kernel; public: