From 2085cacea0349bd51f0256f9be728345d9adc5a7 Mon Sep 17 00:00:00 2001 From: Tom Deakin Date: Fri, 14 Oct 2016 17:07:55 +0100 Subject: [PATCH] Add an OpenCL dot kernel We have to name the kernel stream_dot (for example) because the "dot" kernel already exists. --- OCLStream.cpp | 41 +++++++++++++++++++++++++++++++++++++++++ OCLStream.h | 9 +++++++++ 2 files changed, 50 insertions(+) diff --git a/OCLStream.cpp b/OCLStream.cpp index 0ed4b8e..cef5fa6 100644 --- a/OCLStream.cpp +++ b/OCLStream.cpp @@ -50,6 +50,29 @@ std::string kernels{R"CLC( a[i] = b[i] + scalar * c[i]; } + kernel void stream_dot( + global const TYPE * restrict a, + global const TYPE * restrict b, + global TYPE * restrict sum, + local TYPE * restrict wg_sum) + { + const size_t i = get_global_id(0); + const size_t local_i = get_local_id(0); + wg_sum[local_i] = a[i] * b[i]; + + for (int offset = get_local_size(0) / 2; offset > 0; offset /= 2) + { + barrier(CLK_LOCAL_MEM_FENCE); + if (local_i < offset) + { + wg_sum[local_i] += wg_sum[local_i+offset]; + } + } + + if (local_i == 0) + sum[get_group_id(0)] = wg_sum[local_i]; + } + )CLC"}; @@ -99,6 +122,7 @@ OCLStream::OCLStream(const unsigned int ARRAY_SIZE, const int device_index) mul_kernel = new cl::KernelFunctor(program, "mul"); add_kernel = new cl::KernelFunctor(program, "add"); triad_kernel = new cl::KernelFunctor(program, "triad"); + dot_kernel = new cl::KernelFunctor(program, "stream_dot"); array_size = ARRAY_SIZE; @@ -114,6 +138,7 @@ OCLStream::OCLStream(const unsigned int ARRAY_SIZE, const int device_index) d_a = cl::Buffer(context, CL_MEM_READ_WRITE, sizeof(T) * ARRAY_SIZE); d_b = cl::Buffer(context, CL_MEM_READ_WRITE, sizeof(T) * ARRAY_SIZE); d_c = cl::Buffer(context, CL_MEM_READ_WRITE, sizeof(T) * ARRAY_SIZE); + d_sum = cl::Buffer(context, CL_MEM_WRITE_ONLY, sizeof(T) * WGSIZE); } @@ -166,6 +191,22 @@ void OCLStream::triad() queue.finish(); } +template +T OCLStream::dot() +{ + (*dot_kernel)( + cl::EnqueueArgs(queue, cl::NDRange(array_size), cl::NDRange(WGSIZE)), + d_a, d_b, d_sum, cl::Local(sizeof(T) * WGSIZE) + ); + cl::copy(queue, d_sum, sums.begin(), sums.end()); + + T sum = 0.0; + for (T val : sums) + sum += val; + + return sum; +} + template void OCLStream::write_arrays(const std::vector& a, const std::vector& b, const std::vector& c) { diff --git a/OCLStream.h b/OCLStream.h index cb48da5..2f8193a 100644 --- a/OCLStream.h +++ b/OCLStream.h @@ -20,17 +20,24 @@ #define IMPLEMENTATION_STRING "OpenCL" +// Local work-group size for dot kernel +#define WGSIZE 1024 + template class OCLStream : public Stream { protected: // Size of arrays unsigned int array_size; + + // Host array for partial sums for dot kernel + std::vector sums; // Device side pointers to arrays cl::Buffer d_a; cl::Buffer d_b; cl::Buffer d_c; + cl::Buffer d_sum; // OpenCL objects cl::Device device; @@ -41,6 +48,7 @@ class OCLStream : public Stream cl::KernelFunctor * mul_kernel; cl::KernelFunctor *add_kernel; cl::KernelFunctor *triad_kernel; + cl::KernelFunctor *dot_kernel; public: @@ -51,6 +59,7 @@ class OCLStream : public Stream virtual void add() override; virtual void mul() override; virtual void triad() override; + virtual T dot() override; virtual void write_arrays(const std::vector& a, const std::vector& b, const std::vector& c) override; virtual void read_arrays(std::vector& a, std::vector& b, std::vector& c) override;