From d3b497a9ca359e68c68cc461f16021fd103c2799 Mon Sep 17 00:00:00 2001
From: Tom Deakin
Date: Fri, 14 Oct 2016 17:51:40 +0100
Subject: [PATCH] Add a CUDA dot kernel

---
 CUDAStream.cu | 49 +++++++++++++++++++++++++++++++++++++++++++++++--
 CUDAStream.h  |  8 ++++++++
 2 files changed, 55 insertions(+), 2 deletions(-)

diff --git a/CUDAStream.cu b/CUDAStream.cu
index 802bb05..515540f 100644
--- a/CUDAStream.cu
+++ b/CUDAStream.cu
@@ -8,8 +8,6 @@
 
 #include "CUDAStream.h"
 
-#define TBSIZE 1024
-
 void check_error(void)
 {
   cudaError_t err = cudaGetLastError();
@@ -47,6 +45,9 @@ CUDAStream<T>::CUDAStream(const unsigned int ARRAY_SIZE, const int device_index)
 
   array_size = ARRAY_SIZE;
 
+  // Allocate the host array for partial sums for dot kernels
+  sums = (T*)malloc(sizeof(T) * (ARRAY_SIZE/TBSIZE));
+
   // Check buffers fit on the device
   cudaDeviceProp props;
   cudaGetDeviceProperties(&props, 0);
@@ -60,12 +61,16 @@ CUDAStream<T>::CUDAStream(const unsigned int ARRAY_SIZE, const int device_index)
   check_error();
   cudaMalloc(&d_c, ARRAY_SIZE*sizeof(T));
   check_error();
+  cudaMalloc(&d_sum, (ARRAY_SIZE/TBSIZE)*sizeof(T));
+  check_error();
 }
 
 template <class T>
 CUDAStream<T>::~CUDAStream()
 {
+  free(sums);
+
   cudaFree(d_a);
   check_error();
   cudaFree(d_b);
   check_error();
@@ -165,6 +170,46 @@ void CUDAStream<T>::triad()
   check_error();
 }
 
+template <typename T>
+__global__ void dot_kernel(const T * a, const T * b, T * sum)
+{
+
+  extern __shared__ __align__(sizeof(T)) unsigned char smem[];
+  T *tb_sum = reinterpret_cast<T*>(smem);
+
+  const int i = blockDim.x * blockIdx.x + threadIdx.x;
+  const size_t local_i = threadIdx.x;
+
+  tb_sum[local_i] = a[i] * b[i];
+
+  for (int offset = blockDim.x / 2; offset > 0; offset /= 2)
+  {
+    __syncthreads();
+    if (local_i < offset)
+    {
+      tb_sum[local_i] += tb_sum[local_i+offset];
+    }
+  }
+
+  if (local_i == 0)
+    sum[blockIdx.x] = tb_sum[local_i];
+}
+
+template <class T>
+T CUDAStream<T>::dot()
+{
+  dot_kernel<<<array_size/TBSIZE, TBSIZE, sizeof(T)*TBSIZE>>>(d_a, d_b, d_sum);
+  check_error();
+
+  cudaMemcpy(sums, d_sum, (array_size/TBSIZE)*sizeof(T), cudaMemcpyDeviceToHost);
+  check_error();
+
+  T sum = 0.0;
+  for (int i = 0; i < array_size/TBSIZE; i++)
+    sum += sums[i];
+
+  return sum;
+}
 
 void listDevices(void)
 {
diff --git a/CUDAStream.h b/CUDAStream.h
index 6904a86..09a72b0 100644
--- a/CUDAStream.h
+++ b/CUDAStream.h
@@ -15,16 +15,23 @@
 
 #define IMPLEMENTATION_STRING "CUDA"
 
+#define TBSIZE 1024
+
 template <class T>
 class CUDAStream : public Stream<T>
 {
   protected:
     // Size of arrays
     unsigned int array_size;
+
+    // Host array for partial sums for dot kernel
+    T *sums;
+
     // Device side pointers to arrays
     T *d_a;
     T *d_b;
     T *d_c;
+    T *d_sum;
 
   public:
 
@@ -36,6 +43,7 @@ class CUDAStream : public Stream<T>
     virtual void add() override;
     virtual void mul() override;
    virtual void triad() override;
+    virtual T dot() override;
 
     virtual void write_arrays(const std::vector<T>& a, const std::vector<T>& b, const std::vector<T>& c) override;
     virtual void read_arrays(std::vector<T>& a, std::vector<T>& b, std::vector<T>& c) override;
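
Note on the reduction pattern (not part of the patch): below is a minimal, self-contained CUDA sketch of the same two-stage dot product the patch adds — one partial sum per thread block computed with a shared-memory tree reduction, then a final reduction over the per-block partials on the host. The array length N, the double-only types, the test values, and main() are illustrative assumptions for this sketch only; they are not taken from CUDAStream.cu, and the sketch drops the templated reinterpret_cast shared-memory idiom because it is not needed for a single concrete type.

// dot_sketch.cu -- standalone illustration of a blocked dot-product reduction
#include <cstdio>
#include <cstdlib>
#include <cuda_runtime.h>

#define TBSIZE 1024

__global__ void dot_kernel(const double *a, const double *b, double *sum)
{
  // Dynamic shared memory: one element per thread, sized at launch
  extern __shared__ double tb_sum[];

  const int i = blockDim.x * blockIdx.x + threadIdx.x;
  const int local_i = threadIdx.x;

  // One product per thread, staged in shared memory
  tb_sum[local_i] = a[i] * b[i];

  // Tree reduction within the block: halve the active threads each step
  for (int offset = blockDim.x / 2; offset > 0; offset /= 2)
  {
    __syncthreads();
    if (local_i < offset)
      tb_sum[local_i] += tb_sum[local_i + offset];
  }

  // Thread 0 publishes this block's partial sum
  if (local_i == 0)
    sum[blockIdx.x] = tb_sum[0];
}

int main()
{
  const int N = 1 << 20;          // illustrative size; must be a multiple of TBSIZE
  const int blocks = N / TBSIZE;  // one partial sum per block

  double *d_a, *d_b, *d_sum;
  cudaMalloc(&d_a, N * sizeof(double));
  cudaMalloc(&d_b, N * sizeof(double));
  cudaMalloc(&d_sum, blocks * sizeof(double));

  // Fill a with 1s and b with 2s so the expected dot product is 2*N
  double *h = (double *)malloc(N * sizeof(double));
  for (int i = 0; i < N; i++) h[i] = 1.0;
  cudaMemcpy(d_a, h, N * sizeof(double), cudaMemcpyHostToDevice);
  for (int i = 0; i < N; i++) h[i] = 2.0;
  cudaMemcpy(d_b, h, N * sizeof(double), cudaMemcpyHostToDevice);

  // Launch one block per TBSIZE elements; shared memory holds one double per thread
  dot_kernel<<<blocks, TBSIZE, TBSIZE * sizeof(double)>>>(d_a, d_b, d_sum);

  // Copy the per-block partial sums back and finish the reduction on the host
  double *sums = (double *)malloc(blocks * sizeof(double));
  cudaMemcpy(sums, d_sum, blocks * sizeof(double), cudaMemcpyDeviceToHost);
  double total = 0.0;
  for (int i = 0; i < blocks; i++) total += sums[i];

  printf("dot = %f (expected %f)\n", total, 2.0 * N);

  free(h); free(sums);
  cudaFree(d_a); cudaFree(d_b); cudaFree(d_sum);
  return 0;
}

As in the patch, this assumes the array length divides evenly by TBSIZE, sizes the dynamic shared-memory allocation to one element per thread, and gives d_sum exactly one slot per block — which is why the host-side partial-sum buffers in the patch are sized array_size/TBSIZE rather than a fixed constant.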