From 8a8f44b4ce4bf9ff1cba08787d25aa2e1a1182f1 Mon Sep 17 00:00:00 2001
From: James Price
Date: Mon, 24 Oct 2016 12:47:25 +0100
Subject: [PATCH] Fix CUDA host code for dot kernel

Wrong number of blocks was being copied and summed.
---
 CUDAStream.cu | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/CUDAStream.cu b/CUDAStream.cu
index 515540f..8a74fcb 100644
--- a/CUDAStream.cu
+++ b/CUDAStream.cu
@@ -46,7 +46,7 @@ CUDAStream::CUDAStream(const unsigned int ARRAY_SIZE, const int device_index)
   array_size = ARRAY_SIZE;
 
   // Allocate the host array for partial sums for dot kernels
-  sums = (T*)malloc(sizeof(T) * TBSIZE);
+  sums = (T*)malloc(sizeof(T) * (ARRAY_SIZE/TBSIZE));
 
   // Check buffers fit on the device
   cudaDeviceProp props;
@@ -61,7 +61,7 @@ CUDAStream::CUDAStream(const unsigned int ARRAY_SIZE, const int device_index)
   check_error();
   cudaMalloc(&d_c, ARRAY_SIZE*sizeof(T));
   check_error();
-  cudaMalloc(&d_sum, TBSIZE*sizeof(T));
+  cudaMalloc(&d_sum, (ARRAY_SIZE/TBSIZE)*sizeof(T));
   check_error();
 }
 
@@ -201,11 +201,11 @@ T CUDAStream::dot()
   dot_kernel<<>>(d_a, d_b, d_sum);
   check_error();
 
-  cudaMemcpy(sums, d_sum, TBSIZE*sizeof(T), cudaMemcpyDeviceToHost);
+  cudaMemcpy(sums, d_sum, (array_size/TBSIZE)*sizeof(T), cudaMemcpyDeviceToHost);
   check_error();
 
   T sum = 0.0;
-  for (int i = 0; i < TBSIZE; i++)
+  for (int i = 0; i < (array_size/TBSIZE); i++)
     sum += sums[i];
 
   return sum;