diff --git a/CUDAStream.cu b/CUDAStream.cu index 3c10e8d..2a28f9c 100644 --- a/CUDAStream.cu +++ b/CUDAStream.cu @@ -8,6 +8,8 @@ #include "CUDAStream.h" +#define TBSIZE 1024 + void check_error(void) { cudaError_t err = cudaGetLastError(); @@ -22,6 +24,14 @@ template CUDAStream::CUDAStream(const unsigned int ARRAY_SIZE, const int device_index) { + // The array size must be divisible by TBSIZE for kernel launches + if (ARRAY_SIZE % TBSIZE != 0) + { + std::stringstream ss; + ss << "Array size must be a multiple of " << TBSIZE; + throw std::runtime_error(ss.str()); + } + // Set device int count; cudaGetDeviceCount(&count); @@ -99,7 +109,7 @@ __global__ void copy_kernel(const T * a, T * c) template void CUDAStream::copy() { - copy_kernel<<>>(d_a, d_c); + copy_kernel<<>>(d_a, d_c); check_error(); cudaDeviceSynchronize(); check_error(); @@ -116,7 +126,7 @@ __global__ void mul_kernel(T * b, const T * c) template void CUDAStream::mul() { - mul_kernel<<>>(d_b, d_c); + mul_kernel<<>>(d_b, d_c); check_error(); cudaDeviceSynchronize(); check_error(); @@ -132,7 +142,7 @@ __global__ void add_kernel(const T * a, const T * b, T * c) template void CUDAStream::add() { - add_kernel<<>>(d_a, d_b, d_c); + add_kernel<<>>(d_a, d_b, d_c); check_error(); cudaDeviceSynchronize(); check_error(); @@ -149,7 +159,7 @@ __global__ void triad_kernel(T * a, const T * b, const T * c) template void CUDAStream::triad() { - triad_kernel<<>>(d_a, d_b, d_c); + triad_kernel<<>>(d_a, d_b, d_c); check_error(); cudaDeviceSynchronize(); check_error(); @@ -202,4 +212,3 @@ std::string getDeviceDriver(const int device) template class CUDAStream; template class CUDAStream; - diff --git a/CUDAStream.h b/CUDAStream.h index 9c436d6..6904a86 100644 --- a/CUDAStream.h +++ b/CUDAStream.h @@ -9,6 +9,7 @@ #include #include +#include #include "Stream.h" @@ -40,4 +41,3 @@ class CUDAStream : public Stream virtual void read_arrays(std::vector& a, std::vector& b, std::vector& c) override; }; -