diff --git a/src/CUDAStream.cu b/src/CUDAStream.cu index 18e1a70..caf5e1a 100644 --- a/src/CUDAStream.cu +++ b/src/CUDAStream.cu @@ -14,6 +14,8 @@ void check_error(void) template CUDAStream::CUDAStream(const unsigned int ARRAY_SIZE) { + array_size = ARRAY_SIZE; + // Create device buffers cudaMalloc(&d_a, ARRAY_SIZE*sizeof(T)); check_error(); @@ -59,7 +61,7 @@ __global__ void copy_kernel(const T * a, T * c) template void CUDAStream::copy() { - copy_kernel<<<1024, 1024>>>(d_a, d_c); + copy_kernel<<>>(d_a, d_c); check_error(); cudaDeviceSynchronize(); check_error(); @@ -76,7 +78,7 @@ __global__ void mul_kernel(T * b, const T * c) template void CUDAStream::mul() { - mul_kernel<<<1024, 1024>>>(d_b, d_c); + mul_kernel<<>>(d_b, d_c); check_error(); cudaDeviceSynchronize(); check_error(); @@ -92,7 +94,7 @@ __global__ void add_kernel(const T * a, const T * b, T * c) template void CUDAStream::add() { - add_kernel<<<1024, 1024>>>(d_a, d_b, d_c); + add_kernel<<>>(d_a, d_b, d_c); check_error(); cudaDeviceSynchronize(); check_error(); @@ -109,7 +111,7 @@ __global__ void triad_kernel(T * a, const T * b, const T * c) template void CUDAStream::triad() { - triad_kernel<<<1024, 1024>>>(d_a, d_b, d_c); + triad_kernel<<>>(d_a, d_b, d_c); check_error(); cudaDeviceSynchronize(); check_error(); diff --git a/src/CUDAStream.h b/src/CUDAStream.h index 881811e..14f2cc2 100644 --- a/src/CUDAStream.h +++ b/src/CUDAStream.h @@ -7,6 +7,8 @@ template class CUDAStream : public Stream { private: + // Size of arrays + unsigned int array_size; // Device side pointers to arrays T *d_a; T *d_b;