Set the thread block size in CUDA with a #define, and check that the array size is a multiple of it

This commit is contained in:
Tom Deakin 2016-05-11 12:21:29 +01:00
parent 207fd8f784
commit 2462023ed9
2 changed files with 15 additions and 6 deletions

View File

@ -8,6 +8,8 @@
#include "CUDAStream.h" #include "CUDAStream.h"
#define TBSIZE 1024
void check_error(void) void check_error(void)
{ {
cudaError_t err = cudaGetLastError(); cudaError_t err = cudaGetLastError();
@ -22,6 +24,14 @@ template <class T>
CUDAStream<T>::CUDAStream(const unsigned int ARRAY_SIZE, const int device_index) CUDAStream<T>::CUDAStream(const unsigned int ARRAY_SIZE, const int device_index)
{ {
// The array size must be divisible by TBSIZE for kernel launches
if (ARRAY_SIZE % TBSIZE != 0)
{
std::stringstream ss;
ss << "Array size must be a multiple of " << TBSIZE;
throw std::runtime_error(ss.str());
}
// Set device // Set device
int count; int count;
cudaGetDeviceCount(&count); cudaGetDeviceCount(&count);
@ -99,7 +109,7 @@ __global__ void copy_kernel(const T * a, T * c)
template <class T> template <class T>
void CUDAStream<T>::copy() void CUDAStream<T>::copy()
{ {
copy_kernel<<<array_size/1024, 1024>>>(d_a, d_c); copy_kernel<<<array_size/TBSIZE, TBSIZE>>>(d_a, d_c);
check_error(); check_error();
cudaDeviceSynchronize(); cudaDeviceSynchronize();
check_error(); check_error();
@ -116,7 +126,7 @@ __global__ void mul_kernel(T * b, const T * c)
template <class T> template <class T>
void CUDAStream<T>::mul() void CUDAStream<T>::mul()
{ {
mul_kernel<<<array_size/1024, 1024>>>(d_b, d_c); mul_kernel<<<array_size/TBSIZE, TBSIZE>>>(d_b, d_c);
check_error(); check_error();
cudaDeviceSynchronize(); cudaDeviceSynchronize();
check_error(); check_error();
@ -132,7 +142,7 @@ __global__ void add_kernel(const T * a, const T * b, T * c)
template <class T> template <class T>
void CUDAStream<T>::add() void CUDAStream<T>::add()
{ {
add_kernel<<<array_size/1024, 1024>>>(d_a, d_b, d_c); add_kernel<<<array_size/TBSIZE, TBSIZE>>>(d_a, d_b, d_c);
check_error(); check_error();
cudaDeviceSynchronize(); cudaDeviceSynchronize();
check_error(); check_error();
@ -149,7 +159,7 @@ __global__ void triad_kernel(T * a, const T * b, const T * c)
template <class T> template <class T>
void CUDAStream<T>::triad() void CUDAStream<T>::triad()
{ {
triad_kernel<<<array_size/1024, 1024>>>(d_a, d_b, d_c); triad_kernel<<<array_size/TBSIZE, TBSIZE>>>(d_a, d_b, d_c);
check_error(); check_error();
cudaDeviceSynchronize(); cudaDeviceSynchronize();
check_error(); check_error();
@ -202,4 +212,3 @@ std::string getDeviceDriver(const int device)
template class CUDAStream<float>; template class CUDAStream<float>;
template class CUDAStream<double>; template class CUDAStream<double>;

View File

@ -9,6 +9,7 @@
#include <iostream> #include <iostream>
#include <stdexcept> #include <stdexcept>
#include <sstream>
#include "Stream.h" #include "Stream.h"
@ -40,4 +41,3 @@ class CUDAStream : public Stream<T>
virtual void read_arrays(std::vector<T>& a, std::vector<T>& b, std::vector<T>& c) override; virtual void read_arrays(std::vector<T>& a, std::vector<T>& b, std::vector<T>& c) override;
}; };