BabelStream/CUDAStream.cu
Tom Deakin 31cb567e21 Switch data from 1.0, 2.0 and 3.0 to 0.1, 0.2, and 0.3 resp.
Using integers for maths gets unstable past 38 interations even
in double precision. Using the original values/10 is safe up to
the default 100 iterations.
2016-05-11 15:51:19 +01:00

215 lines
4.8 KiB
Plaintext

// Copyright (c) 2015-16 Tom Deakin, Simon McIntosh-Smith,
// University of Bristol HPC
//
// For full license terms please see the LICENSE file distributed with this
// source code
#include "CUDAStream.h"
#define TBSIZE 1024
void check_error(void)
{
cudaError_t err = cudaGetLastError();
if (err != cudaSuccess)
{
std::cerr << "Error: " << cudaGetErrorString(err) << std::endl;
exit(err);
}
}
template <class T>
CUDAStream<T>::CUDAStream(const unsigned int ARRAY_SIZE, const int device_index)
{
// The array size must be divisible by TBSIZE for kernel launches
if (ARRAY_SIZE % TBSIZE != 0)
{
std::stringstream ss;
ss << "Array size must be a multiple of " << TBSIZE;
throw std::runtime_error(ss.str());
}
// Set device
int count;
cudaGetDeviceCount(&count);
check_error();
if (device_index >= count)
throw std::runtime_error("Invalid device index");
cudaSetDevice(device_index);
check_error();
// Print out device information
std::cout << "Using OpenCL device " << getDeviceName(device_index) << std::endl;
std::cout << "Driver: " << getDeviceDriver(device_index) << std::endl;
array_size = ARRAY_SIZE;
// Check buffers fit on the device
cudaDeviceProp props;
cudaGetDeviceProperties(&props, 0);
if (props.totalGlobalMem < 3*ARRAY_SIZE*sizeof(T))
throw std::runtime_error("Device does not have enough memory for all 3 buffers");
// Create device buffers
cudaMalloc(&d_a, ARRAY_SIZE*sizeof(T));
check_error();
cudaMalloc(&d_b, ARRAY_SIZE*sizeof(T));
check_error();
cudaMalloc(&d_c, ARRAY_SIZE*sizeof(T));
check_error();
}
template <class T>
CUDAStream<T>::~CUDAStream()
{
cudaFree(d_a);
check_error();
cudaFree(d_b);
check_error();
cudaFree(d_c);
check_error();
}
template <class T>
void CUDAStream<T>::write_arrays(const std::vector<T>& a, const std::vector<T>& b, const std::vector<T>& c)
{
// Copy host memory to device
cudaMemcpy(d_a, a.data(), a.size()*sizeof(T), cudaMemcpyHostToDevice);
check_error();
cudaMemcpy(d_b, b.data(), b.size()*sizeof(T), cudaMemcpyHostToDevice);
check_error();
cudaMemcpy(d_c, c.data(), c.size()*sizeof(T), cudaMemcpyHostToDevice);
check_error();
}
template <class T>
void CUDAStream<T>::read_arrays(std::vector<T>& a, std::vector<T>& b, std::vector<T>& c)
{
// Copy device memory to host
cudaMemcpy(a.data(), d_a, a.size()*sizeof(T), cudaMemcpyDeviceToHost);
check_error();
cudaMemcpy(b.data(), d_b, b.size()*sizeof(T), cudaMemcpyDeviceToHost);
check_error();
cudaMemcpy(c.data(), d_c, c.size()*sizeof(T), cudaMemcpyDeviceToHost);
check_error();
}
template <typename T>
__global__ void copy_kernel(const T * a, T * c)
{
const int i = blockDim.x * blockIdx.x + threadIdx.x;
c[i] = a[i];
}
template <class T>
void CUDAStream<T>::copy()
{
copy_kernel<<<array_size/TBSIZE, TBSIZE>>>(d_a, d_c);
check_error();
cudaDeviceSynchronize();
check_error();
}
template <typename T>
__global__ void mul_kernel(T * b, const T * c)
{
const T scalar = 0.3;
const int i = blockDim.x * blockIdx.x + threadIdx.x;
b[i] = scalar * c[i];
}
template <class T>
void CUDAStream<T>::mul()
{
mul_kernel<<<array_size/TBSIZE, TBSIZE>>>(d_b, d_c);
check_error();
cudaDeviceSynchronize();
check_error();
}
template <typename T>
__global__ void add_kernel(const T * a, const T * b, T * c)
{
const int i = blockDim.x * blockIdx.x + threadIdx.x;
c[i] = a[i] + b[i];
}
template <class T>
void CUDAStream<T>::add()
{
add_kernel<<<array_size/TBSIZE, TBSIZE>>>(d_a, d_b, d_c);
check_error();
cudaDeviceSynchronize();
check_error();
}
template <typename T>
__global__ void triad_kernel(T * a, const T * b, const T * c)
{
const T scalar = 0.3;
const int i = blockDim.x * blockIdx.x + threadIdx.x;
a[i] = b[i] + scalar * c[i];
}
template <class T>
void CUDAStream<T>::triad()
{
triad_kernel<<<array_size/TBSIZE, TBSIZE>>>(d_a, d_b, d_c);
check_error();
cudaDeviceSynchronize();
check_error();
}
void listDevices(void)
{
// Get number of devices
int count;
cudaGetDeviceCount(&count);
check_error();
// Print device names
if (count == 0)
{
std::cerr << "No devices found." << std::endl;
}
else
{
std::cout << std::endl;
std::cout << "Devices:" << std::endl;
for (int i = 0; i < count; i++)
{
std::cout << i << ": " << getDeviceName(i) << std::endl;
}
std::cout << std::endl;
}
}
std::string getDeviceName(const int device)
{
cudaDeviceProp props;
cudaGetDeviceProperties(&props, device);
check_error();
return std::string(props.name);
}
std::string getDeviceDriver(const int device)
{
cudaSetDevice(device);
check_error();
int driver;
cudaDriverGetVersion(&driver);
check_error();
return std::to_string(driver);
}
template class CUDAStream<float>;
template class CUDAStream<double>;