// Copyright (c) 2015-16 Tom Deakin, Simon McIntosh-Smith, // University of Bristol HPC // // For full license terms please see the LICENSE file distributed with this // source code #include "CUDAStream.h" #define TBSIZE 1024 void check_error(void) { cudaError_t err = cudaGetLastError(); if (err != cudaSuccess) { std::cerr << "Error: " << cudaGetErrorString(err) << std::endl; exit(err); } } template CUDAStream::CUDAStream(const unsigned int ARRAY_SIZE, const int device_index) { // The array size must be divisible by TBSIZE for kernel launches if (ARRAY_SIZE % TBSIZE != 0) { std::stringstream ss; ss << "Array size must be a multiple of " << TBSIZE; throw std::runtime_error(ss.str()); } // Set device int count; cudaGetDeviceCount(&count); check_error(); if (device_index >= count) throw std::runtime_error("Invalid device index"); cudaSetDevice(device_index); check_error(); // Print out device information std::cout << "Using OpenCL device " << getDeviceName(device_index) << std::endl; std::cout << "Driver: " << getDeviceDriver(device_index) << std::endl; array_size = ARRAY_SIZE; // Check buffers fit on the device cudaDeviceProp props; cudaGetDeviceProperties(&props, 0); if (props.totalGlobalMem < 3*ARRAY_SIZE*sizeof(T)) throw std::runtime_error("Device does not have enough memory for all 3 buffers"); // Create device buffers cudaMalloc(&d_a, ARRAY_SIZE*sizeof(T)); check_error(); cudaMalloc(&d_b, ARRAY_SIZE*sizeof(T)); check_error(); cudaMalloc(&d_c, ARRAY_SIZE*sizeof(T)); check_error(); } template CUDAStream::~CUDAStream() { cudaFree(d_a); check_error(); cudaFree(d_b); check_error(); cudaFree(d_c); check_error(); } template void CUDAStream::write_arrays(const std::vector& a, const std::vector& b, const std::vector& c) { // Copy host memory to device cudaMemcpy(d_a, a.data(), a.size()*sizeof(T), cudaMemcpyHostToDevice); check_error(); cudaMemcpy(d_b, b.data(), b.size()*sizeof(T), cudaMemcpyHostToDevice); check_error(); cudaMemcpy(d_c, c.data(), c.size()*sizeof(T), cudaMemcpyHostToDevice); check_error(); } template void CUDAStream::read_arrays(std::vector& a, std::vector& b, std::vector& c) { // Copy device memory to host cudaMemcpy(a.data(), d_a, a.size()*sizeof(T), cudaMemcpyDeviceToHost); check_error(); cudaMemcpy(b.data(), d_b, b.size()*sizeof(T), cudaMemcpyDeviceToHost); check_error(); cudaMemcpy(c.data(), d_c, c.size()*sizeof(T), cudaMemcpyDeviceToHost); check_error(); } template __global__ void copy_kernel(const T * a, T * c) { const int i = blockDim.x * blockIdx.x + threadIdx.x; c[i] = a[i]; } template void CUDAStream::copy() { copy_kernel<<>>(d_a, d_c); check_error(); cudaDeviceSynchronize(); check_error(); } template __global__ void mul_kernel(T * b, const T * c) { const T scalar = 0.3; const int i = blockDim.x * blockIdx.x + threadIdx.x; b[i] = scalar * c[i]; } template void CUDAStream::mul() { mul_kernel<<>>(d_b, d_c); check_error(); cudaDeviceSynchronize(); check_error(); } template __global__ void add_kernel(const T * a, const T * b, T * c) { const int i = blockDim.x * blockIdx.x + threadIdx.x; c[i] = a[i] + b[i]; } template void CUDAStream::add() { add_kernel<<>>(d_a, d_b, d_c); check_error(); cudaDeviceSynchronize(); check_error(); } template __global__ void triad_kernel(T * a, const T * b, const T * c) { const T scalar = 0.3; const int i = blockDim.x * blockIdx.x + threadIdx.x; a[i] = b[i] + scalar * c[i]; } template void CUDAStream::triad() { triad_kernel<<>>(d_a, d_b, d_c); check_error(); cudaDeviceSynchronize(); check_error(); } void listDevices(void) { // Get number of devices int count; cudaGetDeviceCount(&count); check_error(); // Print device names if (count == 0) { std::cerr << "No devices found." << std::endl; } else { std::cout << std::endl; std::cout << "Devices:" << std::endl; for (int i = 0; i < count; i++) { std::cout << i << ": " << getDeviceName(i) << std::endl; } std::cout << std::endl; } } std::string getDeviceName(const int device) { cudaDeviceProp props; cudaGetDeviceProperties(&props, device); check_error(); return std::string(props.name); } std::string getDeviceDriver(const int device) { cudaSetDevice(device); check_error(); int driver; cudaDriverGetVersion(&driver); check_error(); return std::to_string(driver); } template class CUDAStream; template class CUDAStream;