// Copyright (c) 2015-16 Tom Deakin, Simon McIntosh-Smith, // University of Bristol HPC // // For full license terms please see the LICENSE file distributed with this // source code #include "OCLStream.h" // Cache list of devices bool cached = false; std::vector devices; void getDeviceList(void); std::string kernels{R"CLC( constant TYPE scalar = startScalar; kernel void init( global TYPE * restrict a, global TYPE * restrict b, global TYPE * restrict c, TYPE initA, TYPE initB, TYPE initC) { const size_t i = get_global_id(0); a[i] = initA; b[i] = initB; c[i] = initC; } kernel void copy( global const TYPE * restrict a, global TYPE * restrict c) { const size_t i = get_global_id(0); c[i] = a[i]; } kernel void mul( global TYPE * restrict b, global const TYPE * restrict c) { const size_t i = get_global_id(0); b[i] = scalar * c[i]; } kernel void add( global const TYPE * restrict a, global const TYPE * restrict b, global TYPE * restrict c) { const size_t i = get_global_id(0); c[i] = a[i] + b[i]; } kernel void triad( global TYPE * restrict a, global const TYPE * restrict b, global const TYPE * restrict c) { const size_t i = get_global_id(0); a[i] = b[i] + scalar * c[i]; } kernel void stream_dot( global const TYPE * restrict a, global const TYPE * restrict b, global TYPE * restrict sum, local TYPE * restrict wg_sum, int array_size) { size_t i = get_global_id(0); const size_t local_i = get_local_id(0); wg_sum[local_i] = 0.0; for (; i < array_size; i += get_global_size(0)) wg_sum[local_i] += a[i] * b[i]; for (int offset = get_local_size(0) / 2; offset > 0; offset /= 2) { barrier(CLK_LOCAL_MEM_FENCE); if (local_i < offset) { wg_sum[local_i] += wg_sum[local_i+offset]; } } if (local_i == 0) sum[get_group_id(0)] = wg_sum[local_i]; } )CLC"}; template OCLStream::OCLStream(const int ARRAY_SIZE, const int device_index) { if (!cached) getDeviceList(); // Setup default OpenCL GPU if (device_index >= devices.size()) throw std::runtime_error("Invalid device index"); device = devices[device_index]; // Determine sensible dot kernel NDRange configuration if (device.getInfo() & CL_DEVICE_TYPE_CPU) { dot_num_groups = device.getInfo(); dot_wgsize = device.getInfo() * 2; } else { dot_num_groups = device.getInfo() * 4; dot_wgsize = device.getInfo(); } // Print out device information std::cout << "Using OpenCL device " << getDeviceName(device_index) << std::endl; std::cout << "Driver: " << getDeviceDriver(device_index) << std::endl; std::cout << "Reduction kernel config: " << dot_num_groups << " groups of size " << dot_wgsize << std::endl; context = cl::Context(device); queue = cl::CommandQueue(context); // Create program cl::Program program(context, kernels); std::ostringstream args; args << "-DstartScalar=" << startScalar << " "; if (sizeof(T) == sizeof(double)) { args << "-DTYPE=double"; // Check device can do double if (!device.getInfo()) throw std::runtime_error("Device does not support double precision, please use --float"); try { program.build(args.str().c_str()); } catch (cl::Error& err) { if (err.err() == CL_BUILD_PROGRAM_FAILURE) { std::cout << program.getBuildInfo()[0].second << std::endl; throw err; } } } else if (sizeof(T) == sizeof(float)) { args << "-DTYPE=float"; program.build(args.str().c_str()); } // Create kernels init_kernel = new cl::KernelFunctor(program, "init"); copy_kernel = new cl::KernelFunctor(program, "copy"); mul_kernel = new cl::KernelFunctor(program, "mul"); add_kernel = new cl::KernelFunctor(program, "add"); triad_kernel = new cl::KernelFunctor(program, "triad"); dot_kernel = new cl::KernelFunctor(program, "stream_dot"); array_size = ARRAY_SIZE; // Check buffers fit on the device cl_ulong totalmem = device.getInfo(); cl_ulong maxbuffer = device.getInfo(); if (maxbuffer < sizeof(T)*ARRAY_SIZE) throw std::runtime_error("Device cannot allocate a buffer big enough"); if (totalmem < 3*sizeof(T)*ARRAY_SIZE) throw std::runtime_error("Device does not have enough memory for all 3 buffers"); // Create buffers d_a = cl::Buffer(context, CL_MEM_READ_WRITE, sizeof(T) * ARRAY_SIZE); d_b = cl::Buffer(context, CL_MEM_READ_WRITE, sizeof(T) * ARRAY_SIZE); d_c = cl::Buffer(context, CL_MEM_READ_WRITE, sizeof(T) * ARRAY_SIZE); d_sum = cl::Buffer(context, CL_MEM_WRITE_ONLY, sizeof(T) * dot_num_groups); sums = std::vector(dot_num_groups); } template OCLStream::~OCLStream() { delete init_kernel; delete copy_kernel; delete mul_kernel; delete add_kernel; delete triad_kernel; delete dot_kernel; devices.clear(); } template void OCLStream::copy() { (*copy_kernel)( cl::EnqueueArgs(queue, cl::NDRange(array_size)), d_a, d_c ); queue.finish(); } template void OCLStream::mul() { (*mul_kernel)( cl::EnqueueArgs(queue, cl::NDRange(array_size)), d_b, d_c ); queue.finish(); } template void OCLStream::add() { (*add_kernel)( cl::EnqueueArgs(queue, cl::NDRange(array_size)), d_a, d_b, d_c ); queue.finish(); } template void OCLStream::triad() { (*triad_kernel)( cl::EnqueueArgs(queue, cl::NDRange(array_size)), d_a, d_b, d_c ); queue.finish(); } template T OCLStream::dot() { (*dot_kernel)( cl::EnqueueArgs(queue, cl::NDRange(dot_num_groups*dot_wgsize), cl::NDRange(dot_wgsize)), d_a, d_b, d_sum, cl::Local(sizeof(T) * dot_wgsize), array_size ); cl::copy(queue, d_sum, sums.begin(), sums.end()); T sum = 0.0; for (T val : sums) sum += val; return sum; } template void OCLStream::init_arrays(T initA, T initB, T initC) { (*init_kernel)( cl::EnqueueArgs(queue, cl::NDRange(array_size)), d_a, d_b, d_c, initA, initB, initC ); queue.finish(); } template void OCLStream::read_arrays(std::vector& a, std::vector& b, std::vector& c) { cl::copy(queue, d_a, a.begin(), a.end()); cl::copy(queue, d_b, b.begin(), b.end()); cl::copy(queue, d_c, c.begin(), c.end()); } void getDeviceList(void) { // Get list of platforms std::vector platforms; cl::Platform::get(&platforms); // Enumerate devices for (unsigned i = 0; i < platforms.size(); i++) { std::vector plat_devices; platforms[i].getDevices(CL_DEVICE_TYPE_ALL, &plat_devices); devices.insert(devices.end(), plat_devices.begin(), plat_devices.end()); } cached = true; } void listDevices(void) { getDeviceList(); // Print device names if (devices.size() == 0) { std::cerr << "No devices found." << std::endl; } else { std::cout << std::endl; std::cout << "Devices:" << std::endl; for (int i = 0; i < devices.size(); i++) { std::cout << i << ": " << getDeviceName(i) << std::endl; } std::cout << std::endl; } } std::string getDeviceName(const int device) { if (!cached) getDeviceList(); std::string name; cl_device_info info = CL_DEVICE_NAME; if (device < devices.size()) { devices[device].getInfo(info, &name); } else { throw std::runtime_error("Error asking for name for non-existant device"); } return name; } std::string getDeviceDriver(const int device) { if (!cached) getDeviceList(); std::string driver; if (device < devices.size()) { devices[device].getInfo(CL_DRIVER_VERSION, &driver); } else { throw std::runtime_error("Error asking for driver for non-existant device"); } return driver; } template class OCLStream; template class OCLStream;