diff --git a/.gitignore b/.gitignore index 4d2865c..6ef20a5 100644 --- a/.gitignore +++ b/.gitignore @@ -1,6 +1,4 @@ -common.h - gpu-stream-cuda gpu-stream-ocl gpu-stream-acc diff --git a/ACCStream.cpp b/ACCStream.cpp index bd49663..0e591a8 100644 --- a/ACCStream.cpp +++ b/ACCStream.cpp @@ -36,13 +36,19 @@ ACCStream::~ACCStream() } template -void ACCStream::write_arrays(const std::vector& h_a, const std::vector& h_b, const std::vector& h_c) +void ACCStream::init_arrays(T initA, T initB, T initC) { - T *a = this->a; - T *b = this->b; - T *c = this->c; - #pragma acc update device(a[0:array_size], b[0:array_size], c[0:array_size]) - {} + unsigned int array_size = this->array_size; + T * restrict a = this->a; + T * restrict b = this->b; + T * restrict c = this->c; + #pragma acc kernels present(a[0:array_size], b[0:array_size], c[0:array_size]) wait + for (int i = 0; i < array_size; i++) + { + a[i] = initA; + b[i] = initB; + c[i] = initC; + } } template @@ -112,6 +118,24 @@ void ACCStream::triad() a[i] = b[i] + scalar * c[i]; } } + +template +T ACCStream::dot() +{ + T sum = 0.0; + + unsigned int array_size = this->array_size; + T * restrict a = this->a; + T * restrict b = this->b; + #pragma acc kernels present(a[0:array_size], b[0:array_size]) wait + for (int i = 0; i < array_size; i++) + { + sum += a[i] * b[i]; + } + + return sum; +} + void listDevices(void) { // Get number of devices diff --git a/ACCStream.h b/ACCStream.h index 48fea55..8f13ed7 100644 --- a/ACCStream.h +++ b/ACCStream.h @@ -35,8 +35,9 @@ class ACCStream : public Stream virtual void add() override; virtual void mul() override; virtual void triad() override; + virtual T dot() override; - virtual void write_arrays(const std::vector& a, const std::vector& b, const std::vector& c) override; + virtual void init_arrays(T initA, T initB, T initC) override; virtual void read_arrays(std::vector& a, std::vector& b, std::vector& c) override; diff --git a/CMakeLists.txt b/CMakeLists.txt index efee733..6f3439e 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -20,7 +20,7 @@ include(CheckIncludeFileCXX) include(CheckCXXCompilerFlag) set(gpu-stream_VERSION_MAJOR 2) -set(gpu-stream_VERSION_MINOR 1) +set(gpu-stream_VERSION_MINOR 2) configure_file(common.h.in common.h) include_directories(${CMAKE_BINARY_DIR}) diff --git a/CUDAStream.cu b/CUDAStream.cu index ff2ec41..7b1e0df 100644 --- a/CUDAStream.cu +++ b/CUDAStream.cu @@ -8,8 +8,6 @@ #include "CUDAStream.h" -#define TBSIZE 1024 - void check_error(void) { cudaError_t err = cudaGetLastError(); @@ -47,6 +45,9 @@ CUDAStream::CUDAStream(const unsigned int ARRAY_SIZE, const int device_index) array_size = ARRAY_SIZE; + // Allocate the host array for partial sums for dot kernels + sums = (T*)malloc(sizeof(T) * DOT_NUM_BLOCKS); + // Check buffers fit on the device cudaDeviceProp props; cudaGetDeviceProperties(&props, 0); @@ -60,29 +61,42 @@ CUDAStream::CUDAStream(const unsigned int ARRAY_SIZE, const int device_index) check_error(); cudaMalloc(&d_c, ARRAY_SIZE*sizeof(T)); check_error(); + cudaMalloc(&d_sum, DOT_NUM_BLOCKS*sizeof(T)); + check_error(); } template CUDAStream::~CUDAStream() { + free(sums); + cudaFree(d_a); check_error(); cudaFree(d_b); check_error(); cudaFree(d_c); check_error(); + cudaFree(d_sum); + check_error(); +} + + +template +__global__ void init_kernel(T * a, T * b, T * c, T initA, T initB, T initC) +{ + const int i = blockDim.x * blockIdx.x + threadIdx.x; + a[i] = initA; + b[i] = initB; + c[i] = initC; } template -void CUDAStream::write_arrays(const std::vector& a, const 
std::vector& b, const std::vector& c) +void CUDAStream::init_arrays(T initA, T initB, T initC) { - // Copy host memory to device - cudaMemcpy(d_a, a.data(), a.size()*sizeof(T), cudaMemcpyHostToDevice); + init_kernel<<>>(d_a, d_b, d_c, initA, initB, initC); check_error(); - cudaMemcpy(d_b, b.data(), b.size()*sizeof(T), cudaMemcpyHostToDevice); - check_error(); - cudaMemcpy(d_c, c.data(), c.size()*sizeof(T), cudaMemcpyHostToDevice); + cudaDeviceSynchronize(); check_error(); } @@ -165,6 +179,48 @@ void CUDAStream::triad() check_error(); } +template +__global__ void dot_kernel(const T * a, const T * b, T * sum, unsigned int array_size) +{ + + extern __shared__ __align__(sizeof(T)) unsigned char smem[]; + T *tb_sum = reinterpret_cast(smem); + + int i = blockDim.x * blockIdx.x + threadIdx.x; + const size_t local_i = threadIdx.x; + + tb_sum[local_i] = 0.0; + for (; i < array_size; i += blockDim.x*gridDim.x) + tb_sum[local_i] += a[i] * b[i]; + + for (int offset = blockDim.x / 2; offset > 0; offset /= 2) + { + __syncthreads(); + if (local_i < offset) + { + tb_sum[local_i] += tb_sum[local_i+offset]; + } + } + + if (local_i == 0) + sum[blockIdx.x] = tb_sum[local_i]; +} + +template +T CUDAStream::dot() +{ + dot_kernel<<>>(d_a, d_b, d_sum, array_size); + check_error(); + + cudaMemcpy(sums, d_sum, DOT_NUM_BLOCKS*sizeof(T), cudaMemcpyDeviceToHost); + check_error(); + + T sum = 0.0; + for (int i = 0; i < DOT_NUM_BLOCKS; i++) + sum += sums[i]; + + return sum; +} void listDevices(void) { diff --git a/CUDAStream.h b/CUDAStream.h index 6904a86..0a0236b 100644 --- a/CUDAStream.h +++ b/CUDAStream.h @@ -15,16 +15,24 @@ #define IMPLEMENTATION_STRING "CUDA" +#define TBSIZE 1024 +#define DOT_NUM_BLOCKS 256 + template class CUDAStream : public Stream { protected: // Size of arrays unsigned int array_size; + + // Host array for partial sums for dot kernel + T *sums; + // Device side pointers to arrays T *d_a; T *d_b; T *d_c; + T *d_sum; public: @@ -36,8 +44,9 @@ class CUDAStream : public Stream virtual void add() override; virtual void mul() override; virtual void triad() override; + virtual T dot() override; - virtual void write_arrays(const std::vector& a, const std::vector& b, const std::vector& c) override; + virtual void init_arrays(T initA, T initB, T initC) override; virtual void read_arrays(std::vector& a, std::vector& b, std::vector& c) override; }; diff --git a/HIPStream.cu b/HIPStream.cu index 34ecfb6..8c02348 100644 --- a/HIPStream.cu +++ b/HIPStream.cu @@ -74,15 +74,21 @@ HIPStream::~HIPStream() check_error(); } -template -void HIPStream::write_arrays(const std::vector& a, const std::vector& b, const std::vector& c) +template +__global__ void init_kernel(hipLaunchParm lp, T * a, T * b, T * c, T initA, T initB, T initC) { - // Copy host memory to device - hipMemcpy(d_a, a.data(), a.size()*sizeof(T), hipMemcpyHostToDevice); + const int i = hipBlockDim_x * hipBlockIdx_x + hipThreadIdx_x; + a[i] = initA; + b[i] = initB; + c[i] = initC; +} + +template +void HIPStream::init_arrays(T initA, T initB, T initC) +{ + hipLaunchKernel(HIP_KERNEL_NAME(init_kernel), dim3(array_size/TBSIZE), dim3(TBSIZE), 0, 0, d_a, d_b, d_c, initA, initB, initC); check_error(); - hipMemcpy(d_b, b.data(), b.size()*sizeof(T), hipMemcpyHostToDevice); - check_error(); - hipMemcpy(d_c, c.data(), c.size()*sizeof(T), hipMemcpyHostToDevice); + hipDeviceSynchronize(); check_error(); } diff --git a/HIPStream.h b/HIPStream.h index 9015e35..392080a 100644 --- a/HIPStream.h +++ b/HIPStream.h @@ -37,7 +37,7 @@ class HIPStream : public Stream virtual 
void mul() override; virtual void triad() override; - virtual void write_arrays(const std::vector& a, const std::vector& b, const std::vector& c) override; + virtual void init_arrays(T initA, T initB, T initC) override; virtual void read_arrays(std::vector& a, std::vector& b, std::vector& c) override; }; diff --git a/KOKKOSStream.cpp b/KOKKOSStream.cpp index 94ac7ee..9391a13 100644 --- a/KOKKOSStream.cpp +++ b/KOKKOSStream.cpp @@ -34,18 +34,18 @@ KOKKOSStream::~KOKKOSStream() } template -void KOKKOSStream::write_arrays( - const std::vector& a, const std::vector& b, const std::vector& c) +void KOKKOSStream::init_arrays(T initA, T initB, T initC) { - for(int ii = 0; ii < array_size; ++ii) + View a(*d_a); + View b(*d_b); + View c(*d_c); + parallel_for(array_size, KOKKOS_LAMBDA (const int index) { - (*hm_a)(ii) = a[ii]; - (*hm_b)(ii) = b[ii]; - (*hm_c)(ii) = c[ii]; - } - deep_copy(*d_a, *hm_a); - deep_copy(*d_b, *hm_b); - deep_copy(*d_c, *hm_c); + a[index] = initA; + b[index] = initB; + c[index] = initC; + }); + Kokkos::fence(); } template @@ -121,6 +121,23 @@ void KOKKOSStream::triad() Kokkos::fence(); } +template +T KOKKOSStream::dot() +{ + View a(*d_a); + View b(*d_b); + + T sum = 0.0; + + parallel_reduce(array_size, KOKKOS_LAMBDA (const int index, double &tmp) + { + tmp += a[index] * b[index]; + }, sum); + + return sum; + +} + void listDevices(void) { std::cout << "This is not the device you are looking for."; diff --git a/KOKKOSStream.hpp b/KOKKOSStream.hpp index d2b9665..b230f18 100644 --- a/KOKKOSStream.hpp +++ b/KOKKOSStream.hpp @@ -47,9 +47,9 @@ class KOKKOSStream : public Stream virtual void add() override; virtual void mul() override; virtual void triad() override; + virtual T dot() override; - virtual void write_arrays( - const std::vector& a, const std::vector& b, const std::vector& c) override; + virtual void init_arrays(T initA, T initB, T initC) override; virtual void read_arrays( std::vector& a, std::vector& b, std::vector& c) override; }; diff --git a/OCLStream.cpp b/OCLStream.cpp index 2a1e5ee..7bc5a78 100644 --- a/OCLStream.cpp +++ b/OCLStream.cpp @@ -16,6 +16,18 @@ std::string kernels{R"CLC( constant TYPE scalar = startScalar; + kernel void init( + global TYPE * restrict a, + global TYPE * restrict b, + global TYPE * restrict c, + TYPE initA, TYPE initB, TYPE initC) + { + const size_t i = get_global_id(0); + a[i] = initA; + b[i] = initB; + c[i] = initC; + } + kernel void copy( global const TYPE * restrict a, global TYPE * restrict c) @@ -50,6 +62,32 @@ std::string kernels{R"CLC( a[i] = b[i] + scalar * c[i]; } + kernel void stream_dot( + global const TYPE * restrict a, + global const TYPE * restrict b, + global TYPE * restrict sum, + local TYPE * restrict wg_sum, + int array_size) + { + size_t i = get_global_id(0); + const size_t local_i = get_local_id(0); + wg_sum[local_i] = 0.0; + for (; i < array_size; i += get_global_size(0)) + wg_sum[local_i] += a[i] * b[i]; + + for (int offset = get_local_size(0) / 2; offset > 0; offset /= 2) + { + barrier(CLK_LOCAL_MEM_FENCE); + if (local_i < offset) + { + wg_sum[local_i] += wg_sum[local_i+offset]; + } + } + + if (local_i == 0) + sum[get_group_id(0)] = wg_sum[local_i]; + } + )CLC"}; @@ -64,9 +102,22 @@ OCLStream::OCLStream(const unsigned int ARRAY_SIZE, const int device_index) throw std::runtime_error("Invalid device index"); device = devices[device_index]; + // Determine sensible dot kernel NDRange configuration + if (device.getInfo() & CL_DEVICE_TYPE_CPU) + { + dot_num_groups = device.getInfo(); + dot_wgsize = device.getInfo() * 2;
+ } + else + { + dot_num_groups = device.getInfo() * 4; + dot_wgsize = device.getInfo(); + } + // Print out device information std::cout << "Using OpenCL device " << getDeviceName(device_index) << std::endl; std::cout << "Driver: " << getDeviceDriver(device_index) << std::endl; + std::cout << "Reduction kernel config: " << dot_num_groups << " groups of size " << dot_wgsize << std::endl; context = cl::Context(device); queue = cl::CommandQueue(context); @@ -101,10 +152,12 @@ OCLStream::OCLStream(const unsigned int ARRAY_SIZE, const int device_index) } // Create kernels + init_kernel = new cl::KernelFunctor(program, "init"); copy_kernel = new cl::KernelFunctor(program, "copy"); mul_kernel = new cl::KernelFunctor(program, "mul"); add_kernel = new cl::KernelFunctor(program, "add"); triad_kernel = new cl::KernelFunctor(program, "triad"); + dot_kernel = new cl::KernelFunctor(program, "stream_dot"); array_size = ARRAY_SIZE; @@ -120,12 +173,15 @@ OCLStream::OCLStream(const unsigned int ARRAY_SIZE, const int device_index) d_a = cl::Buffer(context, CL_MEM_READ_WRITE, sizeof(T) * ARRAY_SIZE); d_b = cl::Buffer(context, CL_MEM_READ_WRITE, sizeof(T) * ARRAY_SIZE); d_c = cl::Buffer(context, CL_MEM_READ_WRITE, sizeof(T) * ARRAY_SIZE); + d_sum = cl::Buffer(context, CL_MEM_WRITE_ONLY, sizeof(T) * dot_num_groups); + sums = std::vector(dot_num_groups); } template OCLStream::~OCLStream() { + delete init_kernel; delete copy_kernel; delete mul_kernel; delete add_kernel; @@ -173,11 +229,29 @@ void OCLStream::triad() } template -void OCLStream::write_arrays(const std::vector& a, const std::vector& b, const std::vector& c) +T OCLStream::dot() { - cl::copy(queue, a.begin(), a.end(), d_a); - cl::copy(queue, b.begin(), b.end(), d_b); - cl::copy(queue, c.begin(), c.end(), d_c); + (*dot_kernel)( + cl::EnqueueArgs(queue, cl::NDRange(dot_num_groups*dot_wgsize), cl::NDRange(dot_wgsize)), + d_a, d_b, d_sum, cl::Local(sizeof(T) * dot_wgsize), array_size + ); + cl::copy(queue, d_sum, sums.begin(), sums.end()); + + T sum = 0.0; + for (T val : sums) + sum += val; + + return sum; +} + +template +void OCLStream::init_arrays(T initA, T initB, T initC) +{ + (*init_kernel)( + cl::EnqueueArgs(queue, cl::NDRange(array_size)), + d_a, d_b, d_c, initA, initB, initC + ); + queue.finish(); } template diff --git a/OCLStream.h b/OCLStream.h index 54abaa3..fbeff9a 100644 --- a/OCLStream.h +++ b/OCLStream.h @@ -28,20 +28,30 @@ class OCLStream : public Stream // Size of arrays unsigned int array_size; + // Host array for partial sums for dot kernel + std::vector sums; + // Device side pointers to arrays cl::Buffer d_a; cl::Buffer d_b; cl::Buffer d_c; + cl::Buffer d_sum; // OpenCL objects cl::Device device; cl::Context context; cl::CommandQueue queue; + cl::KernelFunctor *init_kernel; cl::KernelFunctor *copy_kernel; cl::KernelFunctor * mul_kernel; cl::KernelFunctor *add_kernel; cl::KernelFunctor *triad_kernel; + cl::KernelFunctor *dot_kernel; + + // NDRange configuration for the dot kernel + size_t dot_num_groups; + size_t dot_wgsize; public: @@ -52,8 +62,9 @@ class OCLStream : public Stream virtual void add() override; virtual void mul() override; virtual void triad() override; + virtual T dot() override; - virtual void write_arrays(const std::vector& a, const std::vector& b, const std::vector& c) override; + virtual void init_arrays(T initA, T initB, T initC) override; virtual void read_arrays(std::vector& a, std::vector& b, std::vector& c) override; }; diff --git a/OMP3Stream.cpp b/OMP3Stream.cpp deleted file mode 100644 index f578c7c..0000000 
--- a/OMP3Stream.cpp +++ /dev/null @@ -1,111 +0,0 @@ - -// Copyright (c) 2015-16 Tom Deakin, Simon McIntosh-Smith, -// University of Bristol HPC -// -// For full license terms please see the LICENSE file distributed with this -// source code - -#include "OMP3Stream.h" - -template -OMP3Stream::OMP3Stream(const unsigned int ARRAY_SIZE, T *a, T *b, T *c) -{ - array_size = ARRAY_SIZE; - this->a = (T*)malloc(sizeof(T)*array_size); - this->b = (T*)malloc(sizeof(T)*array_size); - this->c = (T*)malloc(sizeof(T)*array_size); -} - -template -OMP3Stream::~OMP3Stream() -{ - free(a); - free(b); - free(c); -} - - -template -void OMP3Stream::write_arrays(const std::vector& h_a, const std::vector& h_b, const std::vector& h_c) -{ - #pragma omp parallel for - for (int i = 0; i < array_size; i++) - { - a[i] = h_a[i]; - b[i] = h_b[i]; - c[i] = h_c[i]; - } -} - -template -void OMP3Stream::read_arrays(std::vector& h_a, std::vector& h_b, std::vector& h_c) -{ - #pragma omp parallel for - for (int i = 0; i < array_size; i++) - { - h_a[i] = a[i]; - h_b[i] = b[i]; - h_c[i] = c[i]; - } -} - -template -void OMP3Stream::copy() -{ - #pragma omp parallel for - for (int i = 0; i < array_size; i++) - { - c[i] = a[i]; - } -} - -template -void OMP3Stream::mul() -{ - const T scalar = startScalar; - #pragma omp parallel for - for (int i = 0; i < array_size; i++) - { - b[i] = scalar * c[i]; - } -} - -template -void OMP3Stream::add() -{ - #pragma omp parallel for - for (int i = 0; i < array_size; i++) - { - c[i] = a[i] + b[i]; - } -} - -template -void OMP3Stream::triad() -{ - const T scalar = startScalar; - #pragma omp parallel for - for (int i = 0; i < array_size; i++) - { - a[i] = b[i] + scalar * c[i]; - } -} - -void listDevices(void) -{ - std::cout << "0: CPU" << std::endl; -} - -std::string getDeviceName(const int) -{ - return std::string("Device name unavailable"); -} - -std::string getDeviceDriver(const int) -{ - return std::string("Device driver unavailable"); -} - - -template class OMP3Stream; -template class OMP3Stream; diff --git a/OMP3Stream.h b/OMP3Stream.h deleted file mode 100644 index 0f14300..0000000 --- a/OMP3Stream.h +++ /dev/null @@ -1,40 +0,0 @@ - -// Copyright (c) 2015-16 Tom Deakin, Simon McIntosh-Smith, -// University of Bristol HPC -// -// For full license terms please see the LICENSE file distributed with this -// source code - -#pragma once - -#include -#include - -#include "Stream.h" - -#define IMPLEMENTATION_STRING "Reference OpenMP" - -template -class OMP3Stream : public Stream -{ - protected: - // Size of arrays - unsigned int array_size; - // Device side pointers - T *a; - T *b; - T *c; - - public: - OMP3Stream(const unsigned int, T*, T*, T*); - ~OMP3Stream(); - - virtual void copy() override; - virtual void add() override; - virtual void mul() override; - virtual void triad() override; - - virtual void write_arrays(const std::vector& a, const std::vector& b, const std::vector& c) override; - virtual void read_arrays(std::vector& a, std::vector& b, std::vector& c) override; - -}; diff --git a/OMP45Stream.cpp b/OMPStream.cpp similarity index 55% rename from OMP45Stream.cpp rename to OMPStream.cpp index 8f684e2..189cacb 100644 --- a/OMP45Stream.cpp +++ b/OMPStream.cpp @@ -5,26 +5,33 @@ // For full license terms please see the LICENSE file distributed with this // source code -#include "OMP45Stream.h" +#include "OMPStream.h" template -OMP45Stream::OMP45Stream(const unsigned int ARRAY_SIZE, T *a, T *b, T *c, int device) +OMPStream::OMPStream(const unsigned int ARRAY_SIZE, T *a, T *b, T *c, int device) 
{ - omp_set_default_device(device); - array_size = ARRAY_SIZE; +#ifdef OMP_TARGET_GPU + omp_set_default_device(device); // Set up data region on device this->a = a; this->b = b; this->c = c; - #pragma omp target enter data map(to: a[0:array_size], b[0:array_size], c[0:array_size]) + #pragma omp target enter data map(alloc: a[0:array_size], b[0:array_size], c[0:array_size]) {} +#else + // Allocate on the host + this->a = (T*)malloc(sizeof(T)*array_size); + this->b = (T*)malloc(sizeof(T)*array_size); + this->c = (T*)malloc(sizeof(T)*array_size); +#endif } template -OMP45Stream::~OMP45Stream() +OMPStream::~OMPStream() { +#ifdef OMP_TARGET_GPU // End data region on device unsigned int array_size = this->array_size; T *a = this->a; @@ -32,35 +39,64 @@ OMP45Stream::~OMP45Stream() T *c = this->c; #pragma omp target exit data map(release: a[0:array_size], b[0:array_size], c[0:array_size]) {} +#else + free(a); + free(b); + free(c); +#endif } template -void OMP45Stream::write_arrays(const std::vector& h_a, const std::vector& h_b, const std::vector& h_c) +void OMPStream::init_arrays(T initA, T initB, T initC) { + unsigned int array_size = this->array_size; +#ifdef OMP_TARGET_GPU T *a = this->a; T *b = this->b; T *c = this->c; - #pragma omp target update to(a[0:array_size], b[0:array_size], c[0:array_size]) - {} + #pragma omp target teams distribute parallel for simd map(to: a[0:array_size], b[0:array_size], c[0:array_size]) +#else + #pragma omp parallel for +#endif + for (int i = 0; i < array_size; i++) + { + a[i] = initA; + b[i] = initB; + c[i] = initC; + } } template -void OMP45Stream::read_arrays(std::vector& h_a, std::vector& h_b, std::vector& h_c) +void OMPStream::read_arrays(std::vector& h_a, std::vector& h_b, std::vector& h_c) { +#ifdef OMP_TARGET_GPU T *a = this->a; T *b = this->b; T *c = this->c; #pragma omp target update from(a[0:array_size], b[0:array_size], c[0:array_size]) {} +#else + #pragma omp parallel for + for (int i = 0; i < array_size; i++) + { + h_a[i] = a[i]; + h_b[i] = b[i]; + h_c[i] = c[i]; + } +#endif } template -void OMP45Stream::copy() +void OMPStream::copy() { +#ifdef OMP_TARGET_GPU unsigned int array_size = this->array_size; T *a = this->a; T *c = this->c; #pragma omp target teams distribute parallel for simd map(to: a[0:array_size], c[0:array_size]) +#else + #pragma omp parallel for +#endif for (int i = 0; i < array_size; i++) { c[i] = a[i]; @@ -68,14 +104,18 @@ void OMP45Stream::copy() } template -void OMP45Stream::mul() +void OMPStream::mul() { const T scalar = startScalar; +#ifdef OMP_TARGET_GPU unsigned int array_size = this->array_size; T *b = this->b; T *c = this->c; #pragma omp target teams distribute parallel for simd map(to: b[0:array_size], c[0:array_size]) +#else + #pragma omp parallel for +#endif for (int i = 0; i < array_size; i++) { b[i] = scalar * c[i]; @@ -83,13 +123,17 @@ void OMP45Stream::mul() } template -void OMP45Stream::add() +void OMPStream::add() { +#ifdef OMP_TARGET_GPU unsigned int array_size = this->array_size; T *a = this->a; T *b = this->b; T *c = this->c; #pragma omp target teams distribute parallel for simd map(to: a[0:array_size], b[0:array_size], c[0:array_size]) +#else + #pragma omp parallel for +#endif for (int i = 0; i < array_size; i++) { c[i] = a[i] + b[i]; @@ -97,22 +141,51 @@ void OMP45Stream::add() } template -void OMP45Stream::triad() +void OMPStream::triad() { const T scalar = startScalar; +#ifdef OMP_TARGET_GPU unsigned int array_size = this->array_size; T *a = this->a; T *b = this->b; T *c = this->c; #pragma omp target teams 
distribute parallel for simd map(to: a[0:array_size], b[0:array_size], c[0:array_size]) +#else + #pragma omp parallel for +#endif for (int i = 0; i < array_size; i++) { a[i] = b[i] + scalar * c[i]; } } + +template +T OMPStream::dot() +{ + T sum = 0.0; + +#ifdef OMP_TARGET_GPU + unsigned int array_size = this->array_size; + T *a = this->a; + T *b = this->b; + #pragma omp target teams distribute parallel for simd reduction(+:sum) map(tofrom: sum) +#else + #pragma omp parallel for reduction(+:sum) +#endif + for (int i = 0; i < array_size; i++) + { + sum += a[i] * b[i]; + } + + return sum; +} + + + void listDevices(void) { +#ifdef OMP_TARGET_GPU // Get number of devices int count = omp_get_num_devices(); @@ -125,6 +198,9 @@ void listDevices(void) { std::cout << "There are " << count << " devices." << std::endl; } +#else + std::cout << "0: CPU" << std::endl; +#endif } std::string getDeviceName(const int) @@ -136,5 +212,5 @@ std::string getDeviceDriver(const int) { return std::string("Device driver unavailable"); } -template class OMP45Stream; -template class OMP45Stream; +template class OMPStream; +template class OMPStream; diff --git a/OMP45Stream.h b/OMPStream.h similarity index 71% rename from OMP45Stream.h rename to OMPStream.h index bd812a1..c475274 100644 --- a/OMP45Stream.h +++ b/OMPStream.h @@ -14,10 +14,10 @@ #include -#define IMPLEMENTATION_STRING "OpenMP 4.5" +#define IMPLEMENTATION_STRING "OpenMP" template -class OMP45Stream : public Stream +class OMPStream : public Stream { protected: // Size of arrays @@ -29,15 +29,16 @@ class OMP45Stream : public Stream T *c; public: - OMP45Stream(const unsigned int, T*, T*, T*, int); - ~OMP45Stream(); + OMPStream(const unsigned int, T*, T*, T*, int); + ~OMPStream(); virtual void copy() override; virtual void add() override; virtual void mul() override; virtual void triad() override; + virtual T dot() override; - virtual void write_arrays(const std::vector& a, const std::vector& b, const std::vector& c) override; + virtual void init_arrays(T initA, T initB, T initC) override; virtual void read_arrays(std::vector& a, std::vector& b, std::vector& c) override; diff --git a/RAJAStream.cpp b/RAJAStream.cpp index 33687a1..240f160 100644 --- a/RAJAStream.cpp +++ b/RAJAStream.cpp @@ -21,12 +21,6 @@ RAJAStream::RAJAStream(const unsigned int ARRAY_SIZE, const int device_index) d_a = new T[ARRAY_SIZE]; d_b = new T[ARRAY_SIZE]; d_c = new T[ARRAY_SIZE]; - forall(index_set, [=] RAJA_DEVICE (int index) - { - d_a[index] = 0.0; - d_b[index] = 0.0; - d_c[index] = 0.0; - }); #else cudaMallocManaged((void**)&d_a, sizeof(T)*ARRAY_SIZE, cudaMemAttachGlobal); cudaMallocManaged((void**)&d_b, sizeof(T)*ARRAY_SIZE, cudaMemAttachGlobal); @@ -50,12 +44,17 @@ RAJAStream::~RAJAStream() } template -void RAJAStream::write_arrays( - const std::vector& a, const std::vector& b, const std::vector& c) +void RAJAStream::init_arrays(T initA, T initB, T initC) { - std::copy(a.begin(), a.end(), d_a); - std::copy(b.begin(), b.end(), d_b); - std::copy(c.begin(), c.end(), d_c); + T* a = d_a; + T* b = d_b; + T* c = d_c; + forall(index_set, [=] RAJA_DEVICE (int index) + { + a[index] = initA; + b[index] = initB; + c[index] = initC; + }); } template @@ -115,6 +114,23 @@ void RAJAStream::triad() }); } +template +T RAJAStream::dot() +{ + T* a = d_a; + T* b = d_b; + + RAJA::ReduceSum sum(0.0); + + forall(index_set, [=] RAJA_DEVICE (int index) + { + sum += a[index] * b[index]; + }); + + return T(sum); +} + + void listDevices(void) { std::cout << "This is not the device you are looking for."; diff 
--git a/RAJAStream.hpp b/RAJAStream.hpp index 454e20e..b5cb586 100644 --- a/RAJAStream.hpp +++ b/RAJAStream.hpp @@ -18,11 +18,13 @@ typedef RAJA::IndexSet::ExecPolicy< RAJA::seq_segit, RAJA::omp_parallel_for_exec> policy; +typedef RAJA::omp_reduce reduce_policy; #else const size_t block_size = 128; typedef RAJA::IndexSet::ExecPolicy< RAJA::seq_segit, RAJA::cuda_exec> policy; +typedef RAJA::cuda_reduce reduce_policy; #endif template @@ -49,9 +51,9 @@ class RAJAStream : public Stream virtual void add() override; virtual void mul() override; virtual void triad() override; + virtual T dot() override; - virtual void write_arrays( - const std::vector& a, const std::vector& b, const std::vector& c) override; + virtual void init_arrays(T initA, T initB, T initC) override; virtual void read_arrays( std::vector& a, std::vector& b, std::vector& c) override; }; diff --git a/SYCLStream.cpp b/SYCLStream.cpp index 12e96b4..abe048c 100644 --- a/SYCLStream.cpp +++ b/SYCLStream.cpp @@ -18,14 +18,6 @@ std::vector devices; void getDeviceList(void); program * p; -/* Forward declaration of SYCL kernels */ -namespace kernels { - class copy; - class mul; - class add; - class triad; -} - template SYCLStream::SYCLStream(const unsigned int ARRAY_SIZE, const int device_index) { @@ -38,24 +30,39 @@ SYCLStream::SYCLStream(const unsigned int ARRAY_SIZE, const int device_index) throw std::runtime_error("Invalid device index"); device dev = devices[device_index]; + // Determine sensible dot kernel NDRange configuration + if (dev.is_cpu()) + { + dot_num_groups = dev.get_info(); + dot_wgsize = dev.get_info() * 2; + } + else + { + dot_num_groups = dev.get_info() * 4; + dot_wgsize = dev.get_info(); + } + // Print out device information std::cout << "Using SYCL device " << getDeviceName(device_index) << std::endl; std::cout << "Driver: " << getDeviceDriver(device_index) << std::endl; + std::cout << "Reduction kernel config: " << dot_num_groups << " groups of size " << dot_wgsize << std::endl; queue = new cl::sycl::queue(dev); /* Pre-build the kernels */ p = new program(queue->get_context()); - p->build_from_kernel_name(); - p->build_from_kernel_name(); - p->build_from_kernel_name(); - p->build_from_kernel_name(); - + p->build_from_kernel_name(); + p->build_from_kernel_name(); + p->build_from_kernel_name(); + p->build_from_kernel_name(); + p->build_from_kernel_name(); + p->build_from_kernel_name(); // Create buffers d_a = new buffer(array_size); d_b = new buffer(array_size); d_c = new buffer(array_size); + d_sum = new buffer(dot_num_groups); } template @@ -64,6 +71,7 @@ SYCLStream::~SYCLStream() delete d_a; delete d_b; delete d_c; + delete d_sum; delete p; delete queue; @@ -76,11 +84,11 @@ void SYCLStream::copy() { auto ka = d_a->template get_access(cgh); auto kc = d_c->template get_access(cgh); - cgh.parallel_for(p->get_kernel(), + cgh.parallel_for(p->get_kernel(), range<1>{array_size}, [=](item<1> item) { - auto id = item.get(); - kc[id[0]] = ka[id[0]]; + auto id = item.get()[0]; + kc[id] = ka[id]; }); }); queue->wait(); @@ -94,11 +102,11 @@ void SYCLStream::mul() { auto kb = d_b->template get_access(cgh); auto kc = d_c->template get_access(cgh); - cgh.parallel_for(p->get_kernel(), + cgh.parallel_for(p->get_kernel(), range<1>{array_size}, [=](item<1> item) { - auto id = item.get(); - kb[id[0]] = scalar * kc[id[0]]; + auto id = item.get()[0]; + kb[id] = scalar * kc[id]; }); }); queue->wait(); @@ -112,11 +120,11 @@ void SYCLStream::add() auto ka = d_a->template get_access(cgh); auto kb = d_b->template get_access(cgh); auto kc = 
d_c->template get_access(cgh); - cgh.parallel_for(p->get_kernel(), + cgh.parallel_for(p->get_kernel(), range<1>{array_size}, [=](item<1> item) { - auto id = item.get(); - kc[id[0]] = ka[id[0]] + kb[id[0]]; + auto id = item.get()[0]; + kc[id] = ka[id] + kb[id]; }); }); queue->wait(); @@ -131,28 +139,81 @@ void SYCLStream::triad() auto ka = d_a->template get_access(cgh); auto kb = d_b->template get_access(cgh); auto kc = d_c->template get_access(cgh); - cgh.parallel_for(p->get_kernel(), + cgh.parallel_for(p->get_kernel(), range<1>{array_size}, [=](item<1> item) { - auto id = item.get(); - ka[id] = kb[id[0]] + scalar * kc[id[0]]; + auto id = item.get()[0]; + ka[id] = kb[id] + scalar * kc[id]; }); }); queue->wait(); } template -void SYCLStream::write_arrays(const std::vector& a, const std::vector& b, const std::vector& c) +T SYCLStream::dot() { - auto _a = d_a->template get_access(); - auto _b = d_b->template get_access(); - auto _c = d_c->template get_access(); - for (int i = 0; i < array_size; i++) + queue->submit([&](handler &cgh) { - _a[i] = a[i]; - _b[i] = b[i]; - _c[i] = c[i]; + auto ka = d_a->template get_access(cgh); + auto kb = d_b->template get_access(cgh); + auto ksum = d_sum->template get_access(cgh); + + auto wg_sum = accessor(range<1>(dot_wgsize), cgh); + + size_t N = array_size; + + cgh.parallel_for(p->get_kernel(), + nd_range<1>(dot_num_groups*dot_wgsize, dot_wgsize), [=](nd_item<1> item) + { + size_t i = item.get_global(0); + size_t li = item.get_local(0); + size_t global_size = item.get_global_range()[0]; + + wg_sum[li] = 0.0; + for (; i < N; i += global_size) + wg_sum[li] += ka[i] * kb[i]; + + size_t local_size = item.get_local_range()[0]; + for (int offset = local_size / 2; offset > 0; offset /= 2) + { + item.barrier(cl::sycl::access::fence_space::local_space); + if (li < offset) + wg_sum[li] += wg_sum[li + offset]; + } + + if (li == 0) + ksum[item.get_group(0)] = wg_sum[0]; + }); + }); + + T sum = 0.0; + auto h_sum = d_sum->template get_access(); + for (int i = 0; i < dot_num_groups; i++) + { + sum += h_sum[i]; } + + return sum; +} + +template +void SYCLStream::init_arrays(T initA, T initB, T initC) +{ + queue->submit([&](handler &cgh) + { + auto ka = d_a->template get_access(cgh); + auto kb = d_b->template get_access(cgh); + auto kc = d_c->template get_access(cgh); + cgh.parallel_for(p->get_kernel(), + range<1>{array_size}, [=](item<1> item) + { + auto id = item.get()[0]; + ka[id] = initA; + kb[id] = initB; + kc[id] = initC; + }); + }); + queue->wait(); } template @@ -244,5 +305,5 @@ std::string getDeviceDriver(const int device) // TODO: Fix kernel names to allow multiple template specializations -//template class SYCLStream; +template class SYCLStream; template class SYCLStream; diff --git a/SYCLStream.h b/SYCLStream.h index 8bc515d..ab62ecd 100644 --- a/SYCLStream.h +++ b/SYCLStream.h @@ -15,6 +15,16 @@ #define IMPLEMENTATION_STRING "SYCL" +namespace sycl_kernels +{ + template class init; + template class copy; + template class mul; + template class add; + template class triad; + template class dot; +} + template class SYCLStream : public Stream { @@ -27,6 +37,19 @@ class SYCLStream : public Stream cl::sycl::buffer *d_a; cl::sycl::buffer *d_b; cl::sycl::buffer *d_c; + cl::sycl::buffer *d_sum; + + // SYCL kernel names + typedef sycl_kernels::init init_kernel; + typedef sycl_kernels::copy copy_kernel; + typedef sycl_kernels::mul mul_kernel; + typedef sycl_kernels::add add_kernel; + typedef sycl_kernels::triad triad_kernel; + typedef sycl_kernels::dot dot_kernel; + + // 
NDRange configuration for the dot kernel + size_t dot_num_groups; + size_t dot_wgsize; public: @@ -37,8 +60,9 @@ class SYCLStream : public Stream virtual void add() override; virtual void mul() override; virtual void triad() override; + virtual T dot() override; - virtual void write_arrays(const std::vector& a, const std::vector& b, const std::vector& c) override; + virtual void init_arrays(T initA, T initB, T initC) override; virtual void read_arrays(std::vector& a, std::vector& b, std::vector& c) override; }; diff --git a/Stream.h b/Stream.h index 631e305..ff00a54 100644 --- a/Stream.h +++ b/Stream.h @@ -29,9 +29,10 @@ class Stream virtual void mul() = 0; virtual void add() = 0; virtual void triad() = 0; + virtual T dot() = 0; // Copy memory between host and device - virtual void write_arrays(const std::vector& a, const std::vector& b, const std::vector& c) = 0; + virtual void init_arrays(T initA, T initB, T initC) = 0; virtual void read_arrays(std::vector& a, std::vector& b, std::vector& c) = 0; }; diff --git a/common.h.in b/common.h.in deleted file mode 100644 index 1b0f38b..0000000 --- a/common.h.in +++ /dev/null @@ -1,9 +0,0 @@ - -// Copyright (c) 2015-16 Tom Deakin, Simon McIntosh-Smith, -// University of Bristol HPC -// -// For full license terms please see the LICENSE file distributed with this -// source code - -#define VERSION_STRING "@gpu-stream_VERSION_MAJOR@.@gpu-stream_VERSION_MINOR@" - diff --git a/main.cpp b/main.cpp index 6a15aa7..c1ca69f 100644 --- a/main.cpp +++ b/main.cpp @@ -15,7 +15,8 @@ #include #include -#include "common.h" +#define VERSION_STRING "devel" + #include "Stream.h" #if defined(CUDA) @@ -32,10 +33,8 @@ #include "ACCStream.h" #elif defined(SYCL) #include "SYCLStream.h" -#elif defined(OMP3) -#include "OMP3Stream.h" -#elif defined(OMP45) -#include "OMP45Stream.h" +#elif defined(OMP) +#include "OMPStream.h" #endif // Default size of 2^25 @@ -45,7 +44,7 @@ unsigned int deviceIndex = 0; bool use_float = false; template -void check_solution(const unsigned int ntimes, std::vector& a, std::vector& b, std::vector& c); +void check_solution(const unsigned int ntimes, std::vector& a, std::vector& b, std::vector& c, T& sum); template void run(); @@ -61,13 +60,11 @@ int main(int argc, char *argv[]) parseArguments(argc, argv); - // TODO: Fix SYCL to allow multiple template specializations -#ifndef SYCL + // TODO: Fix Kokkos to allow multiple template specializations #ifndef KOKKOS if (use_float) run(); else -#endif #endif run(); @@ -84,9 +81,9 @@ void run() std::cout << "Precision: double" << std::endl; // Create host vectors - std::vector a(ARRAY_SIZE, startA); - std::vector b(ARRAY_SIZE, startB); - std::vector c(ARRAY_SIZE, startC); + std::vector a(ARRAY_SIZE); + std::vector b(ARRAY_SIZE); + std::vector c(ARRAY_SIZE); std::streamsize ss = std::cout.precision(); std::cout << std::setprecision(1) << std::fixed << "Array size: " << ARRAY_SIZE*sizeof(T)*1.0E-6 << " MB" @@ -95,6 +92,9 @@ void run() << " (=" << 3.0*ARRAY_SIZE*sizeof(T)*1.0E-9 << " GB)" << std::endl; std::cout.precision(ss); + // Result of the Dot kernel + T sum; + Stream *stream; #if defined(CUDA) @@ -125,20 +125,16 @@ void run() // Use the SYCL implementation stream = new SYCLStream(ARRAY_SIZE, deviceIndex); -#elif defined(OMP3) - // Use the "reference" OpenMP 3 implementation - stream = new OMP3Stream(ARRAY_SIZE, a.data(), b.data(), c.data()); - -#elif defined(OMP45) - // Use the "reference" OpenMP 3 implementation - stream = new OMP45Stream(ARRAY_SIZE, a.data(), b.data(), c.data(), deviceIndex); +#elif 
defined(OMP) + // Use the OpenMP implementation + stream = new OMPStream(ARRAY_SIZE, a.data(), b.data(), c.data(), deviceIndex); #endif - stream->write_arrays(a, b, c); + stream->init_arrays(startA, startB, startC); // List of times - std::vector> timings(4); + std::vector> timings(5); // Declare timers std::chrono::high_resolution_clock::time_point t1, t2; @@ -170,11 +166,17 @@ void run() t2 = std::chrono::high_resolution_clock::now(); timings[3].push_back(std::chrono::duration_cast >(t2 - t1).count()); + // Execute Dot + t1 = std::chrono::high_resolution_clock::now(); + sum = stream->dot(); + t2 = std::chrono::high_resolution_clock::now(); + timings[4].push_back(std::chrono::duration_cast >(t2 - t1).count()); + } // Check solutions stream->read_arrays(a, b, c); - check_solution(num_times, a, b, c); + check_solution(num_times, a, b, c, sum); // Display timing results std::cout @@ -186,15 +188,16 @@ void run() std::cout << std::fixed; - std::string labels[4] = {"Copy", "Mul", "Add", "Triad"}; - size_t sizes[4] = { + std::string labels[5] = {"Copy", "Mul", "Add", "Triad", "Dot"}; + size_t sizes[5] = { 2 * sizeof(T) * ARRAY_SIZE, 2 * sizeof(T) * ARRAY_SIZE, 3 * sizeof(T) * ARRAY_SIZE, - 3 * sizeof(T) * ARRAY_SIZE + 3 * sizeof(T) * ARRAY_SIZE, + 2 * sizeof(T) * ARRAY_SIZE }; - for (int i = 0; i < 4; i++) + for (int i = 0; i < 5; i++) { // Get min/max; ignore the first result auto minmax = std::minmax_element(timings[i].begin()+1, timings[i].end()); @@ -218,12 +221,13 @@ void run() } template -void check_solution(const unsigned int ntimes, std::vector& a, std::vector& b, std::vector& c) +void check_solution(const unsigned int ntimes, std::vector& a, std::vector& b, std::vector& c, T& sum) { // Generate correct solution T goldA = startA; T goldB = startB; T goldC = startC; + T goldSum = 0.0; const T scalar = startScalar; @@ -236,6 +240,9 @@ void check_solution(const unsigned int ntimes, std::vector& a, std::vector goldA = goldB + scalar * goldC; } + // Do the reduction + goldSum = goldA * goldB * ARRAY_SIZE; + // Calculate the average error double errA = std::accumulate(a.begin(), a.end(), 0.0, [&](double sum, const T val){ return sum + fabs(val - goldA); }); errA /= a.size(); @@ -243,6 +250,7 @@ void check_solution(const unsigned int ntimes, std::vector& a, std::vector errB /= b.size(); double errC = std::accumulate(c.begin(), c.end(), 0.0, [&](double sum, const T val){ return sum + fabs(val - goldC); }); errC /= c.size(); + double errSum = fabs(sum - goldSum); double epsi = std::numeric_limits::epsilon() * 100.0; @@ -258,6 +266,13 @@ void check_solution(const unsigned int ntimes, std::vector& a, std::vector std::cerr << "Validation failed on c[]. Average error " << errC << std::endl; + // Check sum to 8 decimal places + if (errSum > 1.0E-8) + std::cerr + << "Validation failed on sum. Error " << errSum + << std::endl << std::setprecision(15) + << "Sum was " << sum << " but should be " << goldSum + << std::endl; }