diff --git a/.gitignore b/.gitignore index a989b77..4d2865c 100644 --- a/.gitignore +++ b/.gitignore @@ -1,8 +1,20 @@ +common.h + gpu-stream-cuda gpu-stream-ocl +gpu-stream-acc +gpu-stream-omp3 +gpu-stream-omp45 +gpu-stream-sycl + *.o - *.tar - *.gz + +.DS_Store + +CMakeCache.txt +CMakeFiles/ +cmake_install.cmake +Makefile diff --git a/ACCStream.cpp b/ACCStream.cpp new file mode 100644 index 0000000..d3fbd6a --- /dev/null +++ b/ACCStream.cpp @@ -0,0 +1,141 @@ + +// Copyright (c) 2015-16 Tom Deakin, Simon McIntosh-Smith, +// University of Bristol HPC +// +// For full license terms please see the LICENSE file distributed with this +// source code + +#include "ACCStream.h" + +template +ACCStream::ACCStream(const unsigned int ARRAY_SIZE, T *a, T *b, T *c, int device) +{ + + acc_set_device_num(device, acc_device_nvidia); + + array_size = ARRAY_SIZE; + + // Set up data region on device + this->a = a; + this->b = b; + this->c = c; + #pragma acc enter data create(a[0:array_size], b[0:array_size], c[0:array_size]) + {} +} + +template +ACCStream::~ACCStream() +{ + // End data region on device + unsigned int array_size = this->array_size; + T *a = this->a; + T *b = this->b; + T *c = this->c; + #pragma acc exit data delete(a[0:array_size], b[0:array_size], c[0:array_size]) + {} +} + +template +void ACCStream::write_arrays(const std::vector& h_a, const std::vector& h_b, const std::vector& h_c) +{ + T *a = this->a; + T *b = this->b; + T *c = this->c; + #pragma acc update device(a[0:array_size], b[0:array_size], c[0:array_size]) + {} +} + +template +void ACCStream::read_arrays(std::vector& h_a, std::vector& h_b, std::vector& h_c) +{ + T *a = this->a; + T *b = this->b; + T *c = this->c; + #pragma acc update host(a[0:array_size], b[0:array_size], c[0:array_size]) + {} +} + +template +void ACCStream::copy() +{ + unsigned int array_size = this->array_size; + T * restrict a = this->a; + T * restrict c = this->c; + #pragma acc kernels present(a[0:array_size], c[0:array_size]) wait + for (int i = 0; i < array_size; i++) + { + c[i] = a[i]; + } +} + +template +void ACCStream::mul() +{ + const T scalar = 0.3; + + unsigned int array_size = this->array_size; + T * restrict b = this->b; + T * restrict c = this->c; + #pragma acc kernels present(b[0:array_size], c[0:array_size]) wait + for (int i = 0; i < array_size; i++) + { + b[i] = scalar * c[i]; + } +} + +template +void ACCStream::add() +{ + unsigned int array_size = this->array_size; + T * restrict a = this->a; + T * restrict b = this->b; + T * restrict c = this->c; + #pragma acc kernels present(a[0:array_size], b[0:array_size], c[0:array_size]) wait + for (int i = 0; i < array_size; i++) + { + c[i] = a[i] + b[i]; + } +} + +template +void ACCStream::triad() +{ + const T scalar = 0.3; + + unsigned int array_size = this->array_size; + T * restrict a = this->a; + T * restrict b = this->b; + T * restrict c = this->c; + #pragma acc kernels present(a[0:array_size], b[0:array_size], c[0:array_size]) wait + for (int i = 0; i < array_size; i++) + { + a[i] = b[i] + scalar * c[i]; + } +} +void listDevices(void) +{ + // Get number of devices + int count = acc_get_num_devices(acc_device_nvidia); + + // Print device list + if (count == 0) + { + std::cerr << "No devices found." << std::endl; + } + else + { + std::cout << "There are " << count << " devices." << std::endl; + } +} + +std::string getDeviceName(const int) +{ + return std::string("Device name unavailable"); +} + +std::string getDeviceDriver(const int) +{ + return std::string("Device driver unavailable"); +} +template class ACCStream; +template class ACCStream; diff --git a/ACCStream.h b/ACCStream.h new file mode 100644 index 0000000..48fea55 --- /dev/null +++ b/ACCStream.h @@ -0,0 +1,44 @@ + +// Copyright (c) 2015-16 Tom Deakin, Simon McIntosh-Smith, +// University of Bristol HPC +// +// For full license terms please see the LICENSE file distributed with this +// source code + +#pragma once + +#include +#include + +#include "Stream.h" + +#include + +#define IMPLEMENTATION_STRING "OpenACC" + +template +class ACCStream : public Stream +{ + protected: + // Size of arrays + unsigned int array_size; + // Device side pointers + T *a; + T *b; + T *c; + + public: + ACCStream(const unsigned int, T*, T*, T*, int); + ~ACCStream(); + + virtual void copy() override; + virtual void add() override; + virtual void mul() override; + virtual void triad() override; + + virtual void write_arrays(const std::vector& a, const std::vector& b, const std::vector& c) override; + virtual void read_arrays(std::vector& a, std::vector& b, std::vector& c) override; + + + +}; diff --git a/CL/cl2.hpp b/CL/cl2.hpp index ad0c7c4..e0f55fe 100644 --- a/CL/cl2.hpp +++ b/CL/cl2.hpp @@ -28,11 +28,11 @@ /*! \file * - * \brief C++ bindings for OpenCL 1.0 (rev 48), OpenCL 1.1 (rev 33), + * \brief C++ bindings for OpenCL 1.0 (rev 48), OpenCL 1.1 (rev 33), * OpenCL 1.2 (rev 15) and OpenCL 2.0 (rev 29) * \author Lee Howes and Bruce Merry - * - * Derived from the OpenCL 1.x C++ bindings written by + * + * Derived from the OpenCL 1.x C++ bindings written by * Benedict R. Gaster, Laurent Morichetti and Lee Howes * With additions and fixes from: * Brian Cole, March 3rd 2010 and April 2012 @@ -52,6 +52,18 @@ * #define CL_HPP_USE_DX_INTEROP * cl_khr_sub_groups * #define CL_HPP_USE_CL_SUB_GROUPS_KHR + * + * Doxygen documentation for this header is available here: + * + * http://khronosgroup.github.io/OpenCL-CLHPP/ + * + * The latest version of this header can be found on the GitHub releases page: + * + * https://github.com/KhronosGroup/OpenCL-CLHPP/releases + * + * Bugs and patches can be submitted to the GitHub repository: + * + * https://github.com/KhronosGroup/OpenCL-CLHPP */ /*! \mainpage @@ -134,41 +146,64 @@ * * \section parameterization Parameters * This header may be parameterized by a set of preprocessor macros. - * CL_HPP_TARGET_OPENCL_VERSION - * - Defines the target OpenCL runtime version to build the header against. - * Defaults to 200, representing OpenCL 2.0. - * CL_HPP_NO_STD_STRING - * - Do not use the standard library string class. - * cl::string is not defined and may be defined by the user before - * cl2.hpp is included. - * CL_HPP_NO_STD_VECTOR - * - Do not use the standard library vector class. - * cl::vector is not defined and may be defined by the user before - * cl2.hpp is included. - * CL_HPP_NO_STD_ARRAY - * - Do not use the standard library array class. - * cl::array is not defined and may be defined by the user before - * cl2.hpp is included. - * CL_HPP_NO_STD_UNIQUE_PTR - * - Do not use the standard library unique_ptr class. - * cl::pointer and the cl::allocate_pointer function are not defined - * and may be defined by the user before cl2.hpp is included. - * CL_HPP_ENABLE_DEVICE_FISSION - * - Enables device fission for OpenCL 1.2 platforms - * CL_HPP_ENABLE_EXCEPTIONS - * - Enable exceptions for use in the C++ bindings header. - * This is the preferred error handling mechanism but is not required. - * CL_HPP_ENABLE_SIZE_T_COMPATIBILITY - * - Backward compatibility option to support cl.hpp-style size_t class. - * Replaces the updated std::array derived version and removal of size_t - * from the namespace. Note that in this case the new size_t class - * is placed in the cl::compatibility namespace and thus requires - * an additional using declaration for direct backward compatibility. - * CL_HPP_ENABLE_PROGRAM_CONSTRUCTION_FROM_ARRAY_COMPATIBILITY - * - Enable older vector of pairs interface for construction of programs. - * CL_HPP_CL_1_2_DEFAULT_BUILD - * - Default to OpenCL C 1.2 compilation rather than OpenCL C 2.0 - * - applies to use of cl::Program construction and other program build variants. + * + * - CL_HPP_TARGET_OPENCL_VERSION + * + * Defines the target OpenCL runtime version to build the header + * against. Defaults to 200, representing OpenCL 2.0. + * + * - CL_HPP_NO_STD_STRING + * + * Do not use the standard library string class. cl::string is not + * defined and may be defined by the user before cl2.hpp is + * included. + * + * - CL_HPP_NO_STD_VECTOR + * + * Do not use the standard library vector class. cl::vector is not + * defined and may be defined by the user before cl2.hpp is + * included. + * + * - CL_HPP_NO_STD_ARRAY + * + * Do not use the standard library array class. cl::array is not + * defined and may be defined by the user before cl2.hpp is + * included. + * + * - CL_HPP_NO_STD_UNIQUE_PTR + * + * Do not use the standard library unique_ptr class. cl::pointer and + * the cl::allocate_pointer functions are not defined and may be + * defined by the user before cl2.hpp is included. + * + * - CL_HPP_ENABLE_DEVICE_FISSION + * + * Enables device fission for OpenCL 1.2 platforms. + * + * - CL_HPP_ENABLE_EXCEPTIONS + * + * Enable exceptions for use in the C++ bindings header. This is the + * preferred error handling mechanism but is not required. + * + * - CL_HPP_ENABLE_SIZE_T_COMPATIBILITY + * + * Backward compatibility option to support cl.hpp-style size_t + * class. Replaces the updated std::array derived version and + * removal of size_t from the namespace. Note that in this case the + * new size_t class is placed in the cl::compatibility namespace and + * thus requires an additional using declaration for direct backward + * compatibility. + * + * - CL_HPP_ENABLE_PROGRAM_CONSTRUCTION_FROM_ARRAY_COMPATIBILITY + * + * Enable older vector of pairs interface for construction of + * programs. + * + * - CL_HPP_CL_1_2_DEFAULT_BUILD + * + * Default to OpenCL C 1.2 compilation rather than OpenCL C 2.0 + * applies to use of cl::Program construction and other program + * build variants. * * * \section example Example @@ -177,19 +212,19 @@ * bindings, including support for the optional exception feature and * also the supplied vector and string classes, see following sections for * decriptions of these features. - * + * * \code #define CL_HPP_ENABLE_EXCEPTIONS #define CL_HPP_TARGET_OPENCL_VERSION 200 - + #include #include #include #include #include - + const int numElements = 32; - + int main(void) { // Filter for a 2.0 platform and set it as the default @@ -212,35 +247,45 @@ std::cout << "Error setting default platform."; return -1; } - - std::string kernel1{ - "global int globalA;" - "kernel void updateGlobal(){" - " globalA = 75;" - "}"}; - std::string kernel2{ - "typedef struct { global int *bar; } Foo; kernel void vectorAdd(global const Foo* aNum, global const int *inputA, global const int *inputB, global int *output, int val, write_only pipe int outPipe, queue_t childQueue){" - " output[get_global_id(0)] = inputA[get_global_id(0)] + inputB[get_global_id(0)] + val + *(aNum->bar);" - " write_pipe(outPipe, &val);" - " queue_t default_queue = get_default_queue(); " - " ndrange_t ndrange = ndrange_1D(get_global_size(0)/2, get_global_size(0)/2); " - // Have a child kernel write into third quarter of output - " enqueue_kernel(default_queue, CLK_ENQUEUE_FLAGS_WAIT_KERNEL, ndrange, " - " ^{" - " output[get_global_size(0)*2 + get_global_id(0)] = inputA[get_global_size(0)*2+get_global_id(0)] + inputB[get_global_size(0)*2+get_global_id(0)] + globalA;" - " });" - // Have a child kernel write into last quarter of output - " enqueue_kernel(childQueue, CLK_ENQUEUE_FLAGS_WAIT_KERNEL, ndrange, " - " ^{" - " output[get_global_size(0)*3 + get_global_id(0)] = inputA[get_global_size(0)*3 + get_global_id(0)] + inputB[get_global_size(0)*3 + get_global_id(0)] + globalA + 2;" - " });" - "}" }; + + // Use C++11 raw string literals for kernel source code + std::string kernel1{R"CLC( + global int globalA; + kernel void updateGlobal() + { + globalA = 75; + } + )CLC"}; + std::string kernel2{R"CLC( + typedef struct { global int *bar; } Foo; + kernel void vectorAdd(global const Foo* aNum, global const int *inputA, global const int *inputB, + global int *output, int val, write_only pipe int outPipe, queue_t childQueue) + { + output[get_global_id(0)] = inputA[get_global_id(0)] + inputB[get_global_id(0)] + val + *(aNum->bar); + write_pipe(outPipe, &val); + queue_t default_queue = get_default_queue(); + ndrange_t ndrange = ndrange_1D(get_global_size(0)/2, get_global_size(0)/2); + + // Have a child kernel write into third quarter of output + enqueue_kernel(default_queue, CLK_ENQUEUE_FLAGS_WAIT_KERNEL, ndrange, + ^{ + output[get_global_size(0)*2 + get_global_id(0)] = + inputA[get_global_size(0)*2 + get_global_id(0)] + inputB[get_global_size(0)*2 + get_global_id(0)] + globalA; + }); + + // Have a child kernel write into last quarter of output + enqueue_kernel(childQueue, CLK_ENQUEUE_FLAGS_WAIT_KERNEL, ndrange, + ^{ + output[get_global_size(0)*3 + get_global_id(0)] = + inputA[get_global_size(0)*3 + get_global_id(0)] + inputB[get_global_size(0)*3 + get_global_id(0)] + globalA + 2; + }); + } + )CLC"}; // New simpler string interface style std::vector programStrings {kernel1, kernel2}; - cl::Program vectorAddProgram( - programStrings); + cl::Program vectorAddProgram(programStrings); try { vectorAddProgram.build("-cl-std=CL2.0"); } @@ -251,7 +296,7 @@ for (auto &pair : buildInfo) { std::cerr << pair.second << std::endl << std::endl; } - + return 1; } @@ -264,17 +309,17 @@ program2Kernel( cl::EnqueueArgs( cl::NDRange(1))); - + ////////////////// // SVM allocations - - cl::pointer anSVMInt = cl::allocate_svm>(); + + auto anSVMInt = cl::allocate_svm>(); *anSVMInt = 5; - cl::SVMAllocator>> svmAllocReadOnly; + cl::SVMAllocator>> svmAllocReadOnly; auto fooPointer = cl::allocate_pointer(svmAllocReadOnly); fooPointer->bar = anSVMInt.get(); cl::SVMAllocator> svmAlloc; - std::vector>> inputA(numElements, 1, svmAlloc); + std::vector>> inputA(numElements, 1, svmAlloc); cl::coarse_svm_vector inputB(numElements, 2, svmAlloc); // @@ -284,7 +329,7 @@ std::vector output(numElements, 0xdeadbeef); cl::Buffer outputBuffer(begin(output), end(output), false); cl::Pipe aPipe(sizeof(cl_int), numElements / 2); - + // Default command queue, also passed in as a parameter cl::DeviceCommandQueue defaultDeviceQueue = cl::DeviceCommandQueue::makeDefault( cl::Context::getDefault(), cl::Device::getDefault()); @@ -339,7 +384,7 @@ return 0; } - * + * * \endcode * */ @@ -3538,7 +3583,7 @@ template cl::pointer> allocate_pointer(const Alloc &alloc_, Args&&... args) { Alloc alloc(alloc_); - static const size_t copies = 1; + static const size_type copies = 1; // Ensure that creation of the management block and the // object are dealt with separately such that we only provide a deleter @@ -6520,7 +6565,7 @@ inline cl_int cl::Program::getInfo(cl_program_info name, vectorresize(numBinaries); - for (int i = 0; i < numBinaries; ++i) { + for (size_type i = 0; i < numBinaries; ++i) { (*param)[i].resize(sizes[i]); } @@ -7107,7 +7152,7 @@ public: size_type buffer_slice_pitch, size_type host_row_pitch, size_type host_slice_pitch, - void *ptr, + const void *ptr, const vector* events = NULL, Event* event = NULL) const { @@ -7255,7 +7300,7 @@ public: const array& region, size_type row_pitch, size_type slice_pitch, - void* ptr, + const void* ptr, const vector* events = NULL, Event* event = NULL) const { @@ -7845,7 +7890,7 @@ public: CL_EXT_PREFIX__VERSION_1_2_DEPRECATED cl_int enqueueTask( const Kernel& kernel, const vector* events = NULL, - Event* event = NULL) CL_EXT_SUFFIX__VERSION_1_2_DEPRECATED const + Event* event = NULL) const CL_EXT_SUFFIX__VERSION_1_2_DEPRECATED { cl_event tmp; cl_int err = detail::errHandler( @@ -8873,7 +8918,7 @@ inline cl_int enqueueWriteBufferRect( size_type buffer_slice_pitch, size_type host_row_pitch, size_type host_slice_pitch, - void *ptr, + const void *ptr, const vector* events = NULL, Event* event = NULL) { @@ -8971,7 +9016,7 @@ inline cl_int enqueueWriteImage( const array& region, size_type row_pitch, size_type slice_pitch, - void* ptr, + const void* ptr, const vector* events = NULL, Event* event = NULL) { diff --git a/CMakeLists.txt b/CMakeLists.txt new file mode 100644 index 0000000..acaa2a1 --- /dev/null +++ b/CMakeLists.txt @@ -0,0 +1,164 @@ + +cmake_minimum_required(VERSION 3.2) + +set(CMAKE_CXX_STANDARD 11) +set(CMAKE_CXX_STANDARD_REQUIRED ON) + +project(gpu-stream) + +include(CheckIncludeFileCXX) +include(CheckCXXCompilerFlag) + +set(gpu-stream_VERSION_MAJOR 2) +set(gpu-stream_VERSION_MINOR 0) + +configure_file(common.h.in common.h) + +# If using the Cray compiler, manually add the C++11 flag because setting the +# standard through CMake as above doesn't set this flag with Cray +if ("${CMAKE_CXX_COMPILER_ID}" MATCHES "Cray") + list(APPEND CMAKE_CXX_FLAGS -hstd=c++11) +endif () + +#------------------------------------------------------------------------------- +# CUDA +#------------------------------------------------------------------------------- +find_package(CUDA 7.0 QUIET) +set(FLAG True) +if ("${CMAKE_SYSTEM_NAME}" MATCHES "Darwin") + execute_process(COMMAND xcodebuild -version COMMAND head -n 1 OUTPUT_VARIABLE XCODE_VERSION) + if ("${XCODE_VERSION}" MATCHES "Xcode 7.3.1") + message("Xcode version not supported by CUDA") + set(FLAG False) + endif () +endif () +if (${FLAG} AND ${CUDA_FOUND}) + list(APPEND CUDA_NVCC_FLAGS --std=c++11) + cuda_add_executable(gpu-stream-cuda main.cpp CUDAStream.cu) + target_compile_definitions(gpu-stream-cuda PUBLIC CUDA) +else () + message("Skipping CUDA...") +endif () + +#------------------------------------------------------------------------------- +# OpenCL +#------------------------------------------------------------------------------- +find_package(OpenCL QUIET) +if (${OpenCL_FOUND}) + add_executable(gpu-stream-ocl main.cpp OCLStream.cpp) + target_compile_definitions(gpu-stream-ocl PUBLIC OCL) + target_link_libraries(gpu-stream-ocl ${OpenCL_LIBRARY}) +else () + message("Skipping OpenCL...") +endif () + +#------------------------------------------------------------------------------- +# OpenACC +#------------------------------------------------------------------------------- +# Check compiler supports an OpenACC flag +include(CheckCXXCompilerFlag) +if ("${CMAKE_CXX_COMPILER_ID}" MATCHES "GNU") + CHECK_CXX_COMPILER_FLAG(-fopenacc OPENACC) + if (OPENACC) + list (APPEND CMAKE_EXE_LINKER_FLAGS -fopenacc) + endif () +elseif ("${CMAKE_CXX_COMPILER_ID}" MATCHES "PGI") + CHECK_CXX_COMPILER_FLAG(-acc OPENACC) +elseif ("${CMAKE_CXX_COMPILER_ID}" MATCHES "Cray") + CHECK_CXX_COMPILER_FLAG(-hacc=openacc OPENACC) +endif () + +if (OPENACC) + add_executable(gpu-stream-acc main.cpp ACCStream.cpp) + target_compile_definitions(gpu-stream-acc PUBLIC ACC) +else () + message("Skipping OpenACC...") +endif () + +#------------------------------------------------------------------------------- +# OpenMP 3.0 +#------------------------------------------------------------------------------- +find_package(OpenMP QUIET) +if (${OpenMP_FOUND}) + add_executable(gpu-stream-omp3 main.cpp OMP3Stream.cpp) + target_compile_definitions(gpu-stream-omp3 PUBLIC OMP3) +else () + message("Skipping OpenMP 3...") +endif () + +#------------------------------------------------------------------------------- +# OpenMP 4.5 +#------------------------------------------------------------------------------- +if ("${CMAKE_CXX_COMPILER_ID}" MATCHES "Cray") + if (CMAKE_CXX_COMPILER_VERSION VERSION_GREATER 4.5) + add_executable(gpu-stream-omp45 main.cpp OMP45Stream.cpp) + target_compile_definitions(gpu-stream-omp45 PUBLIC OMP45) + endif () +endif () + +#------------------------------------------------------------------------------- +# RAJA +#------------------------------------------------------------------------------- + +if (RAJA_PATH) + find_package(OpenMP) + find_package(CUDA 7.5) + list(APPEND CUDA_NVCC_FLAGS "-arch compute_35") + list(APPEND CUDA_NVCC_FLAGS --expt-extended-lambda) + list(APPEND CUDA_NVCC_FLAGS -Xcompiler ${OpenMP_CXX_FLAGS}) + list(APPEND CUDA_NVCC_FLAGS -DUSE_RAJA) + cuda_include_directories(${RAJA_PATH}/include) + set_source_files_properties(RAJAStream.cpp PROPERTIES CUDA_SOURCE_PROPERTY_FORMAT OBJ) + cuda_add_executable(gpu-stream-raja main.cpp RAJAStream.cpp) + target_compile_definitions(gpu-stream-raja PUBLIC USE_RAJA) + target_link_libraries(gpu-stream-raja "-L${RAJA_PATH}/lib -lRAJA") +else() + message("Skipping RAJA... (use -DRAJA_PATH=/path/to/raja to opt in)") +endif() + +#------------------------------------------------------------------------------- +# Kokkos +#------------------------------------------------------------------------------- +if (KOKKOS_PATH) + if ("${CMAKE_SYSTEM_NAME}" MATCHES "Linux") + add_custom_target(gpu-stream-kokkos COMMAND make -f KokkosMakefile KOKKOS_PATH=${KOKKOS_PATH}) + else() + message("Skipping Kokkos (requires Linux)") + endif() +else() + message("Skipping Kokkos... (use -DKOKKOS_PATH=/path/to/kokkos to opt in)") +endif() + +#------------------------------------------------------------------------------- +# SYCL +#------------------------------------------------------------------------------- +if ("${CMAKE_CXX_COMPILER_ID}" MATCHES "Clang" OR + "${CMAKE_CXX_COMPILER_ID}" MATCHES "GNU") + # Use C++14 if available, otherwise drop back to C++11 + check_cxx_compiler_flag("-std=c++14" CXX14) + if (CXX14) + set(CMAKE_REQUIRED_FLAGS "-std=c++14") + else() + set(CMAKE_REQUIRED_FLAGS "-std=c++11") + endif() +endif() + +check_include_file_cxx("CL/sycl.hpp" HAS_SYCL) +if (HAS_SYCL) + add_executable(gpu-stream-sycl main.cpp SYCLStream.cpp) + target_compile_definitions(gpu-stream-sycl PUBLIC SYCL) + + find_program(COMPUTECPP "compute++") + if (COMPUTECPP) + message(STATUS "Using ComputeCpp for SYCL compilation") + add_custom_target(SYCLStream.sycl COMMAND ${COMPUTECPP} ${CMAKE_CURRENT_SOURCE_DIR}/SYCLStream.cpp -sycl -no-serial-memop -O2 -emit-llvm -c) + add_dependencies(gpu-stream-sycl SYCLStream.sycl) + target_compile_options(gpu-stream-sycl PUBLIC -include SYCLStream.sycl) + target_link_libraries(gpu-stream-sycl SYCL OpenCL) + else() + message(STATUS "Using header-only SYCL implementation") + set_property(TARGET gpu-stream-sycl PROPERTY CXX_STANDARD 14) + endif(COMPUTECPP) +else () + message("Skipping SYCL...") +endif (HAS_SYCL) diff --git a/CUDAStream.cu b/CUDAStream.cu new file mode 100644 index 0000000..21d36fa --- /dev/null +++ b/CUDAStream.cu @@ -0,0 +1,214 @@ + +// Copyright (c) 2015-16 Tom Deakin, Simon McIntosh-Smith, +// University of Bristol HPC +// +// For full license terms please see the LICENSE file distributed with this +// source code + + +#include "CUDAStream.h" + +#define TBSIZE 1024 + +void check_error(void) +{ + cudaError_t err = cudaGetLastError(); + if (err != cudaSuccess) + { + std::cerr << "Error: " << cudaGetErrorString(err) << std::endl; + exit(err); + } +} + +template +CUDAStream::CUDAStream(const unsigned int ARRAY_SIZE, const int device_index) +{ + + // The array size must be divisible by TBSIZE for kernel launches + if (ARRAY_SIZE % TBSIZE != 0) + { + std::stringstream ss; + ss << "Array size must be a multiple of " << TBSIZE; + throw std::runtime_error(ss.str()); + } + + // Set device + int count; + cudaGetDeviceCount(&count); + check_error(); + if (device_index >= count) + throw std::runtime_error("Invalid device index"); + cudaSetDevice(device_index); + check_error(); + + // Print out device information + std::cout << "Using OpenCL device " << getDeviceName(device_index) << std::endl; + std::cout << "Driver: " << getDeviceDriver(device_index) << std::endl; + + array_size = ARRAY_SIZE; + + // Check buffers fit on the device + cudaDeviceProp props; + cudaGetDeviceProperties(&props, 0); + if (props.totalGlobalMem < 3*ARRAY_SIZE*sizeof(T)) + throw std::runtime_error("Device does not have enough memory for all 3 buffers"); + + // Create device buffers + cudaMalloc(&d_a, ARRAY_SIZE*sizeof(T)); + check_error(); + cudaMalloc(&d_b, ARRAY_SIZE*sizeof(T)); + check_error(); + cudaMalloc(&d_c, ARRAY_SIZE*sizeof(T)); + check_error(); +} + + +template +CUDAStream::~CUDAStream() +{ + cudaFree(d_a); + check_error(); + cudaFree(d_b); + check_error(); + cudaFree(d_c); + check_error(); +} + +template +void CUDAStream::write_arrays(const std::vector& a, const std::vector& b, const std::vector& c) +{ + // Copy host memory to device + cudaMemcpy(d_a, a.data(), a.size()*sizeof(T), cudaMemcpyHostToDevice); + check_error(); + cudaMemcpy(d_b, b.data(), b.size()*sizeof(T), cudaMemcpyHostToDevice); + check_error(); + cudaMemcpy(d_c, c.data(), c.size()*sizeof(T), cudaMemcpyHostToDevice); + check_error(); +} + +template +void CUDAStream::read_arrays(std::vector& a, std::vector& b, std::vector& c) +{ + // Copy device memory to host + cudaMemcpy(a.data(), d_a, a.size()*sizeof(T), cudaMemcpyDeviceToHost); + check_error(); + cudaMemcpy(b.data(), d_b, b.size()*sizeof(T), cudaMemcpyDeviceToHost); + check_error(); + cudaMemcpy(c.data(), d_c, c.size()*sizeof(T), cudaMemcpyDeviceToHost); + check_error(); +} + + +template +__global__ void copy_kernel(const T * a, T * c) +{ + const int i = blockDim.x * blockIdx.x + threadIdx.x; + c[i] = a[i]; +} + +template +void CUDAStream::copy() +{ + copy_kernel<<>>(d_a, d_c); + check_error(); + cudaDeviceSynchronize(); + check_error(); +} + +template +__global__ void mul_kernel(T * b, const T * c) +{ + const T scalar = 0.3; + const int i = blockDim.x * blockIdx.x + threadIdx.x; + b[i] = scalar * c[i]; +} + +template +void CUDAStream::mul() +{ + mul_kernel<<>>(d_b, d_c); + check_error(); + cudaDeviceSynchronize(); + check_error(); +} + +template +__global__ void add_kernel(const T * a, const T * b, T * c) +{ + const int i = blockDim.x * blockIdx.x + threadIdx.x; + c[i] = a[i] + b[i]; +} + +template +void CUDAStream::add() +{ + add_kernel<<>>(d_a, d_b, d_c); + check_error(); + cudaDeviceSynchronize(); + check_error(); +} + +template +__global__ void triad_kernel(T * a, const T * b, const T * c) +{ + const T scalar = 0.3; + const int i = blockDim.x * blockIdx.x + threadIdx.x; + a[i] = b[i] + scalar * c[i]; +} + +template +void CUDAStream::triad() +{ + triad_kernel<<>>(d_a, d_b, d_c); + check_error(); + cudaDeviceSynchronize(); + check_error(); +} + + +void listDevices(void) +{ + // Get number of devices + int count; + cudaGetDeviceCount(&count); + check_error(); + + // Print device names + if (count == 0) + { + std::cerr << "No devices found." << std::endl; + } + else + { + std::cout << std::endl; + std::cout << "Devices:" << std::endl; + for (int i = 0; i < count; i++) + { + std::cout << i << ": " << getDeviceName(i) << std::endl; + } + std::cout << std::endl; + } +} + + +std::string getDeviceName(const int device) +{ + cudaDeviceProp props; + cudaGetDeviceProperties(&props, device); + check_error(); + return std::string(props.name); +} + + +std::string getDeviceDriver(const int device) +{ + cudaSetDevice(device); + check_error(); + int driver; + cudaDriverGetVersion(&driver); + check_error(); + return std::to_string(driver); +} + +template class CUDAStream; +template class CUDAStream; diff --git a/CUDAStream.h b/CUDAStream.h new file mode 100644 index 0000000..6904a86 --- /dev/null +++ b/CUDAStream.h @@ -0,0 +1,43 @@ + +// Copyright (c) 2015-16 Tom Deakin, Simon McIntosh-Smith, +// University of Bristol HPC +// +// For full license terms please see the LICENSE file distributed with this +// source code + +#pragma once + +#include +#include +#include + +#include "Stream.h" + +#define IMPLEMENTATION_STRING "CUDA" + +template +class CUDAStream : public Stream +{ + protected: + // Size of arrays + unsigned int array_size; + // Device side pointers to arrays + T *d_a; + T *d_b; + T *d_c; + + + public: + + CUDAStream(const unsigned int, const int); + ~CUDAStream(); + + virtual void copy() override; + virtual void add() override; + virtual void mul() override; + virtual void triad() override; + + virtual void write_arrays(const std::vector& a, const std::vector& b, const std::vector& c) override; + virtual void read_arrays(std::vector& a, std::vector& b, std::vector& c) override; + +}; diff --git a/KOKKOSStream.cpp b/KOKKOSStream.cpp new file mode 100644 index 0000000..d73f7d5 --- /dev/null +++ b/KOKKOSStream.cpp @@ -0,0 +1,142 @@ +// Copyright (c) 2015-16 Tom Deakin, Simon McIntosh-Smith, +// University of Bristol HPC +// +// For full license terms please see the LICENSE file distributed with this +// source code + + +#include "KOKKOSStream.hpp" + +using namespace Kokkos; + +template +KOKKOSStream::KOKKOSStream( + const unsigned int ARRAY_SIZE, const int device_index) + : array_size(ARRAY_SIZE) +{ + Kokkos::initialize(); + + d_a = new View("d_a", ARRAY_SIZE); + d_b = new View("d_b", ARRAY_SIZE); + d_c = new View("d_c", ARRAY_SIZE); + hm_a = new View::HostMirror(); + hm_b = new View::HostMirror(); + hm_c = new View::HostMirror(); + *hm_a = create_mirror_view(*d_a); + *hm_b = create_mirror_view(*d_b); + *hm_c = create_mirror_view(*d_c); +} + +template +KOKKOSStream::~KOKKOSStream() +{ + finalize(); +} + +template +void KOKKOSStream::write_arrays( + const std::vector& a, const std::vector& b, const std::vector& c) +{ + for(int ii = 0; ii < array_size; ++ii) + { + (*hm_a)(ii) = a[ii]; + (*hm_b)(ii) = b[ii]; + (*hm_c)(ii) = c[ii]; + } + deep_copy(*d_a, *hm_a); + deep_copy(*d_b, *hm_b); + deep_copy(*d_c, *hm_c); +} + +template +void KOKKOSStream::read_arrays( + std::vector& a, std::vector& b, std::vector& c) +{ + deep_copy(*hm_a, *d_a); + deep_copy(*hm_b, *d_b); + deep_copy(*hm_c, *d_c); + for(int ii = 0; ii < array_size; ++ii) + { + a[ii] = (*hm_a)(ii); + b[ii] = (*hm_b)(ii); + c[ii] = (*hm_c)(ii); + } +} + +template +void KOKKOSStream::copy() +{ + View a(*d_a); + View b(*d_b); + View c(*d_c); + + parallel_for(array_size, KOKKOS_LAMBDA (const int index) + { + c[index] = a[index]; + }); + Kokkos::fence(); +} + +template +void KOKKOSStream::mul() +{ + View a(*d_a); + View b(*d_b); + View c(*d_c); + + const T scalar = 0.3; + parallel_for(array_size, KOKKOS_LAMBDA (const int index) + { + b[index] = scalar*c[index]; + }); + Kokkos::fence(); +} + +template +void KOKKOSStream::add() +{ + View a(*d_a); + View b(*d_b); + View c(*d_c); + + parallel_for(array_size, KOKKOS_LAMBDA (const int index) + { + c[index] = a[index] + b[index]; + }); + Kokkos::fence(); +} + +template +void KOKKOSStream::triad() +{ + View a(*d_a); + View b(*d_b); + View c(*d_c); + + const T scalar = 0.3; + parallel_for(array_size, KOKKOS_LAMBDA (const int index) + { + a[index] = b[index] + scalar*c[index]; + }); + Kokkos::fence(); +} + +void listDevices(void) +{ + std::cout << "This is not the device you are looking for."; +} + + +std::string getDeviceName(const int device) +{ + return "Kokkos"; +} + + +std::string getDeviceDriver(const int device) +{ + return "Kokkos"; +} + +//template class KOKKOSStream; +template class KOKKOSStream; diff --git a/KOKKOSStream.hpp b/KOKKOSStream.hpp new file mode 100644 index 0000000..d2b9665 --- /dev/null +++ b/KOKKOSStream.hpp @@ -0,0 +1,56 @@ +// Copyright (c) 2015-16 Tom Deakin, Simon McIntosh-Smith, +// University of Bristol HPC +// +// For full license terms please see the LICENSE file distributed with this +// source code + +#pragma once + +#include +#include + +#include +#include +#include + +#include "Stream.h" + +#define IMPLEMENTATION_STRING "KOKKOS" + +#ifdef KOKKOS_TARGET_CPU + #define DEVICE Kokkos::OpenMP +#else + #define DEVICE Kokkos::Cuda +#endif + +template +class KOKKOSStream : public Stream +{ + protected: + // Size of arrays + unsigned int array_size; + + // Device side pointers to arrays + Kokkos::View* d_a; + Kokkos::View* d_b; + Kokkos::View* d_c; + Kokkos::View::HostMirror* hm_a; + Kokkos::View::HostMirror* hm_b; + Kokkos::View::HostMirror* hm_c; + + public: + + KOKKOSStream(const unsigned int, const int); + ~KOKKOSStream(); + + virtual void copy() override; + virtual void add() override; + virtual void mul() override; + virtual void triad() override; + + virtual void write_arrays( + const std::vector& a, const std::vector& b, const std::vector& c) override; + virtual void read_arrays( + std::vector& a, std::vector& b, std::vector& c) override; +}; + diff --git a/KokkosMakefile b/KokkosMakefile new file mode 100644 index 0000000..83e00b9 --- /dev/null +++ b/KokkosMakefile @@ -0,0 +1,11 @@ + +default: gpu-stream-kokkos + +include $(KOKKOS_PATH)/Makefile.kokkos + +gpu-stream-kokkos: main.o KOKKOSStream.o + $(CXX) $(KOKKOS_LDFLAGS) $^ $(KOKKOS_LIBS) -o $@ -DKOKKOS + +%.o:%.cpp $(KOKKOS_CPP_DEPENDS) + $(NVCC_WRAPPER) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) -c $< -DKOKKOS + diff --git a/LICENSE b/LICENSE index 70cebc1..1bc1114 100644 --- a/LICENSE +++ b/LICENSE @@ -1,6 +1,6 @@ *============================================================================== *------------------------------------------------------------------------------ -* Copyright 2015: Tom Deakin, Simon McIntosh-Smith, University of Bristol HPC +* Copyright 2015-16: Tom Deakin, Simon McIntosh-Smith, University of Bristol HPC * Based on John D. McCalpin’s original STREAM benchmark for CPUs *------------------------------------------------------------------------------ * License: diff --git a/Makefile b/Makefile deleted file mode 100644 index c2bc9b2..0000000 --- a/Makefile +++ /dev/null @@ -1,41 +0,0 @@ -LDLIBS = -l OpenCL -CXXFLAGS = -std=c++11 -O3 - -PLATFORM = $(shell uname -s) -ifeq ($(PLATFORM), Darwin) - LDLIBS = -framework OpenCL -endif - -all: gpu-stream-ocl gpu-stream-cuda gpu-stream-hip - - -gpu-stream-ocl: ocl-stream.cpp common.o Makefile - $(CXX) $(CXXFLAGS) -Wno-deprecated-declarations common.o $< -o $@ $(LDLIBS) - -common.o: common.cpp common.h Makefile - -gpu-stream-cuda: cuda-stream.cu common.o Makefile -ifeq ($(shell which nvcc > /dev/null; echo $$?), 0) - nvcc $(CXXFLAGS) common.o $< -o $@ -else - $(error "Cannot find nvcc, please install CUDA toolkit") -endif -HIP_PATH?=../../.. -HIPCC=$(HIP_PATH)/bin/hipcc - -hip-stream.o : hip-stream.cpp - $(HIPCC) $(CXXFLAGS) -c $< -o $@ - -gpu-stream-hip: hip-stream.o common.o Makefile -ifeq ($(shell which $(HIPCC) > /dev/null; echo $$?), 0) - $(HIPCC) $(CXXFLAGS) common.o $< -lm -o $@ -else - $(error "Cannot find $(HIPCC), please install HIP toolkit") -endif - - -.PHONY: clean - -clean: - rm -f gpu-stream-ocl gpu-stream-cuda gpu-stream-hip *.o - diff --git a/OCLStream.cpp b/OCLStream.cpp new file mode 100644 index 0000000..50ad543 --- /dev/null +++ b/OCLStream.cpp @@ -0,0 +1,255 @@ + +// Copyright (c) 2015-16 Tom Deakin, Simon McIntosh-Smith, +// University of Bristol HPC +// +// For full license terms please see the LICENSE file distributed with this +// source code + +#include "OCLStream.h" + +// Cache list of devices +bool cached = false; +std::vector devices; +void getDeviceList(void); + +std::string kernels{R"CLC( + + constant TYPE scalar = 0.3; + + kernel void copy( + global const TYPE * restrict a, + global TYPE * restrict c) + { + const size_t i = get_global_id(0); + c[i] = a[i]; + } + + kernel void mul( + global TYPE * restrict b, + global const TYPE * restrict c) + { + const size_t i = get_global_id(0); + b[i] = scalar * c[i]; + } + + kernel void add( + global const TYPE * restrict a, + global const TYPE * restrict b, + global TYPE * restrict c) + { + const size_t i = get_global_id(0); + c[i] = a[i] + b[i]; + } + + kernel void triad( + global TYPE * restrict a, + global const TYPE * restrict b, + global const TYPE * restrict c) + { + const size_t i = get_global_id(0); + a[i] = b[i] + scalar * c[i]; + } + +)CLC"}; + + +template +OCLStream::OCLStream(const unsigned int ARRAY_SIZE, const int device_index) +{ + if (!cached) + getDeviceList(); + + // Setup default OpenCL GPU + if (device_index >= devices.size()) + throw std::runtime_error("Invalid device index"); + device = devices[device_index]; + + // Print out device information + std::cout << "Using OpenCL device " << getDeviceName(device_index) << std::endl; + std::cout << "Driver: " << getDeviceDriver(device_index) << std::endl; + + context = cl::Context(device); + queue = cl::CommandQueue(context); + + // Create program + cl::Program program(context, kernels); + if (sizeof(T) == sizeof(double)) + { + // Check device can do double + if (!device.getInfo()) + throw std::runtime_error("Device does not support double precision, please use --float"); + program.build("-DTYPE=double"); + } + else if (sizeof(T) == sizeof(float)) + program.build("-DTYPE=float"); + + // Create kernels + copy_kernel = new cl::KernelFunctor(program, "copy"); + mul_kernel = new cl::KernelFunctor(program, "mul"); + add_kernel = new cl::KernelFunctor(program, "add"); + triad_kernel = new cl::KernelFunctor(program, "triad"); + + array_size = ARRAY_SIZE; + + // Check buffers fit on the device + cl_ulong totalmem = device.getInfo(); + cl_ulong maxbuffer = device.getInfo(); + if (maxbuffer < sizeof(T)*ARRAY_SIZE) + throw std::runtime_error("Device cannot allocate a buffer big enough"); + if (totalmem < 3*sizeof(T)*ARRAY_SIZE) + throw std::runtime_error("Device does not have enough memory for all 3 buffers"); + + // Create buffers + d_a = cl::Buffer(context, CL_MEM_READ_WRITE, sizeof(T) * ARRAY_SIZE); + d_b = cl::Buffer(context, CL_MEM_READ_WRITE, sizeof(T) * ARRAY_SIZE); + d_c = cl::Buffer(context, CL_MEM_READ_WRITE, sizeof(T) * ARRAY_SIZE); + +} + +template +OCLStream::~OCLStream() +{ + delete copy_kernel; + delete mul_kernel; + delete add_kernel; + delete triad_kernel; +} + +template +void OCLStream::copy() +{ + (*copy_kernel)( + cl::EnqueueArgs(queue, cl::NDRange(array_size)), + d_a, d_c + ); + queue.finish(); +} + +template +void OCLStream::mul() +{ + (*mul_kernel)( + cl::EnqueueArgs(queue, cl::NDRange(array_size)), + d_b, d_c + ); + queue.finish(); +} + +template +void OCLStream::add() +{ + (*add_kernel)( + cl::EnqueueArgs(queue, cl::NDRange(array_size)), + d_a, d_b, d_c + ); + queue.finish(); +} + +template +void OCLStream::triad() +{ + (*triad_kernel)( + cl::EnqueueArgs(queue, cl::NDRange(array_size)), + d_a, d_b, d_c + ); + queue.finish(); +} + +template +void OCLStream::write_arrays(const std::vector& a, const std::vector& b, const std::vector& c) +{ + cl::copy(queue, a.begin(), a.end(), d_a); + cl::copy(queue, b.begin(), b.end(), d_b); + cl::copy(queue, c.begin(), c.end(), d_c); +} + +template +void OCLStream::read_arrays(std::vector& a, std::vector& b, std::vector& c) +{ + cl::copy(queue, d_a, a.begin(), a.end()); + cl::copy(queue, d_b, b.begin(), b.end()); + cl::copy(queue, d_c, c.begin(), c.end()); +} + +void getDeviceList(void) +{ + // Get list of platforms + std::vector platforms; + cl::Platform::get(&platforms); + + // Enumerate devices + for (unsigned i = 0; i < platforms.size(); i++) + { + std::vector plat_devices; + platforms[i].getDevices(CL_DEVICE_TYPE_ALL, &plat_devices); + devices.insert(devices.end(), plat_devices.begin(), plat_devices.end()); + } + cached = true; +} + +void listDevices(void) +{ + getDeviceList(); + + // Print device names + if (devices.size() == 0) + { + std::cerr << "No devices found." << std::endl; + } + else + { + std::cout << std::endl; + std::cout << "Devices:" << std::endl; + for (int i = 0; i < devices.size(); i++) + { + std::cout << i << ": " << getDeviceName(i) << std::endl; + } + std::cout << std::endl; + } + + +} + +std::string getDeviceName(const int device) +{ + if (!cached) + getDeviceList(); + + std::string name; + cl_device_info info = CL_DEVICE_NAME; + + if (device < devices.size()) + { + devices[device].getInfo(info, &name); + } + else + { + throw std::runtime_error("Error asking for name for non-existant device"); + } + + return name; + +} + +std::string getDeviceDriver(const int device) +{ + if (!cached) + getDeviceList(); + + std::string driver; + + if (device < devices.size()) + { + devices[device].getInfo(CL_DRIVER_VERSION, &driver); + } + else + { + throw std::runtime_error("Error asking for driver for non-existant device"); + } + + return driver; +} + + +template class OCLStream; +template class OCLStream; diff --git a/OCLStream.h b/OCLStream.h new file mode 100644 index 0000000..cb48da5 --- /dev/null +++ b/OCLStream.h @@ -0,0 +1,61 @@ + +// Copyright (c) 2015-16 Tom Deakin, Simon McIntosh-Smith, +// University of Bristol HPC +// +// For full license terms please see the LICENSE file distributed with this +// source code + +#pragma once + +#include +#include + +#define CL_HPP_ENABLE_EXCEPTIONS +#define CL_HPP_TARGET_OPENCL_VERSION 120 +#define CL_HPP_MINIMUM_OPENCL_VERSION 120 + +#include "CL/cl2.hpp" + +#include "Stream.h" + +#define IMPLEMENTATION_STRING "OpenCL" + +template +class OCLStream : public Stream +{ + protected: + // Size of arrays + unsigned int array_size; + + // Device side pointers to arrays + cl::Buffer d_a; + cl::Buffer d_b; + cl::Buffer d_c; + + // OpenCL objects + cl::Device device; + cl::Context context; + cl::CommandQueue queue; + + cl::KernelFunctor *copy_kernel; + cl::KernelFunctor * mul_kernel; + cl::KernelFunctor *add_kernel; + cl::KernelFunctor *triad_kernel; + + public: + + OCLStream(const unsigned int, const int); + ~OCLStream(); + + virtual void copy() override; + virtual void add() override; + virtual void mul() override; + virtual void triad() override; + + virtual void write_arrays(const std::vector& a, const std::vector& b, const std::vector& c) override; + virtual void read_arrays(std::vector& a, std::vector& b, std::vector& c) override; + +}; + +// Populate the devices list +void getDeviceList(void); diff --git a/OMP3Stream.cpp b/OMP3Stream.cpp new file mode 100644 index 0000000..fe8323a --- /dev/null +++ b/OMP3Stream.cpp @@ -0,0 +1,111 @@ + +// Copyright (c) 2015-16 Tom Deakin, Simon McIntosh-Smith, +// University of Bristol HPC +// +// For full license terms please see the LICENSE file distributed with this +// source code + +#include "OMP3Stream.h" + +template +OMP3Stream::OMP3Stream(const unsigned int ARRAY_SIZE, T *a, T *b, T *c) +{ + array_size = ARRAY_SIZE; + this->a = (T*)malloc(sizeof(T)*array_size); + this->b = (T*)malloc(sizeof(T)*array_size); + this->c = (T*)malloc(sizeof(T)*array_size); +} + +template +OMP3Stream::~OMP3Stream() +{ + free(a); + free(b); + free(c); +} + + +template +void OMP3Stream::write_arrays(const std::vector& h_a, const std::vector& h_b, const std::vector& h_c) +{ + #pragma omp parallel for + for (int i = 0; i < array_size; i++) + { + a[i] = h_a[i]; + b[i] = h_b[i]; + c[i] = h_c[i]; + } +} + +template +void OMP3Stream::read_arrays(std::vector& h_a, std::vector& h_b, std::vector& h_c) +{ + #pragma omp parallel for + for (int i = 0; i < array_size; i++) + { + h_a[i] = a[i]; + h_b[i] = b[i]; + h_c[i] = c[i]; + } +} + +template +void OMP3Stream::copy() +{ + #pragma omp parallel for + for (int i = 0; i < array_size; i++) + { + c[i] = a[i]; + } +} + +template +void OMP3Stream::mul() +{ + const T scalar = 0.3; + #pragma omp parallel for + for (int i = 0; i < array_size; i++) + { + b[i] = scalar * c[i]; + } +} + +template +void OMP3Stream::add() +{ + #pragma omp parallel for + for (int i = 0; i < array_size; i++) + { + c[i] = a[i] + b[i]; + } +} + +template +void OMP3Stream::triad() +{ + const T scalar = 0.3; + #pragma omp parallel for + for (int i = 0; i < array_size; i++) + { + a[i] = b[i] + scalar * c[i]; + } +} + +void listDevices(void) +{ + std::cout << "0: CPU" << std::endl; +} + +std::string getDeviceName(const int) +{ + return std::string("Device name unavailable"); +} + +std::string getDeviceDriver(const int) +{ + return std::string("Device driver unavailable"); +} + + +template class OMP3Stream; +template class OMP3Stream; diff --git a/OMP3Stream.h b/OMP3Stream.h new file mode 100644 index 0000000..0f14300 --- /dev/null +++ b/OMP3Stream.h @@ -0,0 +1,40 @@ + +// Copyright (c) 2015-16 Tom Deakin, Simon McIntosh-Smith, +// University of Bristol HPC +// +// For full license terms please see the LICENSE file distributed with this +// source code + +#pragma once + +#include +#include + +#include "Stream.h" + +#define IMPLEMENTATION_STRING "Reference OpenMP" + +template +class OMP3Stream : public Stream +{ + protected: + // Size of arrays + unsigned int array_size; + // Device side pointers + T *a; + T *b; + T *c; + + public: + OMP3Stream(const unsigned int, T*, T*, T*); + ~OMP3Stream(); + + virtual void copy() override; + virtual void add() override; + virtual void mul() override; + virtual void triad() override; + + virtual void write_arrays(const std::vector& a, const std::vector& b, const std::vector& c) override; + virtual void read_arrays(std::vector& a, std::vector& b, std::vector& c) override; + +}; diff --git a/OMP45Stream.cpp b/OMP45Stream.cpp new file mode 100644 index 0000000..f849c39 --- /dev/null +++ b/OMP45Stream.cpp @@ -0,0 +1,140 @@ + +// Copyright (c) 2015-16 Tom Deakin, Simon McIntosh-Smith, +// University of Bristol HPC +// +// For full license terms please see the LICENSE file distributed with this +// source code + +#include "OMP45Stream.h" + +template +OMP45Stream::OMP45Stream(const unsigned int ARRAY_SIZE, T *a, T *b, T *c, int device) +{ + omp_set_default_device(device); + + array_size = ARRAY_SIZE; + + // Set up data region on device + this->a = a; + this->b = b; + this->c = c; + #pragma omp target enter data map(to: a[0:array_size], b[0:array_size], c[0:array_size]) + {} +} + +template +OMP45Stream::~OMP45Stream() +{ + // End data region on device + unsigned int array_size = this->array_size; + T *a = this->a; + T *b = this->b; + T *c = this->c; + #pragma omp target exit data map(release: a[0:array_size], b[0:array_size], c[0:array_size]) + {} +} + +template +void OMP45Stream::write_arrays(const std::vector& h_a, const std::vector& h_b, const std::vector& h_c) +{ + T *a = this->a; + T *b = this->b; + T *c = this->c; + #pragma omp target update to(a[0:array_size], b[0:array_size], c[0:array_size]) + {} +} + +template +void OMP45Stream::read_arrays(std::vector& h_a, std::vector& h_b, std::vector& h_c) +{ + T *a = this->a; + T *b = this->b; + T *c = this->c; + #pragma omp target update from(a[0:array_size], b[0:array_size], c[0:array_size]) + {} +} + +template +void OMP45Stream::copy() +{ + unsigned int array_size = this->array_size; + T *a = this->a; + T *c = this->c; + #pragma omp target teams distribute parallel for simd map(to: a[0:array_size], c[0:array_size]) + for (int i = 0; i < array_size; i++) + { + c[i] = a[i]; + } +} + +template +void OMP45Stream::mul() +{ + const T scalar = 0.3; + + unsigned int array_size = this->array_size; + T *b = this->b; + T *c = this->c; + #pragma omp target teams distribute parallel for simd map(to: b[0:array_size], c[0:array_size]) + for (int i = 0; i < array_size; i++) + { + b[i] = scalar * c[i]; + } +} + +template +void OMP45Stream::add() +{ + unsigned int array_size = this->array_size; + T *a = this->a; + T *b = this->b; + T *c = this->c; + #pragma omp target teams distribute parallel for simd map(to: a[0:array_size], b[0:array_size], c[0:array_size]) + for (int i = 0; i < array_size; i++) + { + c[i] = a[i] + b[i]; + } +} + +template +void OMP45Stream::triad() +{ + const T scalar = 0.3; + + unsigned int array_size = this->array_size; + T *a = this->a; + T *b = this->b; + T *c = this->c; + #pragma omp target teams distribute parallel for simd map(to: a[0:array_size], b[0:array_size], c[0:array_size]) + for (int i = 0; i < array_size; i++) + { + a[i] = b[i] + scalar * c[i]; + } +} +void listDevices(void) +{ + // Get number of devices + int count = omp_get_num_devices(); + + // Print device list + if (count == 0) + { + std::cerr << "No devices found." << std::endl; + } + else + { + std::cout << "There are " << count << " devices." << std::endl; + } +} + +std::string getDeviceName(const int) +{ + return std::string("Device name unavailable"); +} + +std::string getDeviceDriver(const int) +{ + return std::string("Device driver unavailable"); +} +template class OMP45Stream; +template class OMP45Stream; diff --git a/OMP45Stream.h b/OMP45Stream.h new file mode 100644 index 0000000..bd812a1 --- /dev/null +++ b/OMP45Stream.h @@ -0,0 +1,45 @@ + +// Copyright (c) 2015-16 Tom Deakin, Simon McIntosh-Smith, +// University of Bristol HPC +// +// For full license terms please see the LICENSE file distributed with this +// source code + +#pragma once + +#include +#include + +#include "Stream.h" + +#include + +#define IMPLEMENTATION_STRING "OpenMP 4.5" + +template +class OMP45Stream : public Stream +{ + protected: + // Size of arrays + unsigned int array_size; + + // Device side pointers + T *a; + T *b; + T *c; + + public: + OMP45Stream(const unsigned int, T*, T*, T*, int); + ~OMP45Stream(); + + virtual void copy() override; + virtual void add() override; + virtual void mul() override; + virtual void triad() override; + + virtual void write_arrays(const std::vector& a, const std::vector& b, const std::vector& c) override; + virtual void read_arrays(std::vector& a, std::vector& b, std::vector& c) override; + + + +}; diff --git a/RAJAStream.cpp b/RAJAStream.cpp new file mode 100644 index 0000000..e418f09 --- /dev/null +++ b/RAJAStream.cpp @@ -0,0 +1,130 @@ + +// Copyright (c) 2015-16 Tom Deakin, Simon McIntosh-Smith, +// University of Bristol HPC +// +// For full license terms please see the LICENSE file distributed with this +// source code + +#include "RAJAStream.hpp" + +using RAJA::forall; +using RAJA::RangeSegment; + +template +RAJAStream::RAJAStream(const unsigned int ARRAY_SIZE, const int device_index) + : array_size(ARRAY_SIZE) +{ + RangeSegment seg(0, ARRAY_SIZE); + index_set.push_back(seg); + +#ifdef RAJA_TARGET_CPU + d_a = new T[ARRAY_SIZE]; + d_b = new T[ARRAY_SIZE]; + d_c = new T[ARRAY_SIZE]; +#else + cudaMallocManaged((void**)&d_a, sizeof(T)*ARRAY_SIZE, cudaMemAttachGlobal); + cudaMallocManaged((void**)&d_b, sizeof(T)*ARRAY_SIZE, cudaMemAttachGlobal); + cudaMallocManaged((void**)&d_c, sizeof(T)*ARRAY_SIZE, cudaMemAttachGlobal); + cudaDeviceSynchronize(); +#endif +} + +template +RAJAStream::~RAJAStream() +{ +#ifdef RAJA_TARGET_CPU + delete[] d_a; + delete[] d_b; + delete[] d_c; +#else + cudaFree(d_a); + cudaFree(d_b); + cudaFree(d_c); +#endif +} + +template +void RAJAStream::write_arrays( + const std::vector& a, const std::vector& b, const std::vector& c) +{ + std::copy(a.begin(), a.end(), d_a); + std::copy(b.begin(), b.end(), d_b); + std::copy(c.begin(), c.end(), d_c); +} + +template +void RAJAStream::read_arrays( + std::vector& a, std::vector& b, std::vector& c) +{ + std::copy(d_a, d_a + array_size, a.data()); + std::copy(d_b, d_b + array_size, b.data()); + std::copy(d_c, d_c + array_size, c.data()); +} + +template +void RAJAStream::copy() +{ + T* a = d_a; + T* c = d_c; + forall(index_set, [=] RAJA_DEVICE (int index) + { + c[index] = a[index]; + }); +} + +template +void RAJAStream::mul() +{ + T* b = d_b; + T* c = d_c; + const T scalar = 0.3; + forall(index_set, [=] RAJA_DEVICE (int index) + { + b[index] = scalar*c[index]; + }); +} + +template +void RAJAStream::add() +{ + T* a = d_a; + T* b = d_b; + T* c = d_c; + forall(index_set, [=] RAJA_DEVICE (int index) + { + c[index] = a[index] + b[index]; + }); +} + +template +void RAJAStream::triad() +{ + T* a = d_a; + T* b = d_b; + T* c = d_c; + const T scalar = 0.3; + forall(index_set, [=] RAJA_DEVICE (int index) + { + a[index] = b[index] + scalar*c[index]; + }); +} + +void listDevices(void) +{ + std::cout << "This is not the device you are looking for."; +} + + +std::string getDeviceName(const int device) +{ + return "RAJA"; +} + + +std::string getDeviceDriver(const int device) +{ + return "RAJA"; +} + +template class RAJAStream; +template class RAJAStream; diff --git a/RAJAStream.hpp b/RAJAStream.hpp new file mode 100644 index 0000000..454e20e --- /dev/null +++ b/RAJAStream.hpp @@ -0,0 +1,58 @@ +// Copyright (c) 2015-16 Tom Deakin, Simon McIntosh-Smith, +// University of Bristol HPC +// +// For full license terms please see the LICENSE file distributed with this +// source code + +#pragma once + +#include +#include +#include "RAJA/RAJA.hxx" + +#include "Stream.h" + +#define IMPLEMENTATION_STRING "RAJA" + +#ifdef RAJA_TARGET_CPU +typedef RAJA::IndexSet::ExecPolicy< + RAJA::seq_segit, + RAJA::omp_parallel_for_exec> policy; +#else +const size_t block_size = 128; +typedef RAJA::IndexSet::ExecPolicy< + RAJA::seq_segit, + RAJA::cuda_exec> policy; +#endif + +template +class RAJAStream : public Stream +{ + protected: + // Size of arrays + unsigned int array_size; + + // Contains iteration space + RAJA::IndexSet index_set; + + // Device side pointers to arrays + T* d_a; + T* d_b; + T* d_c; + + public: + + RAJAStream(const unsigned int, const int); + ~RAJAStream(); + + virtual void copy() override; + virtual void add() override; + virtual void mul() override; + virtual void triad() override; + + virtual void write_arrays( + const std::vector& a, const std::vector& b, const std::vector& c) override; + virtual void read_arrays( + std::vector& a, std::vector& b, std::vector& c) override; +}; + diff --git a/README.md b/README.md index f54874e..9c72f2e 100644 --- a/README.md +++ b/README.md @@ -6,24 +6,26 @@ This benchmark is similar in spirit, and based on, the STREAM benchmark [1] for Unlike other GPU memory bandwidth benchmarks this does *not* include the PCIe transfer time. +There are multiple implementations of this benchmark in a variety of programming models. +Currently implemented are: + - OpenCL + - CUDA + - OpenACC + - OpenMP 3 and 4.5 + - Kokkos + - RAJA + - SYCL + Usage ----- -Build the OpenCL and CUDA binaries with `make` (CUDA version requires CUDA >= v6.5) +CMake 3.2 or above is required. +Drivers, compiler and software applicable to whichever implementation you would like to build against. Our build system is designed to only build implementations in programming models that your system supports. -Run the OpenCL version with `./gpu-stream-ocl` and the CUDA version with `./gpu-stream-cuda` +Generate the Makefile with `cmake .` -For HIP version, follow the instructions on the following blog to properly install ROCK and ROCR drivers: -http://gpuopen.com/getting-started-with-boltzmann-components-platforms-installation/ -Install the HCC compiler: -https://bitbucket.org/multicoreware/hcc/wiki/Home -Install HIP: -https://github.com/GPUOpen-ProfessionalCompute-Tools/HIP - -Build the HIP binaries with make gpu-stream-hip, run it with './gpu-stream-hip' - -Android -------- +Android (outdated instructions) +------------------ Assuming you have a recent Android NDK available, you can use the toolchain that it provides to build GPU-STREAM. You should first @@ -57,6 +59,7 @@ Run GPU-STREAM from an adb shell: # Use float if device doesn't support double, and reduce array size ./gpu-stream-ocl --float -n 6 -s 10000000 + Results ------- diff --git a/SYCLStream.cpp b/SYCLStream.cpp new file mode 100644 index 0000000..d039d70 --- /dev/null +++ b/SYCLStream.cpp @@ -0,0 +1,231 @@ + +// Copyright (c) 2015-16 Tom Deakin, Simon McIntosh-Smith, +// University of Bristol HPC +// +// For full license terms please see the LICENSE file distributed with this +// source code + +#include "SYCLStream.h" + +#include + +using namespace cl::sycl; + +#define WGSIZE 64 + +// Cache list of devices +bool cached = false; +std::vector devices; +void getDeviceList(void); + +template +SYCLStream::SYCLStream(const unsigned int ARRAY_SIZE, const int device_index) +{ + if (!cached) + getDeviceList(); + + // The array size must be divisible by WGSIZE + if (ARRAY_SIZE % WGSIZE != 0) + { + std::stringstream ss; + ss << "Array size must be a multiple of " << WGSIZE; + throw std::runtime_error(ss.str()); + } + + array_size = ARRAY_SIZE; + + if (device_index >= devices.size()) + throw std::runtime_error("Invalid device index"); + device dev = devices[device_index]; + + // Print out device information + std::cout << "Using SYCL device " << getDeviceName(device_index) << std::endl; + std::cout << "Driver: " << getDeviceDriver(device_index) << std::endl; + + queue = new cl::sycl::queue(dev); + + // Create buffers + d_a = new buffer(array_size); + d_b = new buffer(array_size); + d_c = new buffer(array_size); +} + +template +SYCLStream::~SYCLStream() +{ + delete d_a; + delete d_b; + delete d_c; + + delete queue; +} + +template +void SYCLStream::copy() +{ + queue->submit([&](handler &cgh) + { + auto ka = d_a->template get_access(cgh); + auto kc = d_c->template get_access(cgh); + cgh.parallel_for(nd_range<1>{array_size, WGSIZE}, [=](nd_item<1> item) + { + kc[item.get_global()] = ka[item.get_global()]; + }); + }); + queue->wait(); +} + +template +void SYCLStream::mul() +{ + const T scalar = 0.3; + queue->submit([&](handler &cgh) + { + auto kb = d_b->template get_access(cgh); + auto kc = d_c->template get_access(cgh); + cgh.parallel_for(nd_range<1>{array_size, WGSIZE}, [=](nd_item<1> item) + { + kb[item.get_global()] = scalar * kc[item.get_global()]; + }); + }); + queue->wait(); +} + +template +void SYCLStream::add() +{ + queue->submit([&](handler &cgh) + { + auto ka = d_a->template get_access(cgh); + auto kb = d_b->template get_access(cgh); + auto kc = d_c->template get_access(cgh); + cgh.parallel_for(nd_range<1>{array_size, WGSIZE}, [=](nd_item<1> item) + { + kc[item.get_global()] = ka[item.get_global()] + kb[item.get_global()]; + }); + }); + queue->wait(); +} + +template +void SYCLStream::triad() +{ + const T scalar = 0.3; + queue->submit([&](handler &cgh) + { + auto ka = d_a->template get_access(cgh); + auto kb = d_b->template get_access(cgh); + auto kc = d_c->template get_access(cgh); + cgh.parallel_for(nd_range<1>{array_size, WGSIZE}, [=](nd_item<1> item) + { + ka[item.get_global()] = kb[item.get_global()] + scalar * kc[item.get_global()]; + }); + }); + queue->wait(); +} + +template +void SYCLStream::write_arrays(const std::vector& a, const std::vector& b, const std::vector& c) +{ + auto _a = d_a->template get_access(); + auto _b = d_b->template get_access(); + auto _c = d_c->template get_access(); + for (int i = 0; i < array_size; i++) + { + _a[i] = a[i]; + _b[i] = b[i]; + _c[i] = c[i]; + } +} + +template +void SYCLStream::read_arrays(std::vector& a, std::vector& b, std::vector& c) +{ + auto _a = d_a->template get_access(); + auto _b = d_b->template get_access(); + auto _c = d_c->template get_access(); + for (int i = 0; i < array_size; i++) + { + a[i] = _a[i]; + b[i] = _b[i]; + c[i] = _c[i]; + } +} + +void getDeviceList(void) +{ + // Get list of platforms + std::vector platforms = platform::get_platforms(); + + // Enumerate devices + for (unsigned i = 0; i < platforms.size(); i++) + { + std::vector plat_devices = platforms[i].get_devices(); + devices.insert(devices.end(), plat_devices.begin(), plat_devices.end()); + } + cached = true; +} + +void listDevices(void) +{ + getDeviceList(); + + // Print device names + if (devices.size() == 0) + { + std::cerr << "No devices found." << std::endl; + } + else + { + std::cout << std::endl; + std::cout << "Devices:" << std::endl; + for (int i = 0; i < devices.size(); i++) + { + std::cout << i << ": " << getDeviceName(i) << std::endl; + } + std::cout << std::endl; + } +} + +std::string getDeviceName(const int device) +{ + if (!cached) + getDeviceList(); + + std::string name; + + if (device < devices.size()) + { + name = devices[device].get_info(); + } + else + { + throw std::runtime_error("Error asking for name for non-existant device"); + } + + return name; +} + +std::string getDeviceDriver(const int device) +{ + if (!cached) + getDeviceList(); + + std::string driver; + + if (device < devices.size()) + { + driver = devices[device].get_info(); + } + else + { + throw std::runtime_error("Error asking for driver for non-existant device"); + } + + return driver; +} + + +// TODO: Fix kernel names to allow multiple template specializations +//template class SYCLStream; +template class SYCLStream; diff --git a/SYCLStream.h b/SYCLStream.h new file mode 100644 index 0000000..8bc515d --- /dev/null +++ b/SYCLStream.h @@ -0,0 +1,47 @@ + +// Copyright (c) 2015-16 Tom Deakin, Simon McIntosh-Smith, +// University of Bristol HPC +// +// For full license terms please see the LICENSE file distributed with this +// source code + +#pragma once + +#include + +#include "Stream.h" + +#include "CL/sycl.hpp" + +#define IMPLEMENTATION_STRING "SYCL" + +template +class SYCLStream : public Stream +{ + protected: + // Size of arrays + unsigned int array_size; + + // SYCL objects + cl::sycl::queue *queue; + cl::sycl::buffer *d_a; + cl::sycl::buffer *d_b; + cl::sycl::buffer *d_c; + + public: + + SYCLStream(const unsigned int, const int); + ~SYCLStream(); + + virtual void copy() override; + virtual void add() override; + virtual void mul() override; + virtual void triad() override; + + virtual void write_arrays(const std::vector& a, const std::vector& b, const std::vector& c) override; + virtual void read_arrays(std::vector& a, std::vector& b, std::vector& c) override; + +}; + +// Populate the devices list +void getDeviceList(void); diff --git a/Stream.h b/Stream.h new file mode 100644 index 0000000..671289e --- /dev/null +++ b/Stream.h @@ -0,0 +1,37 @@ + +// Copyright (c) 2015-16 Tom Deakin, Simon McIntosh-Smith, +// University of Bristol HPC +// +// For full license terms please see the LICENSE file distributed with this +// source code + +#pragma once + +#include +#include + +template +class Stream +{ + public: + + virtual ~Stream(){} + + // Kernels + // These must be blocking calls + virtual void copy() = 0; + virtual void mul() = 0; + virtual void add() = 0; + virtual void triad() = 0; + + // Copy memory between host and device + virtual void write_arrays(const std::vector& a, const std::vector& b, const std::vector& c) = 0; + virtual void read_arrays(std::vector& a, std::vector& b, std::vector& c) = 0; + +}; + + +// Implementation specific device functions +void listDevices(void); +std::string getDeviceName(const int); +std::string getDeviceDriver(const int); diff --git a/common.cpp b/common.cpp deleted file mode 100644 index 3a6c56f..0000000 --- a/common.cpp +++ /dev/null @@ -1,165 +0,0 @@ -/*============================================================================= -*------------------------------------------------------------------------------ -* Copyright 2015: Tom Deakin, Simon McIntosh-Smith, University of Bristol HPC -* Based on John D. McCalpin’s original STREAM benchmark for CPUs -*------------------------------------------------------------------------------ -* License: -* 1. You are free to use this program and/or to redistribute -* this program. -* 2. You are free to modify this program for your own use, -* including commercial use, subject to the publication -* restrictions in item 3. -* 3. You are free to publish results obtained from running this -* program, or from works that you derive from this program, -* with the following limitations: -* 3a. In order to be referred to as "GPU-STREAM benchmark results", -* published results must be in conformance to the GPU-STREAM -* Run Rules published at -* http://github.com/UoB-HPC/GPU-STREAM/wiki/Run-Rules -* and incorporated herein by reference. -* The copyright holders retain the -* right to determine conformity with the Run Rules. -* 3b. Results based on modified source code or on runs not in -* accordance with the GPU-STREAM Run Rules must be clearly -* labelled whenever they are published. Examples of -* proper labelling include: -* "tuned GPU-STREAM benchmark results" -* "based on a variant of the GPU-STREAM benchmark code" -* Other comparable, clear and reasonable labelling is -* acceptable. -* 3c. Submission of results to the GPU-STREAM benchmark web site -* is encouraged, but not required. -* 4. Use of this program or creation of derived works based on this -* program constitutes acceptance of these licensing restrictions. -* 5. Absolutely no warranty is expressed or implied. -*———————————————————————————————————-----------------------------------------*/ - -#include "common.h" - -// Default array size 50 * 2^20 (50*8 Mebibytes double precision) -// Use binary powers of two so divides 1024 -unsigned int ARRAY_SIZE = 52428800; -size_t ARRAY_PAD_BYTES = 0; - -unsigned int NTIMES = 10; - -bool useFloat = false; -unsigned int groups = 0; -unsigned int groupSize = 1024; - -unsigned int deviceIndex = 0; - -int parseUInt(const char *str, unsigned int *output) -{ - char *next; - *output = strtoul(str, &next, 10); - return !strlen(next); -} - -int parseSize(const char *str, size_t *output) -{ - char *next; - *output = strtoull(str, &next, 0); - int l = strlen(str); - if (l) { - char c = str[l-1]; // last char. - if ((c == 'k') || (c == 'K')) { - *output *= 1024; - } - if ((c == 'm') || (c == 'M')) { - *output *= (1024*1024); - } - - } - return !strlen(next); -} - - -void parseArguments(int argc, char *argv[]) -{ - for (int i = 1; i < argc; i++) - { - if (!strcmp(argv[i], "--list")) - { - listDevices(); - exit(0); - } - else if (!strcmp(argv[i], "--device")) - { - if (++i >= argc || !parseUInt(argv[i], &deviceIndex)) - { - std::cout << "Invalid device index" << std::endl; - exit(1); - } - } - else if (!strcmp(argv[i], "--arraysize") || !strcmp(argv[i], "-s")) - { - if (++i >= argc || !parseUInt(argv[i], &ARRAY_SIZE)) - { - std::cout << "Invalid array size" << std::endl; - exit(1); - } - } - else if (!strcmp(argv[i], "--numtimes") || !strcmp(argv[i], "-n")) - { - if (++i >= argc || !parseUInt(argv[i], &NTIMES)) - { - std::cout << "Invalid number of times" << std::endl; - exit(1); - } - } - else if (!strcmp(argv[i], "--groups")) - { - if (++i >= argc || !parseUInt(argv[i], &groups)) - { - std::cout << "Invalid group number" << std::endl; - exit(1); - } - } - else if (!strcmp(argv[i], "--groupSize")) - { - if (++i >= argc || !parseUInt(argv[i], &groupSize)) - { - std::cout << "Invalid group size" << std::endl; - exit(1); - } - } - else if (!strcmp(argv[i], "--pad")) - { - if (++i >= argc || !parseSize(argv[i], &ARRAY_PAD_BYTES)) - { - std::cout << "Invalid size" << std::endl; - exit(1); - } - - } - else if (!strcmp(argv[i], "--float")) - { - useFloat = true; - std::cout << "Warning: If number of iterations set >= 8, expect rounding errors with single precision, not apply to AMD device" << std::endl; - } - else if (!strcmp(argv[i], "--help") || !strcmp(argv[i], "-h")) - { - std::cout << std::endl; - std::cout << "Usage: ./gpu-stream-cuda [OPTIONS]" << std::endl << std::endl; - std::cout << "Options:" << std::endl; - std::cout << " -h --help Print the message" << std::endl; - std::cout << " --list List available devices" << std::endl; - std::cout << " --device INDEX Select device at INDEX" << std::endl; - std::cout << " -s --arraysize SIZE Use SIZE elements in the array" << std::endl; - std::cout << " -n --numtimes NUM Run the test NUM times (NUM >= 2)" << std::endl; - std::cout << " --groups Set number of groups to launch - each work-item proceses multiple array items" << std::endl; - std::cout << " --groupSize Set size of each group (default 1024)" << std::endl; - std::cout << " --pad Add additional array padding. Can use trailing K (KB) or M (MB)" << std::endl; - std::cout << " --float Use floats (rather than doubles)" << std::endl; - std::cout << std::endl; - exit(0); - } - else - { - std::cout << "Unrecognized argument '" << argv[i] << "' (try '--help')" - << std::endl; - exit(1); - } - } -} diff --git a/common.h b/common.h deleted file mode 100644 index 9cf61d7..0000000 --- a/common.h +++ /dev/null @@ -1,115 +0,0 @@ -/*============================================================================= -*------------------------------------------------------------------------------ -* Copyright 2015: Tom Deakin, Simon McIntosh-Smith, University of Bristol HPC -* Based on John D. McCalpin’s original STREAM benchmark for CPUs -*------------------------------------------------------------------------------ -* License: -* 1. You are free to use this program and/or to redistribute -* this program. -* 2. You are free to modify this program for your own use, -* including commercial use, subject to the publication -* restrictions in item 3. -* 3. You are free to publish results obtained from running this -* program, or from works that you derive from this program, -* with the following limitations: -* 3a. In order to be referred to as "GPU-STREAM benchmark results", -* published results must be in conformance to the GPU-STREAM -* Run Rules published at -* http://github.com/UoB-HPC/GPU-STREAM/wiki/Run-Rules -* and incorporated herein by reference. -* The copyright holders retain the -* right to determine conformity with the Run Rules. -* 3b. Results based on modified source code or on runs not in -* accordance with the GPU-STREAM Run Rules must be clearly -* labelled whenever they are published. Examples of -* proper labelling include: -* "tuned GPU-STREAM benchmark results" -* "based on a variant of the GPU-STREAM benchmark code" -* Other comparable, clear and reasonable labelling is -* acceptable. -* 3c. Submission of results to the GPU-STREAM benchmark web site -* is encouraged, but not required. -* 4. Use of this program or creation of derived works based on this -* program constitutes acceptance of these licensing restrictions. -* 5. Absolutely no warranty is expressed or implied. -*———————————————————————————————————-----------------------------------------*/ - -#include -#include -#include -#include -#include -#include - -#define VERSION_STRING "1.0" - -extern void parseArguments(int argc, char *argv[]); - -extern void listDevices(void); - -extern unsigned int ARRAY_SIZE; -extern size_t ARRAY_PAD_BYTES; -extern unsigned int NTIMES; - -extern unsigned int groups; -extern unsigned int groupSize; -extern bool useFloat; - -extern unsigned int deviceIndex; - - -template < typename T > -void check_solution(void* a_in, void* b_in, void* c_in) -{ - // Generate correct solution - T golda = 1.0; - T goldb = 2.0; - T goldc = 0.0; - - T * a = static_cast(a_in); - T * b = static_cast(b_in); - T * c = static_cast(c_in); - - const T scalar = 3.0; - - for (unsigned int i = 0; i < NTIMES; i++) - { - // Double - goldc = golda; - goldb = scalar * goldc; - goldc = golda + goldb; - golda = goldb + scalar * goldc; - } - - // Calculate average error - double erra = 0.0; - double errb = 0.0; - double errc = 0.0; - - for (unsigned int i = 0; i < ARRAY_SIZE; i++) - { - erra += fabs(a[i] - golda); - errb += fabs(b[i] - goldb); - errc += fabs(c[i] - goldc); - } - - erra /= ARRAY_SIZE; - errb /= ARRAY_SIZE; - errc /= ARRAY_SIZE; - - double epsi = std::numeric_limits::epsilon() * 100; - - if (erra > epsi) - std::cout - << "Validation failed on a[]. Average error " << erra - << std::endl; - if (errb > epsi) - std::cout - << "Validation failed on b[]. Average error " << errb - << std::endl; - if (errc > epsi) - std::cout - << "Validation failed on c[]. Average error " << errc - << std::endl; -} - diff --git a/common.h.in b/common.h.in new file mode 100644 index 0000000..1b0f38b --- /dev/null +++ b/common.h.in @@ -0,0 +1,9 @@ + +// Copyright (c) 2015-16 Tom Deakin, Simon McIntosh-Smith, +// University of Bristol HPC +// +// For full license terms please see the LICENSE file distributed with this +// source code + +#define VERSION_STRING "@gpu-stream_VERSION_MAJOR@.@gpu-stream_VERSION_MINOR@" + diff --git a/cuda-stream.cu b/cuda-stream.cu deleted file mode 100644 index ea067fb..0000000 --- a/cuda-stream.cu +++ /dev/null @@ -1,507 +0,0 @@ -/*============================================================================= -*------------------------------------------------------------------------------ -* Copyright 2015: Tom Deakin, Simon McIntosh-Smith, University of Bristol HPC -* Based on John D. McCalpin’s original STREAM benchmark for CPUs -*------------------------------------------------------------------------------ -* License: -* 1. You are free to use this program and/or to redistribute -* this program. -* 2. You are free to modify this program for your own use, -* including commercial use, subject to the publication -* restrictions in item 3. -* 3. You are free to publish results obtained from running this -* program, or from works that you derive from this program, -* with the following limitations: -* 3a. In order to be referred to as "GPU-STREAM benchmark results", -* published results must be in conformance to the GPU-STREAM -* Run Rules published at -* http://github.com/UoB-HPC/GPU-STREAM/wiki/Run-Rules -* and incorporated herein by reference. -* The copyright holders retain the -* right to determine conformity with the Run Rules. -* 3b. Results based on modified source code or on runs not in -* accordance with the GPU-STREAM Run Rules must be clearly -* labelled whenever they are published. Examples of -* proper labelling include: -* "tuned GPU-STREAM benchmark results" -* "based on a variant of the GPU-STREAM benchmark code" -* Other comparable, clear and reasonable labelling is -* acceptable. -* 3c. Submission of results to the GPU-STREAM benchmark web site -* is encouraged, but not required. -* 4. Use of this program or creation of derived works based on this -* program constitutes acceptance of these licensing restrictions. -* 5. Absolutely no warranty is expressed or implied. -*———————————————————————————————————-----------------------------------------*/ - - -#include -#include -#include -#include -#include -#include - -#include -#include "common.h" - -std::string getDeviceName(int device); -int getDriver(void); - -// Code to check CUDA errors -void check_cuda_error(void) -{ - cudaError_t err = cudaGetLastError(); - if (err != cudaSuccess) - { - std::cerr - << "Error: " - << cudaGetErrorString(err) - << std::endl; - exit(err); - } -} - -// looper function place more work inside each work item. -// Goal is reduce the dispatch overhead for each group, and also give more controlover the order of memory operations -template -__global__ void -copy_looper(const T * a, T * c, int ARRAY_SIZE) -{ - int offset = (blockDim.x * blockIdx.x + threadIdx.x); - int stride = blockDim.x * gridDim.x; - - for (int i=offset; i -__global__ void -mul_looper(T * b, const T * c, int ARRAY_SIZE) -{ - int offset = blockDim.x * blockIdx.x + threadIdx.x; - int stride = blockDim.x * gridDim.x; - const T scalar = 3.0; - - for (int i=offset; i -__global__ void -add_looper(const T * a, const T * b, T * c, int ARRAY_SIZE) -{ - int offset = blockDim.x * blockIdx.x + threadIdx.x; - int stride = blockDim.x * gridDim.x; - - for (int i=offset; i -__global__ void -triad_looper( T * a, const T * b, const T * c, int ARRAY_SIZE) -{ - int offset = blockDim.x * blockIdx.x + threadIdx.x; - int stride = blockDim.x * gridDim.x; - const T scalar = 3.0; - - for (int i=offset; i -__global__ void copy(const T * a, T * c) -{ - const int i = blockDim.x * blockIdx.x + threadIdx.x; - c[i] = a[i]; -} - -template -__global__ void mul(T * b, const T * c) -{ - const T scalar = 3.0; - const int i = blockDim.x * blockIdx.x + threadIdx.x; - b[i] = scalar * c[i]; -} - -template -__global__ void add(const T * a, const T * b, T * c) -{ - const int i = blockDim.x * blockIdx.x + threadIdx.x; - c[i] = a[i] + b[i]; -} - -template -__global__ void triad(T * a, const T * b, const T * c) -{ - const T scalar = 3.0; - const int i = blockDim.x * blockIdx.x + threadIdx.x; - a[i] = b[i] + scalar * c[i]; -} - -int main(int argc, char *argv[]) -{ - - // Print out run information - std::cout - << "GPU-STREAM" << std::endl - << "Version: " << VERSION_STRING << std::endl - << "Implementation: CUDA" << std::endl; - - parseArguments(argc, argv); - - if (NTIMES < 2) - throw std::runtime_error("Chosen number of times is invalid, must be >= 2"); - - // Config grid size and group size for kernel launching - int gridSize; - if (groups) { - gridSize = groups * groupSize; - } else { - gridSize = ARRAY_SIZE; - } - - float operationsPerWorkitem = (float)ARRAY_SIZE / (float)gridSize; - std::cout << "GridSize: " << gridSize << " work-items" << std::endl; - std::cout << "GroupSize: " << groupSize << " work-items" << std::endl; - std::cout << "Operations/Work-item: " << operationsPerWorkitem << std::endl; - if (groups) std::cout << "Using looper kernels:" << std::endl; - - std::cout << "Precision: "; - if (useFloat) std::cout << "float"; - else std::cout << "double"; - std::cout << std::endl << std::endl; - - std::cout << "Running kernels " << NTIMES << " times" << std::endl; - - if (ARRAY_SIZE % 1024 != 0) - { - unsigned int OLD_ARRAY_SIZE = ARRAY_SIZE; - ARRAY_SIZE -= ARRAY_SIZE % 1024; - std::cout - << "Warning: array size must divide 1024" << std::endl - << "Resizing array from " << OLD_ARRAY_SIZE - << " to " << ARRAY_SIZE << std::endl; - if (ARRAY_SIZE == 0) - throw std::runtime_error("Array size must be >= 1024"); - } - - // Get precision (used to reset later) - std::streamsize ss = std::cout.precision(); - - size_t DATATYPE_SIZE; - - if (useFloat) - { - DATATYPE_SIZE = sizeof(float); - } - else - { - DATATYPE_SIZE = sizeof(double); - } - - // Display number of bytes in array - std::cout << std::setprecision(1) << std::fixed - << "Array size: " << ARRAY_SIZE*DATATYPE_SIZE/1024.0/1024.0 << " MB" - << " (=" << ARRAY_SIZE*DATATYPE_SIZE/1024.0/1024.0/1024.0 << " GB)" - << std::endl; - std::cout << "Total size: " << 3.0*ARRAY_SIZE*DATATYPE_SIZE/1024.0/1024.0 << " MB" - << " (=" << 3.0*ARRAY_SIZE*DATATYPE_SIZE/1024.0/1024.0/1024.0 << " GB)" - << std::endl; - - // Reset precision - std::cout.precision(ss); - - // Check device index is in range - int count; - cudaGetDeviceCount(&count); - check_cuda_error(); - if (deviceIndex >= count) - throw std::runtime_error("Chosen device index is invalid"); - cudaSetDevice(deviceIndex); - check_cuda_error(); - - // Print out device name - std::cout << "Using CUDA device " << getDeviceName(deviceIndex) << std::endl; - - // Print out device CUDA driver version - std::cout << "Driver: " << getDriver() << std::endl; - - // Check buffers fit on the device - cudaDeviceProp props; - cudaGetDeviceProperties(&props, deviceIndex); - if (props.totalGlobalMem < 3*DATATYPE_SIZE*ARRAY_SIZE) - throw std::runtime_error("Device does not have enough memory for all 3 buffers"); - - // Create host vectors - void * h_a = malloc(ARRAY_SIZE*DATATYPE_SIZE); - void * h_b = malloc(ARRAY_SIZE*DATATYPE_SIZE); - void * h_c = malloc(ARRAY_SIZE*DATATYPE_SIZE); - - // Initilise arrays - for (unsigned int i = 0; i < ARRAY_SIZE; i++) - { - if (useFloat) - { - ((float*)h_a)[i] = 1.0f; - ((float*)h_b)[i] = 2.0f; - ((float*)h_c)[i] = 0.0f; - } - else - { - ((double*)h_a)[i] = 1.0; - ((double*)h_b)[i] = 2.0; - ((double*)h_c)[i] = 0.0; - } - } - - // Create device buffers - void * d_a, * d_b, *d_c; - cudaMalloc(&d_a, ARRAY_SIZE*DATATYPE_SIZE); - check_cuda_error(); - cudaMalloc(&d_b, ARRAY_SIZE*DATATYPE_SIZE); - check_cuda_error(); - cudaMalloc(&d_c, ARRAY_SIZE*DATATYPE_SIZE); - check_cuda_error(); - - // Copy host memory to device - cudaMemcpy(d_a, h_a, ARRAY_SIZE*DATATYPE_SIZE, cudaMemcpyHostToDevice); - check_cuda_error(); - cudaMemcpy(d_b, h_b, ARRAY_SIZE*DATATYPE_SIZE, cudaMemcpyHostToDevice); - check_cuda_error(); - cudaMemcpy(d_c, h_c, ARRAY_SIZE*DATATYPE_SIZE, cudaMemcpyHostToDevice); - check_cuda_error(); - - std::cout << "d_a=" << (void*)d_a << std::endl; - std::cout << "d_b=" << (void*)d_b << std::endl; - std::cout << "d_c=" << (void*)d_c << std::endl; - - // Make sure the copies are finished - cudaDeviceSynchronize(); - check_cuda_error(); - - // List of times - std::vector< std::vector > timings; - - // Declare timers - std::chrono::high_resolution_clock::time_point t1, t2; - - // Main loop - for (unsigned int k = 0; k < NTIMES; k++) - { - std::vector times; - t1 = std::chrono::high_resolution_clock::now(); - if (groups) { - if (useFloat) - copy_looper<<>>((float*)d_a, (float*)d_c, ARRAY_SIZE); - else - copy_looper<<>>((double*)d_a, (double*)d_c, ARRAY_SIZE); - } else { - if (useFloat) - copy<<>>((float*)d_a, (float*)d_c); - else - copy<<>>((double*)d_a, (double*)d_c); - } - - check_cuda_error(); - cudaDeviceSynchronize(); - check_cuda_error(); - t2 = std::chrono::high_resolution_clock::now(); - times.push_back(std::chrono::duration_cast >(t2 - t1).count()); - - - t1 = std::chrono::high_resolution_clock::now(); - if (groups) { - if (useFloat) - mul_looper<<>>((float*)d_b, (float*)d_c, ARRAY_SIZE); - else - mul_looper<<>>((double*)d_b, (double*)d_c, ARRAY_SIZE); - } else { - if (useFloat) - mul<<>>((float*)d_b, (float*)d_c); - else - mul<<>>((double*)d_b, (double*)d_c); - } - check_cuda_error(); - cudaDeviceSynchronize(); - check_cuda_error(); - t2 = std::chrono::high_resolution_clock::now(); - times.push_back(std::chrono::duration_cast >(t2 - t1).count()); - - - t1 = std::chrono::high_resolution_clock::now(); - if (groups) { - if (useFloat) - add_looper<<>>((float*)d_a, (float*)d_b, (float*)d_c, ARRAY_SIZE); - else - add_looper<<>>((double*)d_a, (double*)d_b, (double*)d_c, ARRAY_SIZE); - } else { - if (useFloat) - add<<>>((float*)d_a, (float*)d_b, (float*)d_c); - else - add<<>>((double*)d_a, (double*)d_b, (double*)d_c); - } - check_cuda_error(); - cudaDeviceSynchronize(); - check_cuda_error(); - t2 = std::chrono::high_resolution_clock::now(); - times.push_back(std::chrono::duration_cast >(t2 - t1).count()); - - - t1 = std::chrono::high_resolution_clock::now(); - if (groups) { - if (useFloat) - triad_looper<<>>((float*)d_a, (float*)d_b, (float*)d_c, ARRAY_SIZE); - else - triad_looper<<>>((double*)d_a, (double*)d_b, (double*)d_c, ARRAY_SIZE); - } else { - if (useFloat) - triad<<>>((float*)d_a, (float*)d_b, (float*)d_c); - else - triad<<>>((double*)d_a, (double*)d_b, (double*)d_c); - } - check_cuda_error(); - cudaDeviceSynchronize(); - check_cuda_error(); - t2 = std::chrono::high_resolution_clock::now(); - times.push_back(std::chrono::duration_cast >(t2 - t1).count()); - - timings.push_back(times); - - } - - // Check solutions - cudaMemcpy(h_a, d_a, ARRAY_SIZE*DATATYPE_SIZE, cudaMemcpyDeviceToHost); - check_cuda_error(); - cudaMemcpy(h_b, d_b, ARRAY_SIZE*DATATYPE_SIZE, cudaMemcpyDeviceToHost); - check_cuda_error(); - cudaMemcpy(h_c, d_c, ARRAY_SIZE*DATATYPE_SIZE, cudaMemcpyDeviceToHost); - check_cuda_error(); - - if (useFloat) - { - check_solution(h_a, h_b, h_c); - } - else - { - check_solution(h_a, h_b, h_c); - } - - // Crunch results - size_t sizes[4] = { - 2 * DATATYPE_SIZE * ARRAY_SIZE, - 2 * DATATYPE_SIZE * ARRAY_SIZE, - 3 * DATATYPE_SIZE * ARRAY_SIZE, - 3 * DATATYPE_SIZE * ARRAY_SIZE - }; - double min[4] = {DBL_MAX, DBL_MAX, DBL_MAX, DBL_MAX}; - double max[4] = {0.0, 0.0, 0.0, 0.0}; - double avg[4] = {0.0, 0.0, 0.0, 0.0}; - - // Ignore first result - for (unsigned int i = 1; i < NTIMES; i++) - { - for (int j = 0; j < 4; j++) - { - avg[j] += timings[i][j]; - min[j] = std::min(min[j], timings[i][j]); - max[j] = std::max(max[j], timings[i][j]); - } - } - - for (int j = 0; j < 4; j++) - avg[j] /= (double)(NTIMES-1); - - double geomean = 1.0; - for (int j = 0; j < 4; j++) { - geomean *= (sizes[j]/min[j]); - } - geomean = pow(geomean, 0.25); - - // Display results - std::string labels[] = {"Copy", "Mul", "Add", "Triad"}; - std::cout - << std::left << std::setw(12) << "Function" - << std::left << std::setw(12) << "MBytes/sec" - << std::left << std::setw(12) << "Min (sec)" - << std::left << std::setw(12) << "Max" - << std::left << std::setw(12) << "Average" - << std::endl; - - for (int j = 0; j < 4; j++) - { - std::cout - << std::left << std::setw(12) << labels[j] - << std::left << std::setw(12) << std::setprecision(3) << 1.0E-06 * sizes[j]/min[j] - << std::left << std::setw(12) << std::setprecision(5) << min[j] - << std::left << std::setw(12) << std::setprecision(5) << max[j] - << std::left << std::setw(12) << std::setprecision(5) << avg[j] - << std::endl; - } - std::cout - << std::left << std::setw(12) << "GEOMEAN" - << std::left << std::setw(12) << std::setprecision(3) << 1.0E-06 * geomean - << std::endl; - - // Free host vectors - free(h_a); - free(h_b); - free(h_c); - - // Free cuda buffers - cudaFree(d_a); - check_cuda_error(); - cudaFree(d_b); - check_cuda_error(); - cudaFree(d_c); - check_cuda_error(); - -} - -std::string getDeviceName(int device) -{ - struct cudaDeviceProp prop; - cudaGetDeviceProperties(&prop, device); - check_cuda_error(); - return std::string(prop.name); -} - -int getDriver(void) -{ - int driver; - cudaDriverGetVersion(&driver); - check_cuda_error(); - return driver; -} - -void listDevices(void) -{ - // Get number of devices - int count; - cudaGetDeviceCount(&count); - check_cuda_error(); - - // Print device names - if (count == 0) - { - std::cout << "No devices found." << std::endl; - } - else - { - std::cout << std::endl; - std::cout << "Devices:" << std::endl; - for (int i = 0; i < count; i++) - { - std::cout << i << ": " << getDeviceName(i) << std::endl; - check_cuda_error(); - } - std::cout << std::endl; - } -} - diff --git a/hip-stream.cpp b/hip-stream.cpp deleted file mode 100644 index d5fc133..0000000 --- a/hip-stream.cpp +++ /dev/null @@ -1,531 +0,0 @@ -#include "hip_runtime.h" -/*============================================================================= -*------------------------------------------------------------------------------ -* Copyright 2015: Tom Deakin, Simon McIntosh-Smith, University of Bristol HPC -* Based on John D. McCalpin’s original STREAM benchmark for CPUs -*------------------------------------------------------------------------------ -* License: -* 1. You are free to use this program and/or to redistribute -* this program. -* 2. You are free to modify this program for your own use, -* including commercial use, subject to the publication -* restrictions in item 3. -* 3. You are free to publish results obtained from running this -* program, or from works that you derive from this program, -* with the following limitations: -* 3a. In order to be referred to as "GPU-STREAM benchmark results", -* published results must be in conformance to the GPU-STREAM -* Run Rules published at -* http://github.com/UoB-HPC/GPU-STREAM/wiki/Run-Rules -* and incorporated herein by reference. -* The copyright holders retain the -* right to determine conformity with the Run Rules. -* 3b. Results based on modified source code or on runs not in -* accordance with the GPU-STREAM Run Rules must be clearly -* labelled whenever they are published. Examples of -* proper labelling include: -* "tuned GPU-STREAM benchmark results" -* "based on a variant of the GPU-STREAM benchmark code" -* Other comparable, clear and reasonable labelling is -* acceptable. -* 3c. Submission of results to the GPU-STREAM benchmark web site -* is encouraged, but not required. -* 4. Use of this program or creation of derived works based on this -* program constitutes acceptance of these licensing restrictions. -* 5. Absolutely no warranty is expressed or implied. -*———————————————————————————————————-----------------------------------------*/ - - -#include -#include -#include -#include -#include -#include - -//#include -#include "common.h" - -std::string getDeviceName(int device); -int getDriver(void); - -// Code to check CUDA errors -void check_cuda_error(void) -{ - hipError_t err = hipGetLastError(); - if (err != hipSuccess) - { - std::cerr - << "Error: " - << hipGetErrorString(err) - << std::endl; - exit(err); - } -} - - - -// looper function place more work inside each work item. -// Goal is reduce the dispatch overhead for each group, and also give more controlover the order of memory operations -template -__global__ void -copy_looper(hipLaunchParm lp, const T * a, T * c, int ARRAY_SIZE) -{ - int offset = (hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x); - int stride = hipBlockDim_x * hipGridDim_x; - - for (int i=offset; i -__global__ void -mul_looper(hipLaunchParm lp, T * b, const T * c, int ARRAY_SIZE) -{ - int offset = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x; - int stride = hipBlockDim_x * hipGridDim_x; - const T scalar = 3.0; - - for (int i=offset; i -__global__ void -add_looper(hipLaunchParm lp, const T * a, const T * b, T * c, int ARRAY_SIZE) -{ - int offset = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x; - int stride = hipBlockDim_x * hipGridDim_x; - - for (int i=offset; i -__global__ void -triad_looper(hipLaunchParm lp, T * a, const T * b, const T * c, int ARRAY_SIZE) -{ - int offset = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x; - int stride = hipBlockDim_x * hipGridDim_x; - const T scalar = 3.0; - - for (int i=offset; i -__global__ void -copy(hipLaunchParm lp, const T * a, T * c) -{ - const int i = hipBlockDim_x * hipBlockIdx_x + hipThreadIdx_x; - c[i] = a[i]; -} - - -template -__global__ void -mul(hipLaunchParm lp, T * b, const T * c) -{ - const T scalar = 3.0; - const int i = hipBlockDim_x * hipBlockIdx_x + hipThreadIdx_x; - b[i] = scalar * c[i]; -} - -template -__global__ void -add(hipLaunchParm lp, const T * a, const T * b, T * c) -{ - const int i = hipBlockDim_x * hipBlockIdx_x + hipThreadIdx_x; - c[i] = a[i] + b[i]; -} - -template -__global__ void -triad(hipLaunchParm lp, T * a, const T * b, const T * c) -{ - const T scalar = 3.0; - const int i = hipBlockDim_x * hipBlockIdx_x + hipThreadIdx_x; - a[i] = b[i] + scalar * c[i]; -} - -int main(int argc, char *argv[]) -{ - - // Print out run information - std::cout - << "GPU-STREAM" << std::endl - << "Version: " << VERSION_STRING << std::endl - << "Implementation: HIP" << std::endl; - - parseArguments(argc, argv); - - if (NTIMES < 2) - throw std::runtime_error("Chosen number of times is invalid, must be >= 2"); - - // Config grid size and group size for kernel launching - int gridSize; - if (groups) { - gridSize = groups * groupSize; - } else { - gridSize = ARRAY_SIZE; - } - - float operationsPerWorkitem = (float)ARRAY_SIZE / (float)gridSize; - std::cout << "GridSize: " << gridSize << " work-items" << std::endl; - std::cout << "GroupSize: " << groupSize << " work-items" << std::endl; - std::cout << "Operations/Work-item: " << operationsPerWorkitem << std::endl; - if (groups) std::cout << "Using looper kernels:" << std::endl; - - std::cout << "Precision: "; - if (useFloat) std::cout << "float"; - else std::cout << "double"; - std::cout << std::endl << std::endl; - - std::cout << "Running kernels " << NTIMES << " times" << std::endl; - - if (ARRAY_SIZE % 1024 != 0) - { - unsigned int OLD_ARRAY_SIZE = ARRAY_SIZE; - ARRAY_SIZE -= ARRAY_SIZE % 1024; - std::cout - << "Warning: array size must divide 1024" << std::endl - << "Resizing array from " << OLD_ARRAY_SIZE - << " to " << ARRAY_SIZE << std::endl; - if (ARRAY_SIZE == 0) - throw std::runtime_error("Array size must be >= 1024"); - } - - // Get precision (used to reset later) - std::streamsize ss = std::cout.precision(); - - size_t DATATYPE_SIZE; - - if (useFloat) - { - DATATYPE_SIZE = sizeof(float); - } - else - { - DATATYPE_SIZE = sizeof(double); - } - - // Display number of bytes in array - std::cout << std::setprecision(1) << std::fixed - << "Array size: " << ARRAY_SIZE*DATATYPE_SIZE/1024.0/1024.0 << " MB" - << " (=" << ARRAY_SIZE*DATATYPE_SIZE/1024.0/1024.0/1024.0 << " GB)" - << " " << ARRAY_PAD_BYTES << " bytes padding" - << std::endl; - std::cout << "Total size: " << 3.0*(ARRAY_SIZE*DATATYPE_SIZE + ARRAY_PAD_BYTES) /1024.0/1024.0 << " MB" - << " (=" << 3.0*(ARRAY_SIZE*DATATYPE_SIZE + ARRAY_PAD_BYTES) /1024.0/1024.0/1024.0 << " GB)" - << std::endl; - - // Reset precision - std::cout.precision(ss); - - // Check device index is in range - int count; - hipGetDeviceCount(&count); - check_cuda_error(); - if (deviceIndex >= count) - throw std::runtime_error("Chosen device index is invalid"); - hipSetDevice(deviceIndex); - check_cuda_error(); - - - hipDeviceProp_t props; - hipGetDeviceProperties(&props, deviceIndex); - - // Print out device name - std::cout << "Using HIP device " << getDeviceName(deviceIndex) << " (compute_units=" << props.multiProcessorCount << ")" << std::endl; - - // Print out device HIP driver version - std::cout << "Driver: " << getDriver() << std::endl; - - - - - // Check buffers fit on the device - if (props.totalGlobalMem < 3*DATATYPE_SIZE*ARRAY_SIZE) - throw std::runtime_error("Device does not have enough memory for all 3 buffers"); - - //int cus = props.multiProcessorCount; - - // Create host vectors - void * h_a = malloc(ARRAY_SIZE*DATATYPE_SIZE ); - void * h_b = malloc(ARRAY_SIZE*DATATYPE_SIZE ); - void * h_c = malloc(ARRAY_SIZE*DATATYPE_SIZE ); - - // Initialise arrays - for (unsigned int i = 0; i < ARRAY_SIZE; i++) - { - if (useFloat) - { - ((float*)h_a)[i] = 1.0f; - ((float*)h_b)[i] = 2.0f; - ((float*)h_c)[i] = 0.0f; - } - else - { - ((double*)h_a)[i] = 1.0; - ((double*)h_b)[i] = 2.0; - ((double*)h_c)[i] = 0.0; - } - } - - // Create device buffers - char * d_a, * d_b, *d_c; - hipMalloc(&d_a, ARRAY_SIZE*DATATYPE_SIZE + ARRAY_PAD_BYTES); - check_cuda_error(); - hipMalloc(&d_b, ARRAY_SIZE*DATATYPE_SIZE + ARRAY_PAD_BYTES); - d_b += ARRAY_PAD_BYTES; - check_cuda_error(); - hipMalloc(&d_c, ARRAY_SIZE*DATATYPE_SIZE + ARRAY_PAD_BYTES); - d_c += ARRAY_PAD_BYTES; - check_cuda_error(); - - // Copy host memory to device - hipMemcpy(d_a, h_a, ARRAY_SIZE*DATATYPE_SIZE, hipMemcpyHostToDevice); - check_cuda_error(); - hipMemcpy(d_b, h_b, ARRAY_SIZE*DATATYPE_SIZE, hipMemcpyHostToDevice); - check_cuda_error(); - hipMemcpy(d_c, h_c, ARRAY_SIZE*DATATYPE_SIZE, hipMemcpyHostToDevice); - check_cuda_error(); - - - std::cout << "d_a=" << (void*)d_a << std::endl; - std::cout << "d_b=" << (void*)d_b << std::endl; - std::cout << "d_c=" << (void*)d_c << std::endl; - - // Make sure the copies are finished - hipDeviceSynchronize(); - check_cuda_error(); - - - - // List of times - std::vector< std::vector > timings; - - // Declare timers - std::chrono::high_resolution_clock::time_point t1, t2; - - // Main loop - for (unsigned int k = 0; k < NTIMES; k++) - { - std::vector times; - t1 = std::chrono::high_resolution_clock::now(); - if (groups) { - if (useFloat) - hipLaunchKernel(HIP_KERNEL_NAME(copy_looper), dim3(gridSize), dim3(groupSize), 0, 0, (float*)d_a, (float*)d_c, ARRAY_SIZE); - else - hipLaunchKernel(HIP_KERNEL_NAME(copy_looper), dim3(gridSize), dim3(groupSize), 0, 0, (double*)d_a, (double*)d_c, ARRAY_SIZE); - } else { - if (useFloat) - hipLaunchKernel(HIP_KERNEL_NAME(copy), dim3(ARRAY_SIZE/groupSize), dim3(groupSize), 0, 0, (float*)d_a, (float*)d_c); - else - hipLaunchKernel(HIP_KERNEL_NAME(copy), dim3(ARRAY_SIZE/groupSize), dim3(groupSize), 0, 0, (double*)d_a, (double*)d_c); - } - check_cuda_error(); - hipDeviceSynchronize(); - check_cuda_error(); - t2 = std::chrono::high_resolution_clock::now(); - times.push_back(std::chrono::duration_cast >(t2 - t1).count()); - - - t1 = std::chrono::high_resolution_clock::now(); - if (groups) { - if (useFloat) - hipLaunchKernel(HIP_KERNEL_NAME(mul_looper), dim3(gridSize), dim3(groupSize), 0, 0, (float*)d_b, (float*)d_c, ARRAY_SIZE); - else - hipLaunchKernel(HIP_KERNEL_NAME(mul_looper), dim3(gridSize), dim3(groupSize), 0, 0, (double*)d_b, (double*)d_c, ARRAY_SIZE); - } else { - if (useFloat) - hipLaunchKernel(HIP_KERNEL_NAME(mul), dim3(ARRAY_SIZE/groupSize), dim3(groupSize), 0, 0, (float*)d_b, (float*)d_c); - else - hipLaunchKernel(HIP_KERNEL_NAME(mul), dim3(ARRAY_SIZE/groupSize), dim3(groupSize), 0, 0, (double*)d_b, (double*)d_c); - } - check_cuda_error(); - hipDeviceSynchronize(); - check_cuda_error(); - t2 = std::chrono::high_resolution_clock::now(); - times.push_back(std::chrono::duration_cast >(t2 - t1).count()); - - - t1 = std::chrono::high_resolution_clock::now(); - if (groups) { - if (useFloat) - hipLaunchKernel(HIP_KERNEL_NAME(add_looper), dim3(gridSize), dim3(groupSize), 0, 0, (float*)d_a, (float*)d_b, (float*)d_c, ARRAY_SIZE); - else - hipLaunchKernel(HIP_KERNEL_NAME(add_looper), dim3(gridSize), dim3(groupSize), 0, 0, (double*)d_a, (double*)d_b, (double*)d_c, ARRAY_SIZE); - } else { - if (useFloat) - hipLaunchKernel(HIP_KERNEL_NAME(add), dim3(ARRAY_SIZE/groupSize), dim3(groupSize), 0, 0, (float*)d_a, (float*)d_b, (float*)d_c); - else - hipLaunchKernel(HIP_KERNEL_NAME(add), dim3(ARRAY_SIZE/groupSize), dim3(groupSize), 0, 0, (double*)d_a, (double*)d_b, (double*)d_c); - } - check_cuda_error(); - hipDeviceSynchronize(); - check_cuda_error(); - t2 = std::chrono::high_resolution_clock::now(); - times.push_back(std::chrono::duration_cast >(t2 - t1).count()); - - - t1 = std::chrono::high_resolution_clock::now(); - if (groups) { - if (useFloat) - hipLaunchKernel(HIP_KERNEL_NAME(triad_looper), dim3(gridSize), dim3(groupSize), 0, 0, (float*)d_a, (float*)d_b, (float*)d_c, ARRAY_SIZE); - else - hipLaunchKernel(HIP_KERNEL_NAME(triad_looper), dim3(gridSize), dim3(groupSize), 0, 0, (double*)d_a, (double*)d_b, (double*)d_c, ARRAY_SIZE); - } else { - if (useFloat) - hipLaunchKernel(HIP_KERNEL_NAME(triad), dim3(ARRAY_SIZE/groupSize), dim3(groupSize), 0, 0, (float*)d_a, (float*)d_b, (float*)d_c); - else - hipLaunchKernel(HIP_KERNEL_NAME(triad), dim3(ARRAY_SIZE/groupSize), dim3(groupSize), 0, 0, (double*)d_a, (double*)d_b, (double*)d_c); - } - - check_cuda_error(); - hipDeviceSynchronize(); - check_cuda_error(); - t2 = std::chrono::high_resolution_clock::now(); - times.push_back(std::chrono::duration_cast >(t2 - t1).count()); - - timings.push_back(times); - - } - - // Check solutions - hipMemcpy(h_a, d_a, ARRAY_SIZE*DATATYPE_SIZE, hipMemcpyDeviceToHost); - check_cuda_error(); - hipMemcpy(h_b, d_b, ARRAY_SIZE*DATATYPE_SIZE, hipMemcpyDeviceToHost); - check_cuda_error(); - hipMemcpy(h_c, d_c, ARRAY_SIZE*DATATYPE_SIZE, hipMemcpyDeviceToHost); - check_cuda_error(); - - if (useFloat) - { - check_solution(h_a, h_b, h_c); - } - else - { - check_solution(h_a, h_b, h_c); - } - - // Crunch results - size_t sizes[4] = { - 2 * DATATYPE_SIZE * ARRAY_SIZE, - 2 * DATATYPE_SIZE * ARRAY_SIZE, - 3 * DATATYPE_SIZE * ARRAY_SIZE, - 3 * DATATYPE_SIZE * ARRAY_SIZE - }; - double min[4] = {DBL_MAX, DBL_MAX, DBL_MAX, DBL_MAX}; - double max[4] = {0.0, 0.0, 0.0, 0.0}; - double avg[4] = {0.0, 0.0, 0.0, 0.0}; - - // Ignore first result - for (unsigned int i = 1; i < NTIMES; i++) - { - for (int j = 0; j < 4; j++) - { - avg[j] += timings[i][j]; - min[j] = std::min(min[j], timings[i][j]); - max[j] = std::max(max[j], timings[i][j]); - } - } - - for (int j = 0; j < 4; j++) { - avg[j] /= (double)(NTIMES-1); - } - - double geomean = 1.0; - for (int j = 0; j < 4; j++) { - geomean *= (sizes[j]/min[j]); - } - geomean = pow(geomean, 0.25); - - // Display results - std::string labels[] = {"Copy", "Mul", "Add", "Triad"}; - std::cout - << std::left << std::setw(12) << "Function" - << std::left << std::setw(12) << "MBytes/sec" - << std::left << std::setw(12) << "Min (sec)" - << std::left << std::setw(12) << "Max" - << std::left << std::setw(12) << "Average" - << std::endl; - - for (int j = 0; j < 4; j++) - { - std::cout - << std::left << std::setw(12) << labels[j] - << std::left << std::setw(12) << std::setprecision(3) << 1.0E-06 * sizes[j]/min[j] - << std::left << std::setw(12) << std::setprecision(5) << min[j] - << std::left << std::setw(12) << std::setprecision(5) << max[j] - << std::left << std::setw(12) << std::setprecision(5) << avg[j] - << std::endl; - } - std::cout - << std::left << std::setw(12) << "GEOMEAN" - << std::left << std::setw(12) << std::setprecision(3) << 1.0E-06 * geomean - << std::endl; - - // Free host vectors - free(h_a); - free(h_b); - free(h_c); - - // Free cuda buffers - hipFree(d_a); - check_cuda_error(); - hipFree(d_b); - check_cuda_error(); - hipFree(d_c); - check_cuda_error(); - -} - -std::string getDeviceName(int device) -{ - struct hipDeviceProp_t prop; - hipGetDeviceProperties(&prop, device); - check_cuda_error(); - return std::string(prop.name); -} - -int getDriver(void) -{ - int driver; - hipDriverGetVersion(&driver); - check_cuda_error(); - return driver; -} - -void listDevices(void) -{ - // Get number of devices - int count; - hipGetDeviceCount(&count); - check_cuda_error(); - - // Print device names - if (count == 0) - { - std::cout << "No devices found." << std::endl; - } - else - { - std::cout << std::endl; - std::cout << "Devices:" << std::endl; - for (int i = 0; i < count; i++) - { - std::cout << i << ": " << getDeviceName(i) << std::endl; - check_cuda_error(); - } - std::cout << std::endl; - } -} - diff --git a/main.cpp b/main.cpp new file mode 100644 index 0000000..5379bb9 --- /dev/null +++ b/main.cpp @@ -0,0 +1,332 @@ + +// Copyright (c) 2015-16 Tom Deakin, Simon McIntosh-Smith, +// University of Bristol HPC +// +// For full license terms please see the LICENSE file distributed with this +// source code + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "common.h" +#include "Stream.h" + +#if defined(CUDA) +#include "CUDAStream.h" +#elif defined(OCL) +#include "OCLStream.h" +#elif defined(USE_RAJA) +#include "RAJAStream.hpp" +#elif defined(KOKKOS) +#include "KOKKOSStream.hpp" +#elif defined(ACC) +#include "ACCStream.h" +#elif defined(SYCL) +#include "SYCLStream.h" +#elif defined(OMP3) +#include "OMP3Stream.h" +#elif defined(OMP45) +#include "OMP45Stream.h" +#endif + +// Default size of 2^25 +unsigned int ARRAY_SIZE = 33554432; +unsigned int num_times = 100; +unsigned int deviceIndex = 0; +bool use_float = false; + + +template +void check_solution(const unsigned int ntimes, std::vector& a, std::vector& b, std::vector& c); + +template +void run(); + +void parseArguments(int argc, char *argv[]); + +int main(int argc, char *argv[]) +{ + std::cout + << "GPU-STREAM" << std::endl + << "Version: " << VERSION_STRING << std::endl + << "Implementation: " << IMPLEMENTATION_STRING << std::endl; + + parseArguments(argc, argv); + + // TODO: Fix SYCL to allow multiple template specializations +#ifndef SYCL +#ifndef KOKKOS + if (use_float) + run(); + else +#endif +#endif + run(); + +} + +template +void run() +{ + std::cout << "Running kernels " << num_times << " times" << std::endl; + + if (sizeof(T) == sizeof(float)) + std::cout << "Precision: float" << std::endl; + else + std::cout << "Precision: double" << std::endl; + + // Create host vectors + std::vector a(ARRAY_SIZE, 0.1); + std::vector b(ARRAY_SIZE, 0.2); + std::vector c(ARRAY_SIZE, 0.0); + std::streamsize ss = std::cout.precision(); + std::cout << std::setprecision(1) << std::fixed + << "Array size: " << ARRAY_SIZE*sizeof(T)*1.0E-6 << " MB" + << " (=" << ARRAY_SIZE*sizeof(T)*1.0E-9 << " GB)" << std::endl; + std::cout << "Total size: " << 3.0*ARRAY_SIZE*sizeof(T)*1.0E-6 << " MB" + << " (=" << 3.0*ARRAY_SIZE*sizeof(T)*1.0E-9 << " GB)" << std::endl; + std::cout.precision(ss); + + Stream *stream; + +#if defined(CUDA) + // Use the CUDA implementation + stream = new CUDAStream(ARRAY_SIZE, deviceIndex); + +#elif defined(OCL) + // Use the OpenCL implementation + stream = new OCLStream(ARRAY_SIZE, deviceIndex); + +#elif defined(USE_RAJA) + // Use the RAJA implementation + stream = new RAJAStream(ARRAY_SIZE, deviceIndex); + +#elif defined(KOKKOS) + // Use the Kokkos implementation + stream = new KOKKOSStream(ARRAY_SIZE, deviceIndex); + +#elif defined(ACC) + // Use the OpenACC implementation + stream = new ACCStream(ARRAY_SIZE, a.data(), b.data(), c.data(), deviceIndex); + +#elif defined(SYCL) + // Use the SYCL implementation + stream = new SYCLStream(ARRAY_SIZE, deviceIndex); + +#elif defined(OMP3) + // Use the "reference" OpenMP 3 implementation + stream = new OMP3Stream(ARRAY_SIZE, a.data(), b.data(), c.data()); + +#elif defined(OMP45) + // Use the "reference" OpenMP 3 implementation + stream = new OMP45Stream(ARRAY_SIZE, a.data(), b.data(), c.data(), deviceIndex); + +#endif + + stream->write_arrays(a, b, c); + + // List of times + std::vector> timings(4); + + // Declare timers + std::chrono::high_resolution_clock::time_point t1, t2; + + // Main loop + for (unsigned int k = 0; k < num_times; k++) + { + // Execute Copy + t1 = std::chrono::high_resolution_clock::now(); + stream->copy(); + t2 = std::chrono::high_resolution_clock::now(); + timings[0].push_back(std::chrono::duration_cast >(t2 - t1).count()); + + // Execute Mul + t1 = std::chrono::high_resolution_clock::now(); + stream->mul(); + t2 = std::chrono::high_resolution_clock::now(); + timings[1].push_back(std::chrono::duration_cast >(t2 - t1).count()); + + // Execute Add + t1 = std::chrono::high_resolution_clock::now(); + stream->add(); + t2 = std::chrono::high_resolution_clock::now(); + timings[2].push_back(std::chrono::duration_cast >(t2 - t1).count()); + + // Execute Triad + t1 = std::chrono::high_resolution_clock::now(); + stream->triad(); + t2 = std::chrono::high_resolution_clock::now(); + timings[3].push_back(std::chrono::duration_cast >(t2 - t1).count()); + + } + + // Check solutions + stream->read_arrays(a, b, c); + check_solution(num_times, a, b, c); + + // Display timing results + std::cout + << std::left << std::setw(12) << "Function" + << std::left << std::setw(12) << "MBytes/sec" + << std::left << std::setw(12) << "Min (sec)" + << std::left << std::setw(12) << "Max" + << std::left << std::setw(12) << "Average" << std::endl; + + std::cout << std::fixed; + + std::string labels[4] = {"Copy", "Mul", "Add", "Triad"}; + size_t sizes[4] = { + 2 * sizeof(T) * ARRAY_SIZE, + 2 * sizeof(T) * ARRAY_SIZE, + 3 * sizeof(T) * ARRAY_SIZE, + 3 * sizeof(T) * ARRAY_SIZE + }; + + for (int i = 0; i < 4; i++) + { + // Get min/max; ignore the first result + auto minmax = std::minmax_element(timings[i].begin()+1, timings[i].end()); + + // Calculate average; ignore the first result + double average = std::accumulate(timings[i].begin()+1, timings[i].end(), 0.0) / (double)(num_times - 1); + + // Display results + std::cout + << std::left << std::setw(12) << labels[i] + << std::left << std::setw(12) << std::setprecision(3) << 1.0E-6 * sizes[i] / (*minmax.first) + << std::left << std::setw(12) << std::setprecision(5) << *minmax.first + << std::left << std::setw(12) << std::setprecision(5) << *minmax.second + << std::left << std::setw(12) << std::setprecision(5) << average + << std::endl; + + } + + delete stream; + +} + +template +void check_solution(const unsigned int ntimes, std::vector& a, std::vector& b, std::vector& c) +{ + // Generate correct solution + T goldA = 0.1; + T goldB = 0.2; + T goldC = 0.0; + + const T scalar = 0.3; + + for (unsigned int i = 0; i < ntimes; i++) + { + // Do STREAM! + goldC = goldA; + goldB = scalar * goldC; + goldC = goldA + goldB; + goldA = goldB + scalar * goldC; + } + + // Calculate the average error + double errA = std::accumulate(a.begin(), a.end(), 0.0, [&](double sum, const T val){ return sum + fabs(val - goldA); }); + errA /= a.size(); + double errB = std::accumulate(b.begin(), b.end(), 0.0, [&](double sum, const T val){ return sum + fabs(val - goldB); }); + errB /= b.size(); + double errC = std::accumulate(c.begin(), c.end(), 0.0, [&](double sum, const T val){ return sum + fabs(val - goldC); }); + errC /= c.size(); + + double epsi = std::numeric_limits::epsilon() * 100.0; + + if (errA > epsi) + std::cerr + << "Validation failed on a[]. Average error " << errA + << std::endl; + if (errB > epsi) + std::cerr + << "Validation failed on b[]. Average error " << errB + << std::endl; + if (errC > epsi) + std::cerr + << "Validation failed on c[]. Average error " << errC + << std::endl; + +} + +int parseUInt(const char *str, unsigned int *output) +{ + char *next; + *output = strtoul(str, &next, 10); + return !strlen(next); +} + +void parseArguments(int argc, char *argv[]) +{ + for (int i = 1; i < argc; i++) + { + if (!std::string("--list").compare(argv[i])) + { + listDevices(); + exit(EXIT_SUCCESS); + } + else if (!std::string("--device").compare(argv[i])) + { + if (++i >= argc || !parseUInt(argv[i], &deviceIndex)) + { + std::cerr << "Invalid device index." << std::endl; + exit(EXIT_FAILURE); + } + } + else if (!std::string("--arraysize").compare(argv[i]) || + !std::string("-s").compare(argv[i])) + { + if (++i >= argc || !parseUInt(argv[i], &ARRAY_SIZE)) + { + std::cerr << "Invalid array size." << std::endl; + exit(EXIT_FAILURE); + } + } + else if (!std::string("--numtimes").compare(argv[i]) || + !std::string("-n").compare(argv[i])) + { + if (++i >= argc || !parseUInt(argv[i], &num_times)) + { + std::cerr << "Invalid number of times." << std::endl; + exit(EXIT_FAILURE); + } + if (num_times < 2) + { + std::cerr << "Number of times must be 2 or more" << std::endl; + exit(EXIT_FAILURE); + } + } + else if (!std::string("--float").compare(argv[i])) + { + use_float = true; + } + else if (!std::string("--help").compare(argv[i]) || + !std::string("-h").compare(argv[i])) + { + std::cout << std::endl; + std::cout << "Usage: " << argv[0] << " [OPTIONS]" << std::endl << std::endl; + std::cout << "Options:" << std::endl; + std::cout << " -h --help Print the message" << std::endl; + std::cout << " --list List available devices" << std::endl; + std::cout << " --device INDEX Select device at INDEX" << std::endl; + std::cout << " -s --arraysize SIZE Use SIZE elements in the array" << std::endl; + std::cout << " -n --numtimes NUM Run the test NUM times (NUM >= 2)" << std::endl; + std::cout << " --float Use floats (rather than doubles)" << std::endl; + std::cout << std::endl; + exit(EXIT_SUCCESS); + } + else + { + std::cerr << "Unrecognized argument '" << argv[i] << "' (try '--help')" + << std::endl; + exit(EXIT_FAILURE); + } + } +} diff --git a/ocl-stream-kernels.cl b/ocl-stream-kernels.cl deleted file mode 100644 index e5af7ce..0000000 --- a/ocl-stream-kernels.cl +++ /dev/null @@ -1,70 +0,0 @@ -/*============================================================================= -*------------------------------------------------------------------------------ -* Copyright 2015: Tom Deakin, Simon McIntosh-Smith, University of Bristol HPC -* Based on John D. McCalpin’s original STREAM benchmark for CPUs -*------------------------------------------------------------------------------ -* License: -* 1. You are free to use this program and/or to redistribute -* this program. -* 2. You are free to modify this program for your own use, -* including commercial use, subject to the publication -* restrictions in item 3. -* 3. You are free to publish results obtained from running this -* program, or from works that you derive from this program, -* with the following limitations: -* 3a. In order to be referred to as "GPU-STREAM benchmark results", -* published results must be in conformance to the GPU-STREAM -* Run Rules published at -* http://github.com/UoB-HPC/GPU-STREAM/wiki/Run-Rules -* and incorporated herein by reference. -* The copyright holders retain the -* right to determine conformity with the Run Rules. -* 3b. Results based on modified source code or on runs not in -* accordance with the GPU-STREAM Run Rules must be clearly -* labelled whenever they are published. Examples of -* proper labelling include: -* "tuned GPU-STREAM benchmark results" -* "based on a variant of the GPU-STREAM benchmark code" -* Other comparable, clear and reasonable labelling is -* acceptable. -* 3c. Submission of results to the GPU-STREAM benchmark web site -* is encouraged, but not required. -* 4. Use of this program or creation of derived works based on this -* program constitutes acceptance of these licensing restrictions. -* 5. Absolutely no warranty is expressed or implied. -*———————————————————————————————————-----------------------------------------*/ - - -#ifdef FLOAT - #define DATATYPE float - constant DATATYPE scalar = 3.0f; -#else - #pragma OPENCL EXTENSION cl_khr_fp64 : enable - #define DATATYPE double - constant DATATYPE scalar = 3.0; -#endif - - -kernel void copy(global const DATATYPE * restrict a, global DATATYPE * restrict c) -{ - const size_t i = get_global_id(0); - c[i] = a[i]; -} - -kernel void mul(global DATATYPE * restrict b, global const DATATYPE * restrict c) -{ - const size_t i = get_global_id(0); - b[i] = scalar * c[i]; -} - -kernel void add(global const DATATYPE * restrict a, global const DATATYPE * restrict b, global DATATYPE * restrict c) -{ - const size_t i = get_global_id(0); - c[i] = a[i] + b[i]; -} - -kernel void triad(global DATATYPE * restrict a, global const DATATYPE * restrict b, global const DATATYPE * restrict c) -{ - const size_t i = get_global_id(0); - a[i] = b[i] + scalar * c[i]; -} diff --git a/ocl-stream.cpp b/ocl-stream.cpp deleted file mode 100644 index 1a46295..0000000 --- a/ocl-stream.cpp +++ /dev/null @@ -1,488 +0,0 @@ -/*============================================================================= -*------------------------------------------------------------------------------ -* Copyright 2015: Tom Deakin, Simon McIntosh-Smith, University of Bristol HPC -* Based on John D. McCalpin’s original STREAM benchmark for CPUs -*------------------------------------------------------------------------------ -* License: -* 1. You are free to use this program and/or to redistribute -* this program. -* 2. You are free to modify this program for your own use, -* including commercial use, subject to the publication -* restrictions in item 3. -* 3. You are free to publish results obtained from running this -* program, or from works that you derive from this program, -* with the following limitations: -* 3a. In order to be referred to as "GPU-STREAM benchmark results", -* published results must be in conformance to the GPU-STREAM -* Run Rules published at -* http://github.com/UoB-HPC/GPU-STREAM/wiki/Run-Rules -* and incorporated herein by reference. -* The copyright holders retain the -* right to determine conformity with the Run Rules. -* 3b. Results based on modified source code or on runs not in -* accordance with the GPU-STREAM Run Rules must be clearly -* labelled whenever they are published. Examples of -* proper labelling include: -* "tuned GPU-STREAM benchmark results" -* "based on a variant of the GPU-STREAM benchmark code" -* Other comparable, clear and reasonable labelling is -* acceptable. -* 3c. Submission of results to the GPU-STREAM benchmark web site -* is encouraged, but not required. -* 4. Use of this program or creation of derived works based on this -* program constitutes acceptance of these licensing restrictions. -* 5. Absolutely no warranty is expressed or implied. -*———————————————————————————————————-----------------------------------------*/ - - -#include -#include -#include -#include -#include -#include - -#define CL_HPP_ENABLE_EXCEPTIONS -#define CL_HPP_MINIMUM_OPENCL_VERSION 110 -#define CL_HPP_TARGET_OPENCL_VERSION 110 -#include "CL/cl2.hpp" -#include "common.h" - -std::string getDeviceName(const cl::Device& device); -std::string getDeviceDriver(const cl::Device& device); -unsigned getDeviceList(std::vector& devices); - - -// Print error and exit -void die(std::string msg, cl::Error& e) -{ - std::cerr - << "Error: " - << msg - << ": " << e.what() - << "(" << e.err() << ")" - << std::endl; - exit(e.err()); -} - - -int main(int argc, char *argv[]) -{ - - // Print out run information - std::cout - << "GPU-STREAM" << std::endl - << "Version: " << VERSION_STRING << std::endl - << "Implementation: OpenCL" << std::endl; - - std::string status; - - try - { - parseArguments(argc, argv); - if (NTIMES < 2) - throw std::runtime_error("Chosen number of times is invalid, must be >= 2"); - - - std::cout << "Precision: "; - if (useFloat) std::cout << "float"; - else std::cout << "double"; - std::cout << std::endl << std::endl; - - std::cout << "Running kernels " << NTIMES << " times" << std::endl; - - if (ARRAY_SIZE % 1024 != 0) - { - unsigned int OLD_ARRAY_SIZE = ARRAY_SIZE; - ARRAY_SIZE -= ARRAY_SIZE % 1024; - std::cout - << "Warning: array size must divide 1024" << std::endl - << "Resizing array from " << OLD_ARRAY_SIZE - << " to " << ARRAY_SIZE << std::endl; - if (ARRAY_SIZE == 0) - throw std::runtime_error("Array size must be >= 1024"); - } - - // Get precision (used to reset later) - std::streamsize ss = std::cout.precision(); - - size_t DATATYPE_SIZE; - - if (useFloat) - { - DATATYPE_SIZE = sizeof(float); - } - else - { - DATATYPE_SIZE = sizeof(double); - } - - // Display number of bytes in array - std::cout << std::setprecision(1) << std::fixed - << "Array size: " << ARRAY_SIZE*DATATYPE_SIZE/1024.0/1024.0 << " MB" - << " (=" << ARRAY_SIZE*DATATYPE_SIZE/1024.0/1024.0/1024.0 << " GB)" - << std::endl; - std::cout << "Total size: " << 3.0*ARRAY_SIZE*DATATYPE_SIZE/1024.0/1024.0 << " MB" - << " (=" << 3.0*ARRAY_SIZE*DATATYPE_SIZE/1024.0/1024.0/1024.0 << " GB)" - << std::endl; - - // Reset precision - std::cout.precision(ss); - - // Open the Kernel source - std::string kernels; - { - std::ifstream in("ocl-stream-kernels.cl"); - if (!in.is_open()) - throw std::runtime_error("Cannot open kernel file"); - kernels = std::string (std::istreambuf_iterator(in), (std::istreambuf_iterator())); - } - - - // Setup OpenCL - - // Get list of devices - std::vector devices; - getDeviceList(devices); - - // Check device index is in range - if (deviceIndex >= devices.size()) - throw std::runtime_error("Chosen device index is invalid"); - - cl::Device device = devices[deviceIndex]; - - status = "Creating context"; - cl::Context context(device); - - status = "Creating queue"; - cl::CommandQueue queue(context); - - status = "Creating program"; - cl::Program program(context, kernels); - - // Print out device name - std::string name = getDeviceName(device); - std::cout << "Using OpenCL device " << name << std::endl; - - // Print out OpenCL driver version for this device - std::string driver = getDeviceDriver(device); - std::cout << "Driver: " << driver << std::endl; - - // Check device can do double precision if requested - if (!useFloat && !device.getInfo()) - throw std::runtime_error("Device does not support double precision, please use --float"); - - // Check buffers fit on the device - status = "Getting device memory sizes"; - cl_ulong totalmem = device.getInfo(); - cl_ulong maxbuffer = device.getInfo(); - if (maxbuffer < DATATYPE_SIZE*ARRAY_SIZE) - throw std::runtime_error("Device cannot allocate a buffer big enough"); - if (totalmem < 3*DATATYPE_SIZE*ARRAY_SIZE) - throw std::runtime_error("Device does not have enough memory for all 3 buffers"); - - try - { - std::string options = ""; - if (useFloat) - options = "-DFLOAT"; - program.build(options.c_str()); - } - catch (cl::Error& e) - { - std::vector devices = context.getInfo(); - std::string buildlog = program.getBuildInfo(devices[0]); - std::cerr - << "Build error:" - << buildlog - << std::endl; - throw e; - } - - status = "Making kernel copy"; - auto copy = cl::KernelFunctor(program, "copy"); - status = "Making kernel mul"; - auto mul = cl::KernelFunctor(program, "mul"); - status = "Making kernel add"; - auto add = cl::KernelFunctor(program, "add"); - status = "Making kernel triad"; - auto triad = cl::KernelFunctor(program, "triad"); - - // Create host vectors - void *h_a = malloc(ARRAY_SIZE * DATATYPE_SIZE); - void *h_b = malloc(ARRAY_SIZE * DATATYPE_SIZE); - void *h_c = malloc(ARRAY_SIZE * DATATYPE_SIZE); - - // Initilise arrays - for (unsigned int i = 0; i < ARRAY_SIZE; i++) - { - if (useFloat) - { - ((float*)h_a)[i] = 1.0f; - ((float*)h_b)[i] = 2.0f; - ((float*)h_c)[i] = 0.0f; - } - else - { - ((double*)h_a)[i] = 1.0; - ((double*)h_b)[i] = 2.0; - ((double*)h_c)[i] = 0.0; - } - } - - // Create device buffers - status = "Creating buffers"; - cl::Buffer d_a(context, CL_MEM_READ_WRITE, DATATYPE_SIZE * ARRAY_SIZE); - cl::Buffer d_b(context, CL_MEM_READ_WRITE, DATATYPE_SIZE * ARRAY_SIZE); - cl::Buffer d_c(context, CL_MEM_READ_WRITE, DATATYPE_SIZE * ARRAY_SIZE); - - - // Copy host memory to device - status = "Copying buffers"; - queue.enqueueWriteBuffer(d_a, CL_FALSE, 0, ARRAY_SIZE*DATATYPE_SIZE, h_a); - queue.enqueueWriteBuffer(d_b, CL_FALSE, 0, ARRAY_SIZE*DATATYPE_SIZE, h_b); - queue.enqueueWriteBuffer(d_c, CL_FALSE, 0, ARRAY_SIZE*DATATYPE_SIZE, h_c); - - // Make sure the copies are finished - queue.finish(); - - - // List of times - std::vector< std::vector > timings; - - // Declare timers - std::chrono::high_resolution_clock::time_point t1, t2; - - // Main loop - for (unsigned int k = 0; k < NTIMES; k++) - { - status = "Executing copy"; - std::vector times; - t1 = std::chrono::high_resolution_clock::now(); - copy( - cl::EnqueueArgs( - queue, - cl::NDRange(ARRAY_SIZE)), - d_a, d_c); - queue.finish(); - t2 = std::chrono::high_resolution_clock::now(); - times.push_back(std::chrono::duration_cast >(t2 - t1).count()); - - - status = "Executing mul"; - t1 = std::chrono::high_resolution_clock::now(); - mul( - cl::EnqueueArgs( - queue, - cl::NDRange(ARRAY_SIZE)), - d_b, d_c); - queue.finish(); - t2 = std::chrono::high_resolution_clock::now(); - times.push_back(std::chrono::duration_cast >(t2 - t1).count()); - - - status = "Executing add"; - t1 = std::chrono::high_resolution_clock::now(); - add( - cl::EnqueueArgs( - queue, - cl::NDRange(ARRAY_SIZE)), - d_a, d_b, d_c); - queue.finish(); - t2 = std::chrono::high_resolution_clock::now(); - times.push_back(std::chrono::duration_cast >(t2 - t1).count()); - - - status = "Executing triad"; - t1 = std::chrono::high_resolution_clock::now(); - triad( - cl::EnqueueArgs( - queue, - cl::NDRange(ARRAY_SIZE)), - d_a, d_b, d_c); - queue.finish(); - t2 = std::chrono::high_resolution_clock::now(); - times.push_back(std::chrono::duration_cast >(t2 - t1).count()); - - timings.push_back(times); - - } - - // Check solutions - status = "Copying back buffers"; - queue.enqueueReadBuffer(d_a, CL_FALSE, 0, ARRAY_SIZE*DATATYPE_SIZE, h_a); - queue.enqueueReadBuffer(d_b, CL_FALSE, 0, ARRAY_SIZE*DATATYPE_SIZE, h_b); - queue.enqueueReadBuffer(d_c, CL_FALSE, 0, ARRAY_SIZE*DATATYPE_SIZE, h_c); - queue.finish(); - - - if (useFloat) - { - check_solution(h_a, h_b, h_c); - } - else - { - check_solution(h_a, h_b, h_c); - } - - // Crunch results - size_t sizes[4] = { - 2 * DATATYPE_SIZE * ARRAY_SIZE, - 2 * DATATYPE_SIZE * ARRAY_SIZE, - 3 * DATATYPE_SIZE * ARRAY_SIZE, - 3 * DATATYPE_SIZE * ARRAY_SIZE - }; - double min[4] = {DBL_MAX, DBL_MAX, DBL_MAX, DBL_MAX}; - double max[4] = {0.0, 0.0, 0.0, 0.0}; - double avg[4] = {0.0, 0.0, 0.0, 0.0}; - // Ignore first result - for (unsigned int i = 1; i < NTIMES; i++) - { - for (int j = 0; j < 4; j++) - { - avg[j] += timings[i][j]; - min[j] = std::min(min[j], timings[i][j]); - max[j] = std::max(max[j], timings[i][j]); - } - } - for (int j = 0; j < 4; j++) - avg[j] /= (double)(NTIMES-1); - - // Display results - std::string labels[] = {"Copy", "Mul", "Add", "Triad"}; - std::cout - << std::left << std::setw(12) << "Function" - << std::left << std::setw(12) << "MBytes/sec" - << std::left << std::setw(12) << "Min (sec)" - << std::left << std::setw(12) << "Max" - << std::left << std::setw(12) << "Average" - << std::endl; - for (int j = 0; j < 4; j++) - { - std::cout - << std::left << std::setw(12) << labels[j] - << std::left << std::setw(12) << std::setprecision(3) << 1.0E-06 * sizes[j]/min[j] - << std::left << std::setw(12) << std::setprecision(5) << min[j] - << std::left << std::setw(12) << std::setprecision(5) << max[j] - << std::left << std::setw(12) << std::setprecision(5) << avg[j] - << std::endl; - } - - // Free host vectors - free(h_a); - free(h_b); - free(h_c); - - } - catch (cl::Error &e) - { - die(status, e); - } - catch (std::exception& e) - { - std::cerr - << "Error: " - << e.what() - << std::endl; - exit(EXIT_FAILURE); - } - -} - - -unsigned getDeviceList(std::vector& devices) -{ - // Get list of platforms - std::vector platforms; - try - { - cl::Platform::get(&platforms); - } - catch (cl::Error &e) - { - die("Getting platforms", e); - } - - // Enumerate devices - for (unsigned int i = 0; i < platforms.size(); i++) - { - std::vector plat_devices; - try - { - platforms[i].getDevices(CL_DEVICE_TYPE_ALL, &plat_devices); - } - catch (cl::Error &e) - { - die("Getting devices", e); - } - devices.insert(devices.end(), plat_devices.begin(), plat_devices.end()); - } - - return devices.size(); -} - - -std::string getDeviceName(const cl::Device& device) -{ - std::string name; - cl_device_info info = CL_DEVICE_NAME; - - try - { - - // Special case for AMD -#ifdef CL_DEVICE_BOARD_NAME_AMD - device.getInfo(CL_DEVICE_VENDOR, &name); - if (strstr(name.c_str(), "Advanced Micro Devices")) - info = CL_DEVICE_BOARD_NAME_AMD; -#endif - - device.getInfo(info, &name); - } - catch (cl::Error &e) - { - die("Getting device name", e); - } - - return name; -} - -std::string getDeviceDriver(const cl::Device& device) -{ - std::string driver; - try - { - device.getInfo(CL_DRIVER_VERSION, &driver); - } - catch (cl::Error &e) - { - die("Getting device driver", e); - } - - return driver; -} - - -void listDevices(void) -{ - // Get list of devices - std::vector devices; - getDeviceList(devices); - - // Print device names - if (devices.size() == 0) - { - std::cout << "No devices found." << std::endl; - } - else - { - std::cout << std::endl; - std::cout << "Devices:" << std::endl; - for (unsigned i = 0; i < devices.size(); i++) - { - std::cout << i << ": " << getDeviceName(devices[i]) << std::endl; - } - std::cout << std::endl; - } -} - diff --git a/runcuda.sh b/runcuda.sh deleted file mode 100755 index 7acf5c1..0000000 --- a/runcuda.sh +++ /dev/null @@ -1,4 +0,0 @@ -./gpu-stream-cuda -./gpu-stream-cuda --groups 64 --groupSize 256 -./gpu-stream-cuda --float -./gpu-stream-cuda --float --groups 64 --groupSize 256 diff --git a/runhip.sh b/runhip.sh deleted file mode 100755 index b84e970..0000000 --- a/runhip.sh +++ /dev/null @@ -1,4 +0,0 @@ -./gpu-stream-hip -./gpu-stream-hip --groups 64 --groupSize 256 -./gpu-stream-hip --float -./gpu-stream-hip --float --groups 64 --groupSize 256