Merge branch 'refactor'

Tom Deakin 2016-06-30 16:51:40 +01:00
commit eda9d56ed9
34 changed files with 2469 additions and 2023 deletions

16
.gitignore vendored

@ -1,8 +1,20 @@
common.h
gpu-stream-cuda
gpu-stream-ocl
gpu-stream-acc
gpu-stream-omp3
gpu-stream-omp45
gpu-stream-sycl
*.o
*.tar
*.gz
.DS_Store
CMakeCache.txt
CMakeFiles/
cmake_install.cmake
Makefile

141
ACCStream.cpp Normal file

@ -0,0 +1,141 @@
// Copyright (c) 2015-16 Tom Deakin, Simon McIntosh-Smith,
// University of Bristol HPC
//
// For full license terms please see the LICENSE file distributed with this
// source code
#include "ACCStream.h"
template <class T>
ACCStream<T>::ACCStream(const unsigned int ARRAY_SIZE, T *a, T *b, T *c, int device)
{
acc_set_device_num(device, acc_device_nvidia);
array_size = ARRAY_SIZE;
// Set up data region on device
this->a = a;
this->b = b;
this->c = c;
#pragma acc enter data create(a[0:array_size], b[0:array_size], c[0:array_size])
{}
}
template <class T>
ACCStream<T>::~ACCStream()
{
// End data region on device
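// Local copies of the members are taken first, since OpenACC data clauses cannot portably name this-> members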
unsigned int array_size = this->array_size;
T *a = this->a;
T *b = this->b;
T *c = this->c;
#pragma acc exit data delete(a[0:array_size], b[0:array_size], c[0:array_size])
{}
}
template <class T>
void ACCStream<T>::write_arrays(const std::vector<T>& h_a, const std::vector<T>& h_b, const std::vector<T>& h_c)
{
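// Note: the h_* vectors are assumed to alias the pointers given at construction;
// the update directive below pushes the current host contents to the device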
T *a = this->a;
T *b = this->b;
T *c = this->c;
#pragma acc update device(a[0:array_size], b[0:array_size], c[0:array_size])
{}
}
template <class T>
void ACCStream<T>::read_arrays(std::vector<T>& h_a, std::vector<T>& h_b, std::vector<T>& h_c)
{
T *a = this->a;
T *b = this->b;
T *c = this->c;
#pragma acc update host(a[0:array_size], b[0:array_size], c[0:array_size])
{}
}
template <class T>
void ACCStream<T>::copy()
{
unsigned int array_size = this->array_size;
T * restrict a = this->a;
T * restrict c = this->c;
#pragma acc kernels present(a[0:array_size], c[0:array_size]) wait
for (int i = 0; i < array_size; i++)
{
c[i] = a[i];
}
}
template <class T>
void ACCStream<T>::mul()
{
const T scalar = 0.3;
unsigned int array_size = this->array_size;
T * restrict b = this->b;
T * restrict c = this->c;
#pragma acc kernels present(b[0:array_size], c[0:array_size]) wait
for (int i = 0; i < array_size; i++)
{
b[i] = scalar * c[i];
}
}
template <class T>
void ACCStream<T>::add()
{
unsigned int array_size = this->array_size;
T * restrict a = this->a;
T * restrict b = this->b;
T * restrict c = this->c;
#pragma acc kernels present(a[0:array_size], b[0:array_size], c[0:array_size]) wait
for (int i = 0; i < array_size; i++)
{
c[i] = a[i] + b[i];
}
}
template <class T>
void ACCStream<T>::triad()
{
const T scalar = 0.3;
unsigned int array_size = this->array_size;
T * restrict a = this->a;
T * restrict b = this->b;
T * restrict c = this->c;
#pragma acc kernels present(a[0:array_size], b[0:array_size], c[0:array_size]) wait
for (int i = 0; i < array_size; i++)
{
a[i] = b[i] + scalar * c[i];
}
}
void listDevices(void)
{
// Get number of devices
int count = acc_get_num_devices(acc_device_nvidia);
// Print device list
if (count == 0)
{
std::cerr << "No devices found." << std::endl;
}
else
{
std::cout << "There are " << count << " devices." << std::endl;
}
}
std::string getDeviceName(const int)
{
return std::string("Device name unavailable");
}
std::string getDeviceDriver(const int)
{
return std::string("Device driver unavailable");
}
template class ACCStream<float>;
template class ACCStream<double>;

44
ACCStream.h Normal file

@ -0,0 +1,44 @@
// Copyright (c) 2015-16 Tom Deakin, Simon McIntosh-Smith,
// University of Bristol HPC
//
// For full license terms please see the LICENSE file distributed with this
// source code
#pragma once
#include <iostream>
#include <stdexcept>
#include "Stream.h"
#include <openacc.h>
#define IMPLEMENTATION_STRING "OpenACC"
template <class T>
class ACCStream : public Stream<T>
{
protected:
// Size of arrays
unsigned int array_size;
// Device side pointers
T *a;
T *b;
T *c;
public:
ACCStream(const unsigned int, T*, T*, T*, int);
~ACCStream();
virtual void copy() override;
virtual void add() override;
virtual void mul() override;
virtual void triad() override;
virtual void write_arrays(const std::vector<T>& a, const std::vector<T>& b, const std::vector<T>& c) override;
virtual void read_arrays(std::vector<T>& a, std::vector<T>& b, std::vector<T>& c) override;
};


@ -28,11 +28,11 @@
/*! \file
*
* \brief C++ bindings for OpenCL 1.0 (rev 48), OpenCL 1.1 (rev 33),
* OpenCL 1.2 (rev 15) and OpenCL 2.0 (rev 29)
* \author Lee Howes and Bruce Merry
*
* Derived from the OpenCL 1.x C++ bindings written by
* Benedict R. Gaster, Laurent Morichetti and Lee Howes
* With additions and fixes from:
* Brian Cole, March 3rd 2010 and April 2012
@ -52,6 +52,18 @@
* #define CL_HPP_USE_DX_INTEROP
* cl_khr_sub_groups
* #define CL_HPP_USE_CL_SUB_GROUPS_KHR
*
* Doxygen documentation for this header is available here:
*
* http://khronosgroup.github.io/OpenCL-CLHPP/
*
* The latest version of this header can be found on the GitHub releases page:
*
* https://github.com/KhronosGroup/OpenCL-CLHPP/releases
*
* Bugs and patches can be submitted to the GitHub repository:
*
* https://github.com/KhronosGroup/OpenCL-CLHPP
*/
/*! \mainpage
@ -134,41 +146,64 @@
*
* \section parameterization Parameters
* This header may be parameterized by a set of preprocessor macros.
* CL_HPP_TARGET_OPENCL_VERSION
* - Defines the target OpenCL runtime version to build the header against.
* Defaults to 200, representing OpenCL 2.0.
* CL_HPP_NO_STD_STRING
* - Do not use the standard library string class.
* cl::string is not defined and may be defined by the user before
* cl2.hpp is included.
* CL_HPP_NO_STD_VECTOR
* - Do not use the standard library vector class.
* cl::vector is not defined and may be defined by the user before
* cl2.hpp is included.
* CL_HPP_NO_STD_ARRAY
* - Do not use the standard library array class.
* cl::array is not defined and may be defined by the user before
* cl2.hpp is included.
* CL_HPP_NO_STD_UNIQUE_PTR
* - Do not use the standard library unique_ptr class.
* cl::pointer and the cl::allocate_pointer function are not defined
* and may be defined by the user before cl2.hpp is included.
* CL_HPP_ENABLE_DEVICE_FISSION
* - Enables device fission for OpenCL 1.2 platforms
* CL_HPP_ENABLE_EXCEPTIONS
* - Enable exceptions for use in the C++ bindings header.
* This is the preferred error handling mechanism but is not required.
* CL_HPP_ENABLE_SIZE_T_COMPATIBILITY
* - Backward compatibility option to support cl.hpp-style size_t class.
* Replaces the updated std::array derived version and removal of size_t
* from the namespace. Note that in this case the new size_t class
* is placed in the cl::compatibility namespace and thus requires
* an additional using declaration for direct backward compatibility.
* CL_HPP_ENABLE_PROGRAM_CONSTRUCTION_FROM_ARRAY_COMPATIBILITY
* - Enable older vector of pairs interface for construction of programs.
* CL_HPP_CL_1_2_DEFAULT_BUILD
* - Default to OpenCL C 1.2 compilation rather than OpenCL C 2.0
* - applies to use of cl::Program construction and other program build variants.
*
* - CL_HPP_TARGET_OPENCL_VERSION
*
* Defines the target OpenCL runtime version to build the header
* against. Defaults to 200, representing OpenCL 2.0.
*
* - CL_HPP_NO_STD_STRING
*
* Do not use the standard library string class. cl::string is not
* defined and may be defined by the user before cl2.hpp is
* included.
*
* - CL_HPP_NO_STD_VECTOR
*
* Do not use the standard library vector class. cl::vector is not
* defined and may be defined by the user before cl2.hpp is
* included.
*
* - CL_HPP_NO_STD_ARRAY
*
* Do not use the standard library array class. cl::array is not
* defined and may be defined by the user before cl2.hpp is
* included.
*
* - CL_HPP_NO_STD_UNIQUE_PTR
*
* Do not use the standard library unique_ptr class. cl::pointer and
* the cl::allocate_pointer functions are not defined and may be
* defined by the user before cl2.hpp is included.
*
* - CL_HPP_ENABLE_DEVICE_FISSION
*
* Enables device fission for OpenCL 1.2 platforms.
*
* - CL_HPP_ENABLE_EXCEPTIONS
*
* Enable exceptions for use in the C++ bindings header. This is the
* preferred error handling mechanism but is not required.
*
* - CL_HPP_ENABLE_SIZE_T_COMPATIBILITY
*
* Backward compatibility option to support cl.hpp-style size_t
* class. Replaces the updated std::array derived version and
* removal of size_t from the namespace. Note that in this case the
* new size_t class is placed in the cl::compatibility namespace and
* thus requires an additional using declaration for direct backward
* compatibility.
*
* - CL_HPP_ENABLE_PROGRAM_CONSTRUCTION_FROM_ARRAY_COMPATIBILITY
*
* Enable older vector of pairs interface for construction of
* programs.
*
* - CL_HPP_CL_1_2_DEFAULT_BUILD
*
* Default to OpenCL C 1.2 compilation rather than OpenCL C 2.0
* applies to use of cl::Program construction and other program
* build variants.
*
*
* \section example Example
@ -177,19 +212,19 @@
* bindings, including support for the optional exception feature and
* also the supplied vector and string classes; see the following sections for
* descriptions of these features.
*
* \code
#define CL_HPP_ENABLE_EXCEPTIONS
#define CL_HPP_TARGET_OPENCL_VERSION 200
#include <CL/cl2.hpp>
#include <iostream>
#include <vector>
#include <memory>
#include <algorithm>
const int numElements = 32;
int main(void)
{
// Filter for a 2.0 platform and set it as the default
@ -212,35 +247,45 @@
std::cout << "Error setting default platform.";
return -1;
}
std::string kernel1{
"global int globalA;"
"kernel void updateGlobal(){"
" globalA = 75;"
"}"};
std::string kernel2{
"typedef struct { global int *bar; } Foo; kernel void vectorAdd(global const Foo* aNum, global const int *inputA, global const int *inputB, global int *output, int val, write_only pipe int outPipe, queue_t childQueue){"
" output[get_global_id(0)] = inputA[get_global_id(0)] + inputB[get_global_id(0)] + val + *(aNum->bar);"
" write_pipe(outPipe, &val);"
" queue_t default_queue = get_default_queue(); "
" ndrange_t ndrange = ndrange_1D(get_global_size(0)/2, get_global_size(0)/2); "
// Have a child kernel write into third quarter of output
" enqueue_kernel(default_queue, CLK_ENQUEUE_FLAGS_WAIT_KERNEL, ndrange, "
" ^{"
" output[get_global_size(0)*2 + get_global_id(0)] = inputA[get_global_size(0)*2+get_global_id(0)] + inputB[get_global_size(0)*2+get_global_id(0)] + globalA;"
" });"
// Have a child kernel write into last quarter of output
" enqueue_kernel(childQueue, CLK_ENQUEUE_FLAGS_WAIT_KERNEL, ndrange, "
" ^{"
" output[get_global_size(0)*3 + get_global_id(0)] = inputA[get_global_size(0)*3 + get_global_id(0)] + inputB[get_global_size(0)*3 + get_global_id(0)] + globalA + 2;"
" });"
"}" };
// Use C++11 raw string literals for kernel source code
std::string kernel1{R"CLC(
global int globalA;
kernel void updateGlobal()
{
globalA = 75;
}
)CLC"};
std::string kernel2{R"CLC(
typedef struct { global int *bar; } Foo;
kernel void vectorAdd(global const Foo* aNum, global const int *inputA, global const int *inputB,
global int *output, int val, write_only pipe int outPipe, queue_t childQueue)
{
output[get_global_id(0)] = inputA[get_global_id(0)] + inputB[get_global_id(0)] + val + *(aNum->bar);
write_pipe(outPipe, &val);
queue_t default_queue = get_default_queue();
ndrange_t ndrange = ndrange_1D(get_global_size(0)/2, get_global_size(0)/2);
// Have a child kernel write into third quarter of output
enqueue_kernel(default_queue, CLK_ENQUEUE_FLAGS_WAIT_KERNEL, ndrange,
^{
output[get_global_size(0)*2 + get_global_id(0)] =
inputA[get_global_size(0)*2 + get_global_id(0)] + inputB[get_global_size(0)*2 + get_global_id(0)] + globalA;
});
// Have a child kernel write into last quarter of output
enqueue_kernel(childQueue, CLK_ENQUEUE_FLAGS_WAIT_KERNEL, ndrange,
^{
output[get_global_size(0)*3 + get_global_id(0)] =
inputA[get_global_size(0)*3 + get_global_id(0)] + inputB[get_global_size(0)*3 + get_global_id(0)] + globalA + 2;
});
}
)CLC"};
// New simpler string interface style
std::vector<std::string> programStrings {kernel1, kernel2};
cl::Program vectorAddProgram(
programStrings);
cl::Program vectorAddProgram(programStrings);
try {
vectorAddProgram.build("-cl-std=CL2.0");
}
@ -251,7 +296,7 @@
for (auto &pair : buildInfo) {
std::cerr << pair.second << std::endl << std::endl;
}
return 1;
}
@ -264,17 +309,17 @@
program2Kernel(
cl::EnqueueArgs(
cl::NDRange(1)));
//////////////////
// SVM allocations
cl::pointer<int> anSVMInt = cl::allocate_svm<int, cl::SVMTraitCoarse<>>();
auto anSVMInt = cl::allocate_svm<int, cl::SVMTraitCoarse<>>();
*anSVMInt = 5;
cl::SVMAllocator<int, cl::SVMTraitCoarse<cl::SVMTraitReadOnly<>>> svmAllocReadOnly;
cl::SVMAllocator<Foo, cl::SVMTraitCoarse<cl::SVMTraitReadOnly<>>> svmAllocReadOnly;
auto fooPointer = cl::allocate_pointer<Foo>(svmAllocReadOnly);
fooPointer->bar = anSVMInt.get();
cl::SVMAllocator<int, cl::SVMTraitCoarse<>> svmAlloc;
std::vector<int, cl::SVMAllocator<int, cl::SVMTraitCoarse<>>> inputA(numElements, 1, svmAlloc);
cl::coarse_svm_vector<int> inputB(numElements, 2, svmAlloc);
//
@ -284,7 +329,7 @@
std::vector<int> output(numElements, 0xdeadbeef);
cl::Buffer outputBuffer(begin(output), end(output), false);
cl::Pipe aPipe(sizeof(cl_int), numElements / 2);
// Default command queue, also passed in as a parameter
cl::DeviceCommandQueue defaultDeviceQueue = cl::DeviceCommandQueue::makeDefault(
cl::Context::getDefault(), cl::Device::getDefault());
@ -339,7 +384,7 @@
return 0;
}
*
* \endcode
*
*/
@ -3538,7 +3583,7 @@ template <class T, class Alloc, class... Args>
cl::pointer<T, detail::Deleter<Alloc>> allocate_pointer(const Alloc &alloc_, Args&&... args)
{
Alloc alloc(alloc_);
static const size_t copies = 1;
static const size_type copies = 1;
// Ensure that creation of the management block and the
// object are dealt with separately such that we only provide a deleter
@ -6520,7 +6565,7 @@ inline cl_int cl::Program::getInfo(cl_program_info name, vector<vector<unsigned
// Resize the parameter array and constituent arrays
param->resize(numBinaries);
for (int i = 0; i < numBinaries; ++i) {
for (size_type i = 0; i < numBinaries; ++i) {
(*param)[i].resize(sizes[i]);
}
@ -7107,7 +7152,7 @@ public:
size_type buffer_slice_pitch,
size_type host_row_pitch,
size_type host_slice_pitch,
void *ptr,
const void *ptr,
const vector<Event>* events = NULL,
Event* event = NULL) const
{
@ -7255,7 +7300,7 @@ public:
const array<size_type, 3>& region,
size_type row_pitch,
size_type slice_pitch,
void* ptr,
const void* ptr,
const vector<Event>* events = NULL,
Event* event = NULL) const
{
@ -7845,7 +7890,7 @@ public:
CL_EXT_PREFIX__VERSION_1_2_DEPRECATED cl_int enqueueTask(
const Kernel& kernel,
const vector<Event>* events = NULL,
Event* event = NULL) CL_EXT_SUFFIX__VERSION_1_2_DEPRECATED const
Event* event = NULL) const CL_EXT_SUFFIX__VERSION_1_2_DEPRECATED
{
cl_event tmp;
cl_int err = detail::errHandler(
@ -8873,7 +8918,7 @@ inline cl_int enqueueWriteBufferRect(
size_type buffer_slice_pitch,
size_type host_row_pitch,
size_type host_slice_pitch,
void *ptr,
const void *ptr,
const vector<Event>* events = NULL,
Event* event = NULL)
{
@ -8971,7 +9016,7 @@ inline cl_int enqueueWriteImage(
const array<size_type, 3>& region,
size_type row_pitch,
size_type slice_pitch,
void* ptr,
const void* ptr,
const vector<Event>* events = NULL,
Event* event = NULL)
{

164
CMakeLists.txt Normal file

@ -0,0 +1,164 @@
cmake_minimum_required(VERSION 3.2)
set(CMAKE_CXX_STANDARD 11)
set(CMAKE_CXX_STANDARD_REQUIRED ON)
project(gpu-stream)
include(CheckIncludeFileCXX)
include(CheckCXXCompilerFlag)
set(gpu-stream_VERSION_MAJOR 2)
set(gpu-stream_VERSION_MINOR 0)
configure_file(common.h.in common.h)
# If using the Cray compiler, manually add the C++11 flag because setting the
# standard through CMake as above doesn't set this flag with Cray
if ("${CMAKE_CXX_COMPILER_ID}" MATCHES "Cray")
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -hstd=c++11")
endif ()
#-------------------------------------------------------------------------------
# CUDA
#-------------------------------------------------------------------------------
find_package(CUDA 7.0 QUIET)
set(FLAG True)
if ("${CMAKE_SYSTEM_NAME}" MATCHES "Darwin")
execute_process(COMMAND xcodebuild -version COMMAND head -n 1 OUTPUT_VARIABLE XCODE_VERSION)
if ("${XCODE_VERSION}" MATCHES "Xcode 7.3.1")
message("Xcode version not supported by CUDA")
set(FLAG False)
endif ()
endif ()
if (${FLAG} AND ${CUDA_FOUND})
list(APPEND CUDA_NVCC_FLAGS --std=c++11)
cuda_add_executable(gpu-stream-cuda main.cpp CUDAStream.cu)
target_compile_definitions(gpu-stream-cuda PUBLIC CUDA)
else ()
message("Skipping CUDA...")
endif ()
#-------------------------------------------------------------------------------
# OpenCL
#-------------------------------------------------------------------------------
find_package(OpenCL QUIET)
if (${OpenCL_FOUND})
add_executable(gpu-stream-ocl main.cpp OCLStream.cpp)
target_compile_definitions(gpu-stream-ocl PUBLIC OCL)
target_link_libraries(gpu-stream-ocl ${OpenCL_LIBRARY})
else ()
message("Skipping OpenCL...")
endif ()
#-------------------------------------------------------------------------------
# OpenACC
#-------------------------------------------------------------------------------
# Check compiler supports an OpenACC flag
include(CheckCXXCompilerFlag)
if ("${CMAKE_CXX_COMPILER_ID}" MATCHES "GNU")
CHECK_CXX_COMPILER_FLAG(-fopenacc OPENACC)
if (OPENACC)
set(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} -fopenacc")
endif ()
elseif ("${CMAKE_CXX_COMPILER_ID}" MATCHES "PGI")
CHECK_CXX_COMPILER_FLAG(-acc OPENACC)
elseif ("${CMAKE_CXX_COMPILER_ID}" MATCHES "Cray")
CHECK_CXX_COMPILER_FLAG(-hacc=openacc OPENACC)
endif ()
if (OPENACC)
add_executable(gpu-stream-acc main.cpp ACCStream.cpp)
target_compile_definitions(gpu-stream-acc PUBLIC ACC)
else ()
message("Skipping OpenACC...")
endif ()
#-------------------------------------------------------------------------------
# OpenMP 3.0
#-------------------------------------------------------------------------------
find_package(OpenMP QUIET)
if (OPENMP_FOUND)
add_executable(gpu-stream-omp3 main.cpp OMP3Stream.cpp)
target_compile_definitions(gpu-stream-omp3 PUBLIC OMP3)
set_target_properties(gpu-stream-omp3 PROPERTIES COMPILE_FLAGS "${OpenMP_CXX_FLAGS}" LINK_FLAGS "${OpenMP_CXX_FLAGS}")
else ()
message("Skipping OpenMP 3...")
endif ()
#-------------------------------------------------------------------------------
# OpenMP 4.5
#-------------------------------------------------------------------------------
if ("${CMAKE_CXX_COMPILER_ID}" MATCHES "Cray")
if (CMAKE_CXX_COMPILER_VERSION VERSION_GREATER 4.5)
add_executable(gpu-stream-omp45 main.cpp OMP45Stream.cpp)
target_compile_definitions(gpu-stream-omp45 PUBLIC OMP45)
endif ()
endif ()
#-------------------------------------------------------------------------------
# RAJA
#-------------------------------------------------------------------------------
if (RAJA_PATH)
find_package(OpenMP)
find_package(CUDA 7.5)
list(APPEND CUDA_NVCC_FLAGS "-arch compute_35")
list(APPEND CUDA_NVCC_FLAGS --expt-extended-lambda)
list(APPEND CUDA_NVCC_FLAGS -Xcompiler ${OpenMP_CXX_FLAGS})
list(APPEND CUDA_NVCC_FLAGS -DUSE_RAJA)
cuda_include_directories(${RAJA_PATH}/include)
set_source_files_properties(RAJAStream.cpp PROPERTIES CUDA_SOURCE_PROPERTY_FORMAT OBJ)
cuda_add_executable(gpu-stream-raja main.cpp RAJAStream.cpp)
target_compile_definitions(gpu-stream-raja PUBLIC USE_RAJA)
target_link_libraries(gpu-stream-raja "-L${RAJA_PATH}/lib -lRAJA")
else()
message("Skipping RAJA... (use -DRAJA_PATH=/path/to/raja to opt in)")
endif()
#-------------------------------------------------------------------------------
# Kokkos
#-------------------------------------------------------------------------------
if (KOKKOS_PATH)
if ("${CMAKE_SYSTEM_NAME}" MATCHES "Linux")
add_custom_target(gpu-stream-kokkos COMMAND make -f KokkosMakefile KOKKOS_PATH=${KOKKOS_PATH})
else()
message("Skipping Kokkos (requires Linux)")
endif()
else()
message("Skipping Kokkos... (use -DKOKKOS_PATH=/path/to/kokkos to opt in)")
endif()
#-------------------------------------------------------------------------------
# SYCL
#-------------------------------------------------------------------------------
if ("${CMAKE_CXX_COMPILER_ID}" MATCHES "Clang" OR
"${CMAKE_CXX_COMPILER_ID}" MATCHES "GNU")
# Use C++14 if available, otherwise drop back to C++11
check_cxx_compiler_flag("-std=c++14" CXX14)
if (CXX14)
set(CMAKE_REQUIRED_FLAGS "-std=c++14")
else()
set(CMAKE_REQUIRED_FLAGS "-std=c++11")
endif()
endif()
check_include_file_cxx("CL/sycl.hpp" HAS_SYCL)
if (HAS_SYCL)
add_executable(gpu-stream-sycl main.cpp SYCLStream.cpp)
target_compile_definitions(gpu-stream-sycl PUBLIC SYCL)
find_program(COMPUTECPP "compute++")
if (COMPUTECPP)
message(STATUS "Using ComputeCpp for SYCL compilation")
add_custom_target(SYCLStream.sycl COMMAND ${COMPUTECPP} ${CMAKE_CURRENT_SOURCE_DIR}/SYCLStream.cpp -sycl -no-serial-memop -O2 -emit-llvm -c)
add_dependencies(gpu-stream-sycl SYCLStream.sycl)
target_compile_options(gpu-stream-sycl PUBLIC -include SYCLStream.sycl)
target_link_libraries(gpu-stream-sycl SYCL OpenCL)
else()
message(STATUS "Using header-only SYCL implementation")
set_property(TARGET gpu-stream-sycl PROPERTY CXX_STANDARD 14)
endif(COMPUTECPP)
else ()
message("Skipping SYCL...")
endif (HAS_SYCL)

214
CUDAStream.cu Normal file

@ -0,0 +1,214 @@
// Copyright (c) 2015-16 Tom Deakin, Simon McIntosh-Smith,
// University of Bristol HPC
//
// For full license terms please see the LICENSE file distributed with this
// source code
#include "CUDAStream.h"
#define TBSIZE 1024
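// Threads per block for every kernel launch; the array size must be a multiple of TBSIZE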
void check_error(void)
{
cudaError_t err = cudaGetLastError();
if (err != cudaSuccess)
{
std::cerr << "Error: " << cudaGetErrorString(err) << std::endl;
exit(err);
}
}
template <class T>
CUDAStream<T>::CUDAStream(const unsigned int ARRAY_SIZE, const int device_index)
{
// The array size must be divisible by TBSIZE for kernel launches
if (ARRAY_SIZE % TBSIZE != 0)
{
std::stringstream ss;
ss << "Array size must be a multiple of " << TBSIZE;
throw std::runtime_error(ss.str());
}
// Set device
int count;
cudaGetDeviceCount(&count);
check_error();
if (device_index >= count)
throw std::runtime_error("Invalid device index");
cudaSetDevice(device_index);
check_error();
// Print out device information
std::cout << "Using OpenCL device " << getDeviceName(device_index) << std::endl;
std::cout << "Driver: " << getDeviceDriver(device_index) << std::endl;
array_size = ARRAY_SIZE;
// Check buffers fit on the device
cudaDeviceProp props;
cudaGetDeviceProperties(&props, device_index);
if (props.totalGlobalMem < 3*ARRAY_SIZE*sizeof(T))
throw std::runtime_error("Device does not have enough memory for all 3 buffers");
// Create device buffers
cudaMalloc(&d_a, ARRAY_SIZE*sizeof(T));
check_error();
cudaMalloc(&d_b, ARRAY_SIZE*sizeof(T));
check_error();
cudaMalloc(&d_c, ARRAY_SIZE*sizeof(T));
check_error();
}
template <class T>
CUDAStream<T>::~CUDAStream()
{
cudaFree(d_a);
check_error();
cudaFree(d_b);
check_error();
cudaFree(d_c);
check_error();
}
template <class T>
void CUDAStream<T>::write_arrays(const std::vector<T>& a, const std::vector<T>& b, const std::vector<T>& c)
{
// Copy host memory to device
cudaMemcpy(d_a, a.data(), a.size()*sizeof(T), cudaMemcpyHostToDevice);
check_error();
cudaMemcpy(d_b, b.data(), b.size()*sizeof(T), cudaMemcpyHostToDevice);
check_error();
cudaMemcpy(d_c, c.data(), c.size()*sizeof(T), cudaMemcpyHostToDevice);
check_error();
}
template <class T>
void CUDAStream<T>::read_arrays(std::vector<T>& a, std::vector<T>& b, std::vector<T>& c)
{
// Copy device memory to host
cudaMemcpy(a.data(), d_a, a.size()*sizeof(T), cudaMemcpyDeviceToHost);
check_error();
cudaMemcpy(b.data(), d_b, b.size()*sizeof(T), cudaMemcpyDeviceToHost);
check_error();
cudaMemcpy(c.data(), d_c, c.size()*sizeof(T), cudaMemcpyDeviceToHost);
check_error();
}
template <typename T>
__global__ void copy_kernel(const T * a, T * c)
{
const int i = blockDim.x * blockIdx.x + threadIdx.x;
c[i] = a[i];
}
template <class T>
void CUDAStream<T>::copy()
{
copy_kernel<<<array_size/TBSIZE, TBSIZE>>>(d_a, d_c);
check_error();
cudaDeviceSynchronize();
check_error();
}
template <typename T>
__global__ void mul_kernel(T * b, const T * c)
{
const T scalar = 0.3;
const int i = blockDim.x * blockIdx.x + threadIdx.x;
b[i] = scalar * c[i];
}
template <class T>
void CUDAStream<T>::mul()
{
mul_kernel<<<array_size/TBSIZE, TBSIZE>>>(d_b, d_c);
check_error();
cudaDeviceSynchronize();
check_error();
}
template <typename T>
__global__ void add_kernel(const T * a, const T * b, T * c)
{
const int i = blockDim.x * blockIdx.x + threadIdx.x;
c[i] = a[i] + b[i];
}
template <class T>
void CUDAStream<T>::add()
{
add_kernel<<<array_size/TBSIZE, TBSIZE>>>(d_a, d_b, d_c);
check_error();
cudaDeviceSynchronize();
check_error();
}
template <typename T>
__global__ void triad_kernel(T * a, const T * b, const T * c)
{
const T scalar = 0.3;
const int i = blockDim.x * blockIdx.x + threadIdx.x;
a[i] = b[i] + scalar * c[i];
}
template <class T>
void CUDAStream<T>::triad()
{
triad_kernel<<<array_size/TBSIZE, TBSIZE>>>(d_a, d_b, d_c);
check_error();
cudaDeviceSynchronize();
check_error();
}
void listDevices(void)
{
// Get number of devices
int count;
cudaGetDeviceCount(&count);
check_error();
// Print device names
if (count == 0)
{
std::cerr << "No devices found." << std::endl;
}
else
{
std::cout << std::endl;
std::cout << "Devices:" << std::endl;
for (int i = 0; i < count; i++)
{
std::cout << i << ": " << getDeviceName(i) << std::endl;
}
std::cout << std::endl;
}
}
std::string getDeviceName(const int device)
{
cudaDeviceProp props;
cudaGetDeviceProperties(&props, device);
check_error();
return std::string(props.name);
}
std::string getDeviceDriver(const int device)
{
cudaSetDevice(device);
check_error();
int driver;
cudaDriverGetVersion(&driver);
check_error();
return std::to_string(driver);
}
template class CUDAStream<float>;
template class CUDAStream<double>;

43
CUDAStream.h Normal file

@ -0,0 +1,43 @@
// Copyright (c) 2015-16 Tom Deakin, Simon McIntosh-Smith,
// University of Bristol HPC
//
// For full license terms please see the LICENSE file distributed with this
// source code
#pragma once
#include <iostream>
#include <stdexcept>
#include <sstream>
#include "Stream.h"
#define IMPLEMENTATION_STRING "CUDA"
template <class T>
class CUDAStream : public Stream<T>
{
protected:
// Size of arrays
unsigned int array_size;
// Device side pointers to arrays
T *d_a;
T *d_b;
T *d_c;
public:
CUDAStream(const unsigned int, const int);
~CUDAStream();
virtual void copy() override;
virtual void add() override;
virtual void mul() override;
virtual void triad() override;
virtual void write_arrays(const std::vector<T>& a, const std::vector<T>& b, const std::vector<T>& c) override;
virtual void read_arrays(std::vector<T>& a, std::vector<T>& b, std::vector<T>& c) override;
};

142
KOKKOSStream.cpp Normal file

@ -0,0 +1,142 @@
// Copyright (c) 2015-16 Tom Deakin, Simon McIntosh-Smith,
// University of Bristol HPC
//
// For full license terms please see the LICENSE file distributed with this
// source code
#include "KOKKOSStream.hpp"
using namespace Kokkos;
template <class T>
KOKKOSStream<T>::KOKKOSStream(
const unsigned int ARRAY_SIZE, const int device_index)
: array_size(ARRAY_SIZE)
{
Kokkos::initialize();
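// Views and host mirrors are held by pointer so they can be constructed here, after Kokkos::initialize()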
d_a = new View<double*, DEVICE>("d_a", ARRAY_SIZE);
d_b = new View<double*, DEVICE>("d_b", ARRAY_SIZE);
d_c = new View<double*, DEVICE>("d_c", ARRAY_SIZE);
hm_a = new View<double*, DEVICE>::HostMirror();
hm_b = new View<double*, DEVICE>::HostMirror();
hm_c = new View<double*, DEVICE>::HostMirror();
*hm_a = create_mirror_view(*d_a);
*hm_b = create_mirror_view(*d_b);
*hm_c = create_mirror_view(*d_c);
}
template <class T>
KOKKOSStream<T>::~KOKKOSStream()
{
finalize();
}
template <class T>
void KOKKOSStream<T>::write_arrays(
const std::vector<T>& a, const std::vector<T>& b, const std::vector<T>& c)
{
for(int ii = 0; ii < array_size; ++ii)
{
(*hm_a)(ii) = a[ii];
(*hm_b)(ii) = b[ii];
(*hm_c)(ii) = c[ii];
}
deep_copy(*d_a, *hm_a);
deep_copy(*d_b, *hm_b);
deep_copy(*d_c, *hm_c);
}
template <class T>
void KOKKOSStream<T>::read_arrays(
std::vector<T>& a, std::vector<T>& b, std::vector<T>& c)
{
deep_copy(*hm_a, *d_a);
deep_copy(*hm_b, *d_b);
deep_copy(*hm_c, *d_c);
for(int ii = 0; ii < array_size; ++ii)
{
a[ii] = (*hm_a)(ii);
b[ii] = (*hm_b)(ii);
c[ii] = (*hm_c)(ii);
}
}
template <class T>
void KOKKOSStream<T>::copy()
{
View<double*, DEVICE> a(*d_a);
View<double*, DEVICE> b(*d_b);
View<double*, DEVICE> c(*d_c);
parallel_for(array_size, KOKKOS_LAMBDA (const int index)
{
c[index] = a[index];
});
Kokkos::fence();
}
template <class T>
void KOKKOSStream<T>::mul()
{
View<double*, DEVICE> a(*d_a);
View<double*, DEVICE> b(*d_b);
View<double*, DEVICE> c(*d_c);
const T scalar = 0.3;
parallel_for(array_size, KOKKOS_LAMBDA (const int index)
{
b[index] = scalar*c[index];
});
Kokkos::fence();
}
template <class T>
void KOKKOSStream<T>::add()
{
View<double*, DEVICE> a(*d_a);
View<double*, DEVICE> b(*d_b);
View<double*, DEVICE> c(*d_c);
parallel_for(array_size, KOKKOS_LAMBDA (const int index)
{
c[index] = a[index] + b[index];
});
Kokkos::fence();
}
template <class T>
void KOKKOSStream<T>::triad()
{
View<double*, DEVICE> a(*d_a);
View<double*, DEVICE> b(*d_b);
View<double*, DEVICE> c(*d_c);
const T scalar = 0.3;
parallel_for(array_size, KOKKOS_LAMBDA (const int index)
{
a[index] = b[index] + scalar*c[index];
});
Kokkos::fence();
}
void listDevices(void)
{
std::cout << "This is not the device you are looking for.";
}
std::string getDeviceName(const int device)
{
return "Kokkos";
}
std::string getDeviceDriver(const int device)
{
return "Kokkos";
}
//template class KOKKOSStream<float>;
template class KOKKOSStream<double>;

56
KOKKOSStream.hpp Normal file

@ -0,0 +1,56 @@
// Copyright (c) 2015-16 Tom Deakin, Simon McIntosh-Smith,
// University of Bristol HPC
//
// For full license terms please see the LICENSE file distributed with this
// source code
#pragma once
#include <iostream>
#include <stdexcept>
#include <Kokkos_Core.hpp>
#include <Kokkos_Parallel.hpp>
#include <Kokkos_View.hpp>
#include "Stream.h"
#define IMPLEMENTATION_STRING "KOKKOS"
#ifdef KOKKOS_TARGET_CPU
#define DEVICE Kokkos::OpenMP
#else
#define DEVICE Kokkos::Cuda
#endif
template <class T>
class KOKKOSStream : public Stream<T>
{
protected:
// Size of arrays
unsigned int array_size;
// Device side pointers to arrays
Kokkos::View<double*, DEVICE>* d_a;
Kokkos::View<double*, DEVICE>* d_b;
Kokkos::View<double*, DEVICE>* d_c;
Kokkos::View<double*, DEVICE>::HostMirror* hm_a;
Kokkos::View<double*, DEVICE>::HostMirror* hm_b;
Kokkos::View<double*, DEVICE>::HostMirror* hm_c;
public:
KOKKOSStream(const unsigned int, const int);
~KOKKOSStream();
virtual void copy() override;
virtual void add() override;
virtual void mul() override;
virtual void triad() override;
virtual void write_arrays(
const std::vector<T>& a, const std::vector<T>& b, const std::vector<T>& c) override;
virtual void read_arrays(
std::vector<T>& a, std::vector<T>& b, std::vector<T>& c) override;
};

11
KokkosMakefile Normal file

@ -0,0 +1,11 @@
default: gpu-stream-kokkos
include $(KOKKOS_PATH)/Makefile.kokkos
gpu-stream-kokkos: main.o KOKKOSStream.o
$(CXX) $(KOKKOS_LDFLAGS) $^ $(KOKKOS_LIBS) -o $@ -DKOKKOS
%.o:%.cpp $(KOKKOS_CPP_DEPENDS)
$(NVCC_WRAPPER) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) -c $< -DKOKKOS


@ -1,6 +1,6 @@
*==============================================================================
*------------------------------------------------------------------------------
* Copyright 2015: Tom Deakin, Simon McIntosh-Smith, University of Bristol HPC
* Copyright 2015-16: Tom Deakin, Simon McIntosh-Smith, University of Bristol HPC
* Based on John D. McCalpins original STREAM benchmark for CPUs
*------------------------------------------------------------------------------
* License:


@ -1,41 +0,0 @@
LDLIBS = -l OpenCL
CXXFLAGS = -std=c++11 -O3
PLATFORM = $(shell uname -s)
ifeq ($(PLATFORM), Darwin)
LDLIBS = -framework OpenCL
endif
all: gpu-stream-ocl gpu-stream-cuda gpu-stream-hip
gpu-stream-ocl: ocl-stream.cpp common.o Makefile
$(CXX) $(CXXFLAGS) -Wno-deprecated-declarations common.o $< -o $@ $(LDLIBS)
common.o: common.cpp common.h Makefile
gpu-stream-cuda: cuda-stream.cu common.o Makefile
ifeq ($(shell which nvcc > /dev/null; echo $$?), 0)
nvcc $(CXXFLAGS) common.o $< -o $@
else
$(error "Cannot find nvcc, please install CUDA toolkit")
endif
HIP_PATH?=../../..
HIPCC=$(HIP_PATH)/bin/hipcc
hip-stream.o : hip-stream.cpp
$(HIPCC) $(CXXFLAGS) -c $< -o $@
gpu-stream-hip: hip-stream.o common.o Makefile
ifeq ($(shell which $(HIPCC) > /dev/null; echo $$?), 0)
$(HIPCC) $(CXXFLAGS) common.o $< -lm -o $@
else
$(error "Cannot find $(HIPCC), please install HIP toolkit")
endif
.PHONY: clean
clean:
rm -f gpu-stream-ocl gpu-stream-cuda gpu-stream-hip *.o

255
OCLStream.cpp Normal file

@ -0,0 +1,255 @@
// Copyright (c) 2015-16 Tom Deakin, Simon McIntosh-Smith,
// University of Bristol HPC
//
// For full license terms please see the LICENSE file distributed with this
// source code
#include "OCLStream.h"
// Cache list of devices
bool cached = false;
std::vector<cl::Device> devices;
void getDeviceList(void);
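// OpenCL kernel source; TYPE is supplied at build time via program.build("-DTYPE=...") in the constructor below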
std::string kernels{R"CLC(
constant TYPE scalar = 0.3;
kernel void copy(
global const TYPE * restrict a,
global TYPE * restrict c)
{
const size_t i = get_global_id(0);
c[i] = a[i];
}
kernel void mul(
global TYPE * restrict b,
global const TYPE * restrict c)
{
const size_t i = get_global_id(0);
b[i] = scalar * c[i];
}
kernel void add(
global const TYPE * restrict a,
global const TYPE * restrict b,
global TYPE * restrict c)
{
const size_t i = get_global_id(0);
c[i] = a[i] + b[i];
}
kernel void triad(
global TYPE * restrict a,
global const TYPE * restrict b,
global const TYPE * restrict c)
{
const size_t i = get_global_id(0);
a[i] = b[i] + scalar * c[i];
}
)CLC"};
template <class T>
OCLStream<T>::OCLStream(const unsigned int ARRAY_SIZE, const int device_index)
{
if (!cached)
getDeviceList();
// Select the requested OpenCL device
if (device_index >= devices.size())
throw std::runtime_error("Invalid device index");
device = devices[device_index];
// Print out device information
std::cout << "Using OpenCL device " << getDeviceName(device_index) << std::endl;
std::cout << "Driver: " << getDeviceDriver(device_index) << std::endl;
context = cl::Context(device);
queue = cl::CommandQueue(context);
// Create program
cl::Program program(context, kernels);
if (sizeof(T) == sizeof(double))
{
// Check device can do double
if (!device.getInfo<CL_DEVICE_DOUBLE_FP_CONFIG>())
throw std::runtime_error("Device does not support double precision, please use --float");
program.build("-DTYPE=double");
}
else if (sizeof(T) == sizeof(float))
program.build("-DTYPE=float");
// Create kernels
copy_kernel = new cl::KernelFunctor<cl::Buffer, cl::Buffer>(program, "copy");
mul_kernel = new cl::KernelFunctor<cl::Buffer, cl::Buffer>(program, "mul");
add_kernel = new cl::KernelFunctor<cl::Buffer, cl::Buffer, cl::Buffer>(program, "add");
triad_kernel = new cl::KernelFunctor<cl::Buffer, cl::Buffer, cl::Buffer>(program, "triad");
array_size = ARRAY_SIZE;
// Check buffers fit on the device
cl_ulong totalmem = device.getInfo<CL_DEVICE_GLOBAL_MEM_SIZE>();
cl_ulong maxbuffer = device.getInfo<CL_DEVICE_MAX_MEM_ALLOC_SIZE>();
if (maxbuffer < sizeof(T)*ARRAY_SIZE)
throw std::runtime_error("Device cannot allocate a buffer big enough");
if (totalmem < 3*sizeof(T)*ARRAY_SIZE)
throw std::runtime_error("Device does not have enough memory for all 3 buffers");
// Create buffers
d_a = cl::Buffer(context, CL_MEM_READ_WRITE, sizeof(T) * ARRAY_SIZE);
d_b = cl::Buffer(context, CL_MEM_READ_WRITE, sizeof(T) * ARRAY_SIZE);
d_c = cl::Buffer(context, CL_MEM_READ_WRITE, sizeof(T) * ARRAY_SIZE);
}
template <class T>
OCLStream<T>::~OCLStream()
{
delete copy_kernel;
delete mul_kernel;
delete add_kernel;
delete triad_kernel;
}
template <class T>
void OCLStream<T>::copy()
{
(*copy_kernel)(
cl::EnqueueArgs(queue, cl::NDRange(array_size)),
d_a, d_c
);
queue.finish();
}
template <class T>
void OCLStream<T>::mul()
{
(*mul_kernel)(
cl::EnqueueArgs(queue, cl::NDRange(array_size)),
d_b, d_c
);
queue.finish();
}
template <class T>
void OCLStream<T>::add()
{
(*add_kernel)(
cl::EnqueueArgs(queue, cl::NDRange(array_size)),
d_a, d_b, d_c
);
queue.finish();
}
template <class T>
void OCLStream<T>::triad()
{
(*triad_kernel)(
cl::EnqueueArgs(queue, cl::NDRange(array_size)),
d_a, d_b, d_c
);
queue.finish();
}
template <class T>
void OCLStream<T>::write_arrays(const std::vector<T>& a, const std::vector<T>& b, const std::vector<T>& c)
{
cl::copy(queue, a.begin(), a.end(), d_a);
cl::copy(queue, b.begin(), b.end(), d_b);
cl::copy(queue, c.begin(), c.end(), d_c);
}
template <class T>
void OCLStream<T>::read_arrays(std::vector<T>& a, std::vector<T>& b, std::vector<T>& c)
{
cl::copy(queue, d_a, a.begin(), a.end());
cl::copy(queue, d_b, b.begin(), b.end());
cl::copy(queue, d_c, c.begin(), c.end());
}
void getDeviceList(void)
{
// Get list of platforms
std::vector<cl::Platform> platforms;
cl::Platform::get(&platforms);
// Enumerate devices
for (unsigned i = 0; i < platforms.size(); i++)
{
std::vector<cl::Device> plat_devices;
platforms[i].getDevices(CL_DEVICE_TYPE_ALL, &plat_devices);
devices.insert(devices.end(), plat_devices.begin(), plat_devices.end());
}
cached = true;
}
void listDevices(void)
{
getDeviceList();
// Print device names
if (devices.size() == 0)
{
std::cerr << "No devices found." << std::endl;
}
else
{
std::cout << std::endl;
std::cout << "Devices:" << std::endl;
for (int i = 0; i < devices.size(); i++)
{
std::cout << i << ": " << getDeviceName(i) << std::endl;
}
std::cout << std::endl;
}
}
std::string getDeviceName(const int device)
{
if (!cached)
getDeviceList();
std::string name;
cl_device_info info = CL_DEVICE_NAME;
if (device < devices.size())
{
devices[device].getInfo(info, &name);
}
else
{
throw std::runtime_error("Error asking for name for non-existant device");
}
return name;
}
std::string getDeviceDriver(const int device)
{
if (!cached)
getDeviceList();
std::string driver;
if (device < devices.size())
{
devices[device].getInfo(CL_DRIVER_VERSION, &driver);
}
else
{
throw std::runtime_error("Error asking for driver for non-existant device");
}
return driver;
}
template class OCLStream<float>;
template class OCLStream<double>;

61
OCLStream.h Normal file

@ -0,0 +1,61 @@
// Copyright (c) 2015-16 Tom Deakin, Simon McIntosh-Smith,
// University of Bristol HPC
//
// For full license terms please see the LICENSE file distributed with this
// source code
#pragma once
#include <iostream>
#include <stdexcept>
#define CL_HPP_ENABLE_EXCEPTIONS
#define CL_HPP_TARGET_OPENCL_VERSION 120
#define CL_HPP_MINIMUM_OPENCL_VERSION 120
#include "CL/cl2.hpp"
#include "Stream.h"
#define IMPLEMENTATION_STRING "OpenCL"
template <class T>
class OCLStream : public Stream<T>
{
protected:
// Size of arrays
unsigned int array_size;
// Device side pointers to arrays
cl::Buffer d_a;
cl::Buffer d_b;
cl::Buffer d_c;
// OpenCL objects
cl::Device device;
cl::Context context;
cl::CommandQueue queue;
cl::KernelFunctor<cl::Buffer, cl::Buffer> *copy_kernel;
cl::KernelFunctor<cl::Buffer, cl::Buffer> *mul_kernel;
cl::KernelFunctor<cl::Buffer, cl::Buffer, cl::Buffer> *add_kernel;
cl::KernelFunctor<cl::Buffer, cl::Buffer, cl::Buffer> *triad_kernel;
public:
OCLStream(const unsigned int, const int);
~OCLStream();
virtual void copy() override;
virtual void add() override;
virtual void mul() override;
virtual void triad() override;
virtual void write_arrays(const std::vector<T>& a, const std::vector<T>& b, const std::vector<T>& c) override;
virtual void read_arrays(std::vector<T>& a, std::vector<T>& b, std::vector<T>& c) override;
};
// Populate the devices list
void getDeviceList(void);

111
OMP3Stream.cpp Normal file

@ -0,0 +1,111 @@
// Copyright (c) 2015-16 Tom Deakin, Simon McIntosh-Smith,
// University of Bristol HPC
//
// For full license terms please see the LICENSE file distributed with this
// source code
#include "OMP3Stream.h"
template <class T>
OMP3Stream<T>::OMP3Stream(const unsigned int ARRAY_SIZE, T *a, T *b, T *c)
{
array_size = ARRAY_SIZE;
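// Note: the incoming host pointers are unused; this implementation allocates its own arrays, populated later by write_arrays()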
this->a = (T*)malloc(sizeof(T)*array_size);
this->b = (T*)malloc(sizeof(T)*array_size);
this->c = (T*)malloc(sizeof(T)*array_size);
}
template <class T>
OMP3Stream<T>::~OMP3Stream()
{
free(a);
free(b);
free(c);
}
template <class T>
void OMP3Stream<T>::write_arrays(const std::vector<T>& h_a, const std::vector<T>& h_b, const std::vector<T>& h_c)
{
#pragma omp parallel for
for (int i = 0; i < array_size; i++)
{
a[i] = h_a[i];
b[i] = h_b[i];
c[i] = h_c[i];
}
}
template <class T>
void OMP3Stream<T>::read_arrays(std::vector<T>& h_a, std::vector<T>& h_b, std::vector<T>& h_c)
{
#pragma omp parallel for
for (int i = 0; i < array_size; i++)
{
h_a[i] = a[i];
h_b[i] = b[i];
h_c[i] = c[i];
}
}
template <class T>
void OMP3Stream<T>::copy()
{
#pragma omp parallel for
for (int i = 0; i < array_size; i++)
{
c[i] = a[i];
}
}
template <class T>
void OMP3Stream<T>::mul()
{
const T scalar = 0.3;
#pragma omp parallel for
for (int i = 0; i < array_size; i++)
{
b[i] = scalar * c[i];
}
}
template <class T>
void OMP3Stream<T>::add()
{
#pragma omp parallel for
for (int i = 0; i < array_size; i++)
{
c[i] = a[i] + b[i];
}
}
template <class T>
void OMP3Stream<T>::triad()
{
const T scalar = 0.3;
#pragma omp parallel for
for (int i = 0; i < array_size; i++)
{
a[i] = b[i] + scalar * c[i];
}
}
void listDevices(void)
{
std::cout << "0: CPU" << std::endl;
}
std::string getDeviceName(const int)
{
return std::string("Device name unavailable");
}
std::string getDeviceDriver(const int)
{
return std::string("Device driver unavailable");
}
template class OMP3Stream<float>;
template class OMP3Stream<double>;

40
OMP3Stream.h Normal file

@ -0,0 +1,40 @@
// Copyright (c) 2015-16 Tom Deakin, Simon McIntosh-Smith,
// University of Bristol HPC
//
// For full license terms please see the LICENSE file distributed with this
// source code
#pragma once
#include <iostream>
#include <stdexcept>
#include "Stream.h"
#define IMPLEMENTATION_STRING "Reference OpenMP"
template <class T>
class OMP3Stream : public Stream<T>
{
protected:
// Size of arrays
unsigned int array_size;
// Device side pointers
T *a;
T *b;
T *c;
public:
OMP3Stream(const unsigned int, T*, T*, T*);
~OMP3Stream();
virtual void copy() override;
virtual void add() override;
virtual void mul() override;
virtual void triad() override;
virtual void write_arrays(const std::vector<T>& a, const std::vector<T>& b, const std::vector<T>& c) override;
virtual void read_arrays(std::vector<T>& a, std::vector<T>& b, std::vector<T>& c) override;
};

140
OMP45Stream.cpp Normal file

@ -0,0 +1,140 @@
// Copyright (c) 2015-16 Tom Deakin, Simon McIntosh-Smith,
// University of Bristol HPC
//
// For full license terms please see the LICENSE file distributed with this
// source code
#include "OMP45Stream.h"
template <class T>
OMP45Stream<T>::OMP45Stream(const unsigned int ARRAY_SIZE, T *a, T *b, T *c, int device)
{
omp_set_default_device(device);
array_size = ARRAY_SIZE;
// Set up data region on device
this->a = a;
this->b = b;
this->c = c;
#pragma omp target enter data map(to: a[0:array_size], b[0:array_size], c[0:array_size])
{}
}
template <class T>
OMP45Stream<T>::~OMP45Stream()
{
// End data region on device
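// Local copies of the pointers and size, since OpenMP 4.5 map clauses cannot name this-> members directly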
unsigned int array_size = this->array_size;
T *a = this->a;
T *b = this->b;
T *c = this->c;
#pragma omp target exit data map(release: a[0:array_size], b[0:array_size], c[0:array_size])
{}
}
template <class T>
void OMP45Stream<T>::write_arrays(const std::vector<T>& h_a, const std::vector<T>& h_b, const std::vector<T>& h_c)
{
T *a = this->a;
T *b = this->b;
T *c = this->c;
#pragma omp target update to(a[0:array_size], b[0:array_size], c[0:array_size])
{}
}
template <class T>
void OMP45Stream<T>::read_arrays(std::vector<T>& h_a, std::vector<T>& h_b, std::vector<T>& h_c)
{
T *a = this->a;
T *b = this->b;
T *c = this->c;
#pragma omp target update from(a[0:array_size], b[0:array_size], c[0:array_size])
{}
}
template <class T>
void OMP45Stream<T>::copy()
{
unsigned int array_size = this->array_size;
T *a = this->a;
T *c = this->c;
#pragma omp target teams distribute parallel for simd map(to: a[0:array_size], c[0:array_size])
for (int i = 0; i < array_size; i++)
{
c[i] = a[i];
}
}
template <class T>
void OMP45Stream<T>::mul()
{
const T scalar = 0.3;
unsigned int array_size = this->array_size;
T *b = this->b;
T *c = this->c;
#pragma omp target teams distribute parallel for simd map(to: b[0:array_size], c[0:array_size])
for (int i = 0; i < array_size; i++)
{
b[i] = scalar * c[i];
}
}
template <class T>
void OMP45Stream<T>::add()
{
unsigned int array_size = this->array_size;
T *a = this->a;
T *b = this->b;
T *c = this->c;
#pragma omp target teams distribute parallel for simd map(to: a[0:array_size], b[0:array_size], c[0:array_size])
for (int i = 0; i < array_size; i++)
{
c[i] = a[i] + b[i];
}
}
template <class T>
void OMP45Stream<T>::triad()
{
const T scalar = 0.3;
unsigned int array_size = this->array_size;
T *a = this->a;
T *b = this->b;
T *c = this->c;
#pragma omp target teams distribute parallel for simd map(to: a[0:array_size], b[0:array_size], c[0:array_size])
for (int i = 0; i < array_size; i++)
{
a[i] = b[i] + scalar * c[i];
}
}
void listDevices(void)
{
// Get number of devices
int count = omp_get_num_devices();
// Print device list
if (count == 0)
{
std::cerr << "No devices found." << std::endl;
}
else
{
std::cout << "There are " << count << " devices." << std::endl;
}
}
std::string getDeviceName(const int)
{
return std::string("Device name unavailable");
}
std::string getDeviceDriver(const int)
{
return std::string("Device driver unavailable");
}
template class OMP45Stream<float>;
template class OMP45Stream<double>;

45
OMP45Stream.h Normal file

@ -0,0 +1,45 @@
// Copyright (c) 2015-16 Tom Deakin, Simon McIntosh-Smith,
// University of Bristol HPC
//
// For full license terms please see the LICENSE file distributed with this
// source code
#pragma once
#include <iostream>
#include <stdexcept>
#include "Stream.h"
#include <omp.h>
#define IMPLEMENTATION_STRING "OpenMP 4.5"
template <class T>
class OMP45Stream : public Stream<T>
{
protected:
// Size of arrays
unsigned int array_size;
// Device side pointers
T *a;
T *b;
T *c;
public:
OMP45Stream(const unsigned int, T*, T*, T*, int);
~OMP45Stream();
virtual void copy() override;
virtual void add() override;
virtual void mul() override;
virtual void triad() override;
virtual void write_arrays(const std::vector<T>& a, const std::vector<T>& b, const std::vector<T>& c) override;
virtual void read_arrays(std::vector<T>& a, std::vector<T>& b, std::vector<T>& c) override;
};

130
RAJAStream.cpp Normal file

@ -0,0 +1,130 @@
// Copyright (c) 2015-16 Tom Deakin, Simon McIntosh-Smith,
// University of Bristol HPC
//
// For full license terms please see the LICENSE file distributed with this
// source code
#include "RAJAStream.hpp"
using RAJA::forall;
using RAJA::RangeSegment;
template <class T>
RAJAStream<T>::RAJAStream(const unsigned int ARRAY_SIZE, const int device_index)
: array_size(ARRAY_SIZE)
{
RangeSegment seg(0, ARRAY_SIZE);
index_set.push_back(seg);
#ifdef RAJA_TARGET_CPU
d_a = new T[ARRAY_SIZE];
d_b = new T[ARRAY_SIZE];
d_c = new T[ARRAY_SIZE];
#else
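// CUDA managed allocations: the same pointers are valid on both host and device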
cudaMallocManaged((void**)&d_a, sizeof(T)*ARRAY_SIZE, cudaMemAttachGlobal);
cudaMallocManaged((void**)&d_b, sizeof(T)*ARRAY_SIZE, cudaMemAttachGlobal);
cudaMallocManaged((void**)&d_c, sizeof(T)*ARRAY_SIZE, cudaMemAttachGlobal);
cudaDeviceSynchronize();
#endif
}
template <class T>
RAJAStream<T>::~RAJAStream()
{
#ifdef RAJA_TARGET_CPU
delete[] d_a;
delete[] d_b;
delete[] d_c;
#else
cudaFree(d_a);
cudaFree(d_b);
cudaFree(d_c);
#endif
}
template <class T>
void RAJAStream<T>::write_arrays(
const std::vector<T>& a, const std::vector<T>& b, const std::vector<T>& c)
{
std::copy(a.begin(), a.end(), d_a);
std::copy(b.begin(), b.end(), d_b);
std::copy(c.begin(), c.end(), d_c);
}
template <class T>
void RAJAStream<T>::read_arrays(
std::vector<T>& a, std::vector<T>& b, std::vector<T>& c)
{
std::copy(d_a, d_a + array_size, a.data());
std::copy(d_b, d_b + array_size, b.data());
std::copy(d_c, d_c + array_size, c.data());
}
template <class T>
void RAJAStream<T>::copy()
{
T* a = d_a;
T* c = d_c;
forall<policy>(index_set, [=] RAJA_DEVICE (int index)
{
c[index] = a[index];
});
}
template <class T>
void RAJAStream<T>::mul()
{
T* b = d_b;
T* c = d_c;
const T scalar = 0.3;
forall<policy>(index_set, [=] RAJA_DEVICE (int index)
{
b[index] = scalar*c[index];
});
}
template <class T>
void RAJAStream<T>::add()
{
T* a = d_a;
T* b = d_b;
T* c = d_c;
forall<policy>(index_set, [=] RAJA_DEVICE (int index)
{
c[index] = a[index] + b[index];
});
}
template <class T>
void RAJAStream<T>::triad()
{
T* a = d_a;
T* b = d_b;
T* c = d_c;
const T scalar = 0.3;
forall<policy>(index_set, [=] RAJA_DEVICE (int index)
{
a[index] = b[index] + scalar*c[index];
});
}
void listDevices(void)
{
std::cout << "This is not the device you are looking for.";
}
std::string getDeviceName(const int device)
{
return "RAJA";
}
std::string getDeviceDriver(const int device)
{
return "RAJA";
}
template class RAJAStream<float>;
template class RAJAStream<double>;

58
RAJAStream.hpp Normal file

@ -0,0 +1,58 @@
// Copyright (c) 2015-16 Tom Deakin, Simon McIntosh-Smith,
// University of Bristol HPC
//
// For full license terms please see the LICENSE file distributed with this
// source code
#pragma once
#include <iostream>
#include <stdexcept>
#include "RAJA/RAJA.hxx"
#include "Stream.h"
#define IMPLEMENTATION_STRING "RAJA"
#ifdef RAJA_TARGET_CPU
typedef RAJA::IndexSet::ExecPolicy<
RAJA::seq_segit,
RAJA::omp_parallel_for_exec> policy;
#else
const size_t block_size = 128;
typedef RAJA::IndexSet::ExecPolicy<
RAJA::seq_segit,
RAJA::cuda_exec<block_size>> policy;
#endif
template <class T>
class RAJAStream : public Stream<T>
{
protected:
// Size of arrays
unsigned int array_size;
// Contains iteration space
RAJA::IndexSet index_set;
// Device side pointers to arrays
T* d_a;
T* d_b;
T* d_c;
public:
RAJAStream(const unsigned int, const int);
~RAJAStream();
virtual void copy() override;
virtual void add() override;
virtual void mul() override;
virtual void triad() override;
virtual void write_arrays(
const std::vector<T>& a, const std::vector<T>& b, const std::vector<T>& c) override;
virtual void read_arrays(
std::vector<T>& a, std::vector<T>& b, std::vector<T>& c) override;
};


@ -6,24 +6,26 @@ This benchmark is similar in spirit to, and based on, the STREAM benchmark [1] for CPUs.
Unlike other GPU memory bandwidth benchmarks this does *not* include the PCIe transfer time.
There are multiple implementations of this benchmark in a variety of programming models.
Currently implemented are:
- OpenCL
- CUDA
- OpenACC
- OpenMP 3 and 4.5
- Kokkos
- RAJA
- SYCL
Usage
-----
Build the OpenCL and CUDA binaries with `make` (CUDA version requires CUDA >= v6.5)
CMake 3.2 or above is required.
You will also need the drivers, compilers and software applicable to whichever implementations you would like to build; the build system is designed to build only those implementations that your system supports.
Run the OpenCL version with `./gpu-stream-ocl` and the CUDA version with `./gpu-stream-cuda`
Generate the Makefile with `cmake .`
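For example, an out-of-the-box build and run might look like:
cmake .
make
./gpu-stream-ocl
./gpu-stream-cuda
The optional RAJA and Kokkos builds are enabled at configure time, e.g. `cmake . -DRAJA_PATH=/path/to/raja -DKOKKOS_PATH=/path/to/kokkos` (paths are placeholders).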
For the HIP version, follow the instructions on the following blog to install the ROCK and ROCR drivers:
http://gpuopen.com/getting-started-with-boltzmann-components-platforms-installation/
Install the HCC compiler:
https://bitbucket.org/multicoreware/hcc/wiki/Home
Install HIP:
https://github.com/GPUOpen-ProfessionalCompute-Tools/HIP
Build the HIP binary with `make gpu-stream-hip` and run it with `./gpu-stream-hip`.
Android
-------
Android (outdated instructions)
------------------
Assuming you have a recent Android NDK available, you can use the
toolchain that it provides to build GPU-STREAM. You should first
@ -57,6 +59,7 @@ Run GPU-STREAM from an adb shell:
# Use float if device doesn't support double, and reduce array size
./gpu-stream-ocl --float -n 6 -s 10000000
Results
-------

231
SYCLStream.cpp Normal file

@ -0,0 +1,231 @@
// Copyright (c) 2015-16 Tom Deakin, Simon McIntosh-Smith,
// University of Bristol HPC
//
// For full license terms please see the LICENSE file distributed with this
// source code
#include "SYCLStream.h"
#include <iostream>
using namespace cl::sycl;
#define WGSIZE 64
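// Work-group size for every kernel; the array size must be a multiple of WGSIZE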
// Cache list of devices
bool cached = false;
std::vector<device> devices;
void getDeviceList(void);
template <class T>
SYCLStream<T>::SYCLStream(const unsigned int ARRAY_SIZE, const int device_index)
{
if (!cached)
getDeviceList();
// The array size must be divisible by WGSIZE
if (ARRAY_SIZE % WGSIZE != 0)
{
std::stringstream ss;
ss << "Array size must be a multiple of " << WGSIZE;
throw std::runtime_error(ss.str());
}
array_size = ARRAY_SIZE;
if (device_index >= devices.size())
throw std::runtime_error("Invalid device index");
device dev = devices[device_index];
// Print out device information
std::cout << "Using SYCL device " << getDeviceName(device_index) << std::endl;
std::cout << "Driver: " << getDeviceDriver(device_index) << std::endl;
queue = new cl::sycl::queue(dev);
// Create buffers
d_a = new buffer<T>(array_size);
d_b = new buffer<T>(array_size);
d_c = new buffer<T>(array_size);
}
template <class T>
SYCLStream<T>::~SYCLStream()
{
delete d_a;
delete d_b;
delete d_c;
delete queue;
}
template <class T>
void SYCLStream<T>::copy()
{
queue->submit([&](handler &cgh)
{
auto ka = d_a->template get_access<access::mode::read>(cgh);
auto kc = d_c->template get_access<access::mode::write>(cgh);
cgh.parallel_for<class copy>(nd_range<1>{array_size, WGSIZE}, [=](nd_item<1> item)
{
kc[item.get_global()] = ka[item.get_global()];
});
});
queue->wait();
}
template <class T>
void SYCLStream<T>::mul()
{
const T scalar = 0.3;
queue->submit([&](handler &cgh)
{
auto kb = d_b->template get_access<access::mode::write>(cgh);
auto kc = d_c->template get_access<access::mode::read>(cgh);
cgh.parallel_for<class mul>(nd_range<1>{array_size, WGSIZE}, [=](nd_item<1> item)
{
kb[item.get_global()] = scalar * kc[item.get_global()];
});
});
queue->wait();
}
template <class T>
void SYCLStream<T>::add()
{
queue->submit([&](handler &cgh)
{
auto ka = d_a->template get_access<access::mode::read>(cgh);
auto kb = d_b->template get_access<access::mode::read>(cgh);
auto kc = d_c->template get_access<access::mode::write>(cgh);
cgh.parallel_for<class add>(nd_range<1>{array_size, WGSIZE}, [=](nd_item<1> item)
{
kc[item.get_global()] = ka[item.get_global()] + kb[item.get_global()];
});
});
queue->wait();
}
template <class T>
void SYCLStream<T>::triad()
{
const T scalar = 0.3;
queue->submit([&](handler &cgh)
{
auto ka = d_a->template get_access<access::mode::write>(cgh);
auto kb = d_b->template get_access<access::mode::read>(cgh);
auto kc = d_c->template get_access<access::mode::read>(cgh);
cgh.parallel_for<class triad>(nd_range<1>{array_size, WGSIZE}, [=](nd_item<1> item)
{
ka[item.get_global()] = kb[item.get_global()] + scalar * kc[item.get_global()];
});
});
queue->wait();
}
template <class T>
void SYCLStream<T>::write_arrays(const std::vector<T>& a, const std::vector<T>& b, const std::vector<T>& c)
{
auto _a = d_a->template get_access<access::mode::write, access::target::host_buffer>();
auto _b = d_b->template get_access<access::mode::write, access::target::host_buffer>();
auto _c = d_c->template get_access<access::mode::write, access::target::host_buffer>();
for (int i = 0; i < array_size; i++)
{
_a[i] = a[i];
_b[i] = b[i];
_c[i] = c[i];
}
}
template <class T>
void SYCLStream<T>::read_arrays(std::vector<T>& a, std::vector<T>& b, std::vector<T>& c)
{
auto _a = d_a->template get_access<access::mode::read, access::target::host_buffer>();
auto _b = d_b->template get_access<access::mode::read, access::target::host_buffer>();
auto _c = d_c->template get_access<access::mode::read, access::target::host_buffer>();
for (int i = 0; i < array_size; i++)
{
a[i] = _a[i];
b[i] = _b[i];
c[i] = _c[i];
}
}
void getDeviceList(void)
{
// Get list of platforms
std::vector<platform> platforms = platform::get_platforms();
// Enumerate devices
for (unsigned i = 0; i < platforms.size(); i++)
{
std::vector<device> plat_devices = platforms[i].get_devices();
devices.insert(devices.end(), plat_devices.begin(), plat_devices.end());
}
cached = true;
}
void listDevices(void)
{
getDeviceList();
// Print device names
if (devices.size() == 0)
{
std::cerr << "No devices found." << std::endl;
}
else
{
std::cout << std::endl;
std::cout << "Devices:" << std::endl;
for (int i = 0; i < devices.size(); i++)
{
std::cout << i << ": " << getDeviceName(i) << std::endl;
}
std::cout << std::endl;
}
}
std::string getDeviceName(const int device)
{
if (!cached)
getDeviceList();
std::string name;
if (device < devices.size())
{
name = devices[device].get_info<info::device::name>();
}
else
{
throw std::runtime_error("Error asking for name for non-existant device");
}
return name;
}
std::string getDeviceDriver(const int device)
{
if (!cached)
getDeviceList();
std::string driver;
if (device < devices.size())
{
driver = devices[device].get_info<info::device::driver_version>();
}
else
{
throw std::runtime_error("Error asking for driver for non-existant device");
}
return driver;
}
// TODO: Fix kernel names to allow multiple template specializations
//template class SYCLStream<float>;
template class SYCLStream<double>;
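// A sketch of one fix for the TODO above (assuming the compiler accepts class
// templates as kernel names, as SYCL 1.2.1 permits): make the kernel name a
// template of T, so each SYCLStream<T> instantiation gets a distinct name.
template <class T> class copy_kernel;
template <class T>
void SYCLStream<T>::copy()
{
  queue->submit([&](handler &cgh)
  {
    auto ka = d_a->template get_access<access::mode::read>(cgh);
    auto kc = d_c->template get_access<access::mode::write>(cgh);
    // copy_kernel<T> keeps the float and double kernels distinct
    cgh.parallel_for<copy_kernel<T>>(nd_range<1>{array_size, WGSIZE}, [=](nd_item<1> item)
    {
      kc[item.get_global()] = ka[item.get_global()];
    });
  });
  queue->wait();
}
// With kernel names templated this way, SYCLStream<float> and
// SYCLStream<double> could be instantiated side by side.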

47
SYCLStream.h Normal file

@ -0,0 +1,47 @@
// Copyright (c) 2015-16 Tom Deakin, Simon McIntosh-Smith,
// University of Bristol HPC
//
// For full license terms please see the LICENSE file distributed with this
// source code
#pragma once
#include <sstream>
#include "Stream.h"
#include "CL/sycl.hpp"
#define IMPLEMENTATION_STRING "SYCL"
template <class T>
class SYCLStream : public Stream<T>
{
protected:
// Size of arrays
unsigned int array_size;
// SYCL objects
cl::sycl::queue *queue;
cl::sycl::buffer<T> *d_a;
cl::sycl::buffer<T> *d_b;
cl::sycl::buffer<T> *d_c;
public:
SYCLStream(const unsigned int, const int);
~SYCLStream();
virtual void copy() override;
virtual void add() override;
virtual void mul() override;
virtual void triad() override;
virtual void write_arrays(const std::vector<T>& a, const std::vector<T>& b, const std::vector<T>& c) override;
virtual void read_arrays(std::vector<T>& a, std::vector<T>& b, std::vector<T>& c) override;
};
// Populate the devices list
void getDeviceList(void);

37
Stream.h Normal file

@ -0,0 +1,37 @@
// Copyright (c) 2015-16 Tom Deakin, Simon McIntosh-Smith,
// University of Bristol HPC
//
// For full license terms please see the LICENSE file distributed with this
// source code
#pragma once
#include <vector>
#include <string>
template <class T>
class Stream
{
public:
virtual ~Stream(){}
// Kernels
// These must be blocking calls
virtual void copy() = 0;
virtual void mul() = 0;
virtual void add() = 0;
virtual void triad() = 0;
// Copy memory between host and device
virtual void write_arrays(const std::vector<T>& a, const std::vector<T>& b, const std::vector<T>& c) = 0;
virtual void read_arrays(std::vector<T>& a, std::vector<T>& b, std::vector<T>& c) = 0;
};
// Implementation specific device functions
void listDevices(void);
std::string getDeviceName(const int);
std::string getDeviceDriver(const int);
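// To illustrate the contract, a minimal hypothetical host-only backend
// (SerialStream is an invented name for this sketch, not part of the tree):
#include <algorithm>
template <class T>
class SerialStream : public Stream<T>
{
  std::vector<T> a, b, c;
public:
  SerialStream(const unsigned int n) : a(n), b(n), c(n) {}
  virtual void copy() override { std::copy(a.begin(), a.end(), c.begin()); }
  virtual void mul() override
  { const T s = 0.3; for (size_t i = 0; i < b.size(); i++) b[i] = s * c[i]; }
  virtual void add() override
  { for (size_t i = 0; i < c.size(); i++) c[i] = a[i] + b[i]; }
  virtual void triad() override
  { const T s = 0.3; for (size_t i = 0; i < a.size(); i++) a[i] = b[i] + s * c[i]; }
  virtual void write_arrays(const std::vector<T>& ha, const std::vector<T>& hb, const std::vector<T>& hc) override
  { a = ha; b = hb; c = hc; }
  virtual void read_arrays(std::vector<T>& ha, std::vector<T>& hb, std::vector<T>& hc) override
  { ha = a; hb = b; hc = c; }
};
// A real backend would also provide listDevices, getDeviceName and
// getDeviceDriver, as each implementation file in this commit does.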


@ -1,165 +0,0 @@
/*=============================================================================
*------------------------------------------------------------------------------
* Copyright 2015: Tom Deakin, Simon McIntosh-Smith, University of Bristol HPC
* Based on John D. McCalpin's original STREAM benchmark for CPUs
*------------------------------------------------------------------------------
* License:
* 1. You are free to use this program and/or to redistribute
* this program.
* 2. You are free to modify this program for your own use,
* including commercial use, subject to the publication
* restrictions in item 3.
* 3. You are free to publish results obtained from running this
* program, or from works that you derive from this program,
* with the following limitations:
* 3a. In order to be referred to as "GPU-STREAM benchmark results",
* published results must be in conformance to the GPU-STREAM
* Run Rules published at
* http://github.com/UoB-HPC/GPU-STREAM/wiki/Run-Rules
* and incorporated herein by reference.
* The copyright holders retain the
* right to determine conformity with the Run Rules.
* 3b. Results based on modified source code or on runs not in
* accordance with the GPU-STREAM Run Rules must be clearly
* labelled whenever they are published. Examples of
* proper labelling include:
* "tuned GPU-STREAM benchmark results"
* "based on a variant of the GPU-STREAM benchmark code"
* Other comparable, clear and reasonable labelling is
* acceptable.
* 3c. Submission of results to the GPU-STREAM benchmark web site
* is encouraged, but not required.
* 4. Use of this program or creation of derived works based on this
* program constitutes acceptance of these licensing restrictions.
* 5. Absolutely no warranty is expressed or implied.
*-----------------------------------------*/
#include "common.h"
// Default array size 50 * 2^20 elements (50*8 = 400 Mebibytes in double precision)
// Use binary powers of two so the size divides evenly by 1024
unsigned int ARRAY_SIZE = 52428800;
size_t ARRAY_PAD_BYTES = 0;
unsigned int NTIMES = 10;
bool useFloat = false;
unsigned int groups = 0;
unsigned int groupSize = 1024;
unsigned int deviceIndex = 0;
int parseUInt(const char *str, unsigned int *output)
{
char *next;
*output = strtoul(str, &next, 10);
return !strlen(next);
}
int parseSize(const char *str, size_t *output)
{
char *next;
*output = strtoull(str, &next, 0);
int l = strlen(str);
if (l) {
char c = str[l-1]; // last char.
if ((c == 'k') || (c == 'K')) {
*output *= 1024;
}
if ((c == 'm') || (c == 'M')) {
*output *= (1024*1024);
}
}
// Accept an empty remainder or a single recognised K/M suffix
return !strlen(next) ||
       (strlen(next) == 1 && (*next == 'k' || *next == 'K' ||
                              *next == 'm' || *next == 'M'));
}
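// Usage sketch (hypothetical values): a trailing K or M scales the result, so
//   size_t bytes;
//   parseSize("64K", &bytes);   // bytes == 65536
//   parseSize("2M",  &bytes);   // bytes == 2097152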
void parseArguments(int argc, char *argv[])
{
for (int i = 1; i < argc; i++)
{
if (!strcmp(argv[i], "--list"))
{
listDevices();
exit(0);
}
else if (!strcmp(argv[i], "--device"))
{
if (++i >= argc || !parseUInt(argv[i], &deviceIndex))
{
std::cout << "Invalid device index" << std::endl;
exit(1);
}
}
else if (!strcmp(argv[i], "--arraysize") || !strcmp(argv[i], "-s"))
{
if (++i >= argc || !parseUInt(argv[i], &ARRAY_SIZE))
{
std::cout << "Invalid array size" << std::endl;
exit(1);
}
}
else if (!strcmp(argv[i], "--numtimes") || !strcmp(argv[i], "-n"))
{
if (++i >= argc || !parseUInt(argv[i], &NTIMES))
{
std::cout << "Invalid number of times" << std::endl;
exit(1);
}
}
else if (!strcmp(argv[i], "--groups"))
{
if (++i >= argc || !parseUInt(argv[i], &groups))
{
std::cout << "Invalid group number" << std::endl;
exit(1);
}
}
else if (!strcmp(argv[i], "--groupSize"))
{
if (++i >= argc || !parseUInt(argv[i], &groupSize))
{
std::cout << "Invalid group size" << std::endl;
exit(1);
}
}
else if (!strcmp(argv[i], "--pad"))
{
if (++i >= argc || !parseSize(argv[i], &ARRAY_PAD_BYTES))
{
std::cout << "Invalid size" << std::endl;
exit(1);
}
}
else if (!strcmp(argv[i], "--float"))
{
useFloat = true;
std::cout << "Warning: If number of iterations set >= 8, expect rounding errors with single precision, not apply to AMD device" << std::endl;
}
else if (!strcmp(argv[i], "--help") || !strcmp(argv[i], "-h"))
{
std::cout << std::endl;
std::cout << "Usage: ./gpu-stream-cuda [OPTIONS]" << std::endl << std::endl;
std::cout << "Options:" << std::endl;
std::cout << " -h --help Print the message" << std::endl;
std::cout << " --list List available devices" << std::endl;
std::cout << " --device INDEX Select device at INDEX" << std::endl;
std::cout << " -s --arraysize SIZE Use SIZE elements in the array" << std::endl;
std::cout << " -n --numtimes NUM Run the test NUM times (NUM >= 2)" << std::endl;
std::cout << " --groups Set number of groups to launch - each work-item proceses multiple array items" << std::endl;
std::cout << " --groupSize Set size of each group (default 1024)" << std::endl;
std::cout << " --pad Add additional array padding. Can use trailing K (KB) or M (MB)" << std::endl;
std::cout << " --float Use floats (rather than doubles)" << std::endl;
std::cout << std::endl;
exit(0);
}
else
{
std::cout << "Unrecognized argument '" << argv[i] << "' (try '--help')"
<< std::endl;
exit(1);
}
}
}

115
common.h

@ -1,115 +0,0 @@
/*=============================================================================
*------------------------------------------------------------------------------
* Copyright 2015: Tom Deakin, Simon McIntosh-Smith, University of Bristol HPC
* Based on John D. McCalpin's original STREAM benchmark for CPUs
*------------------------------------------------------------------------------
* License:
* 1. You are free to use this program and/or to redistribute
* this program.
* 2. You are free to modify this program for your own use,
* including commercial use, subject to the publication
* restrictions in item 3.
* 3. You are free to publish results obtained from running this
* program, or from works that you derive from this program,
* with the following limitations:
* 3a. In order to be referred to as "GPU-STREAM benchmark results",
* published results must be in conformance to the GPU-STREAM
* Run Rules published at
* http://github.com/UoB-HPC/GPU-STREAM/wiki/Run-Rules
* and incorporated herein by reference.
* The copyright holders retain the
* right to determine conformity with the Run Rules.
* 3b. Results based on modified source code or on runs not in
* accordance with the GPU-STREAM Run Rules must be clearly
* labelled whenever they are published. Examples of
* proper labelling include:
* "tuned GPU-STREAM benchmark results"
* "based on a variant of the GPU-STREAM benchmark code"
* Other comparable, clear and reasonable labelling is
* acceptable.
* 3c. Submission of results to the GPU-STREAM benchmark web site
* is encouraged, but not required.
* 4. Use of this program or creation of derived works based on this
* program constitutes acceptance of these licensing restrictions.
* 5. Absolutely no warranty is expressed or implied.
*-----------------------------------------*/
#include <iomanip>
#include <iostream>
#include <cstdlib>
#include <cstring>
#include <limits>
#include <stdexcept>
#define VERSION_STRING "1.0"
extern void parseArguments(int argc, char *argv[]);
extern void listDevices(void);
extern unsigned int ARRAY_SIZE;
extern size_t ARRAY_PAD_BYTES;
extern unsigned int NTIMES;
extern unsigned int groups;
extern unsigned int groupSize;
extern bool useFloat;
extern unsigned int deviceIndex;
template < typename T >
void check_solution(void* a_in, void* b_in, void* c_in)
{
// Generate correct solution
T golda = 1.0;
T goldb = 2.0;
T goldc = 0.0;
T * a = static_cast<T*>(a_in);
T * b = static_cast<T*>(b_in);
T * c = static_cast<T*>(c_in);
const T scalar = 3.0;
for (unsigned int i = 0; i < NTIMES; i++)
{
// Double
goldc = golda;
goldb = scalar * goldc;
goldc = golda + goldb;
golda = goldb + scalar * goldc;
}
// Calculate average error
double erra = 0.0;
double errb = 0.0;
double errc = 0.0;
for (unsigned int i = 0; i < ARRAY_SIZE; i++)
{
erra += fabs(a[i] - golda);
errb += fabs(b[i] - goldb);
errc += fabs(c[i] - goldc);
}
erra /= ARRAY_SIZE;
errb /= ARRAY_SIZE;
errc /= ARRAY_SIZE;
double epsi = std::numeric_limits<T>::epsilon() * 100;
if (erra > epsi)
std::cout
<< "Validation failed on a[]. Average error " << erra
<< std::endl;
if (errb > epsi)
std::cout
<< "Validation failed on b[]. Average error " << errb
<< std::endl;
if (errc > epsi)
std::cout
<< "Validation failed on c[]. Average error " << errc
<< std::endl;
}
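// Worked example of the recurrence: starting from a = 1, b = 2, c = 0 with
// scalar = 3, the first iteration yields c = 1, b = 3, c = 1 + 3 = 4 and
// a = 3 + 3*4 = 15; after NTIMES iterations every element must match the
// corresponding gold value to within 100 machine epsilons.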

9
common.h.in Normal file

@ -0,0 +1,9 @@
// Copyright (c) 2015-16 Tom Deakin, Simon McIntosh-Smith,
// University of Bristol HPC
//
// For full license terms please see the LICENSE file distributed with this
// source code
#define VERSION_STRING "@gpu-stream_VERSION_MAJOR@.@gpu-stream_VERSION_MINOR@"


@ -1,507 +0,0 @@
/*=============================================================================
*------------------------------------------------------------------------------
* Copyright 2015: Tom Deakin, Simon McIntosh-Smith, University of Bristol HPC
* Based on John D. McCalpin's original STREAM benchmark for CPUs
*------------------------------------------------------------------------------
* License:
* 1. You are free to use this program and/or to redistribute
* this program.
* 2. You are free to modify this program for your own use,
* including commercial use, subject to the publication
* restrictions in item 3.
* 3. You are free to publish results obtained from running this
* program, or from works that you derive from this program,
* with the following limitations:
* 3a. In order to be referred to as "GPU-STREAM benchmark results",
* published results must be in conformance to the GPU-STREAM
* Run Rules published at
* http://github.com/UoB-HPC/GPU-STREAM/wiki/Run-Rules
* and incorporated herein by reference.
* The copyright holders retain the
* right to determine conformity with the Run Rules.
* 3b. Results based on modified source code or on runs not in
* accordance with the GPU-STREAM Run Rules must be clearly
* labelled whenever they are published. Examples of
* proper labelling include:
* "tuned GPU-STREAM benchmark results"
* "based on a variant of the GPU-STREAM benchmark code"
* Other comparable, clear and reasonable labelling is
* acceptable.
* 3c. Submission of results to the GPU-STREAM benchmark web site
* is encouraged, but not required.
* 4. Use of this program or creation of derived works based on this
* program constitutes acceptance of these licensing restrictions.
* 5. Absolutely no warranty is expressed or implied.
*-----------------------------------------*/
#include <iostream>
#include <fstream>
#include <vector>
#include <chrono>
#include <cfloat>
#include <cmath>
#include <cuda.h>
#include "common.h"
std::string getDeviceName(int device);
int getDriver(void);
// Code to check CUDA errors
void check_cuda_error(void)
{
cudaError_t err = cudaGetLastError();
if (err != cudaSuccess)
{
std::cerr
<< "Error: "
<< cudaGetErrorString(err)
<< std::endl;
exit(err);
}
}
// Looper functions place more work inside each work-item.
// The goal is to reduce the per-group dispatch overhead and to give more control over the order of memory operations.
template <typename T>
__global__ void
copy_looper(const T * a, T * c, int ARRAY_SIZE)
{
int offset = (blockDim.x * blockIdx.x + threadIdx.x);
int stride = blockDim.x * gridDim.x;
for (int i=offset; i<ARRAY_SIZE; i+=stride) {
c[i] = a[i];
}
}
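// Example (hypothetical launch): with 80 blocks of 1024 threads the stride is
// 81920, so thread 0 processes elements 0, 81920, 163840, ... and the loop
// covers any ARRAY_SIZE without needing one thread per element.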
template <typename T>
__global__ void
mul_looper(T * b, const T * c, int ARRAY_SIZE)
{
int offset = blockDim.x * blockIdx.x + threadIdx.x;
int stride = blockDim.x * gridDim.x;
const T scalar = 3.0;
for (int i=offset; i<ARRAY_SIZE; i+=stride) {
b[i] = scalar * c[i];
}
}
template <typename T>
__global__ void
add_looper(const T * a, const T * b, T * c, int ARRAY_SIZE)
{
int offset = blockDim.x * blockIdx.x + threadIdx.x;
int stride = blockDim.x * gridDim.x;
for (int i=offset; i<ARRAY_SIZE; i+=stride) {
c[i] = a[i] + b[i];
}
}
template <typename T>
__global__ void
triad_looper( T * a, const T * b, const T * c, int ARRAY_SIZE)
{
int offset = blockDim.x * blockIdx.x + threadIdx.x;
int stride = blockDim.x * gridDim.x;
const T scalar = 3.0;
for (int i=offset; i<ARRAY_SIZE; i+=stride) {
a[i] = b[i] + scalar * c[i];
}
}
template <typename T>
__global__ void copy(const T * a, T * c)
{
const int i = blockDim.x * blockIdx.x + threadIdx.x;
c[i] = a[i];
}
template <typename T>
__global__ void mul(T * b, const T * c)
{
const T scalar = 3.0;
const int i = blockDim.x * blockIdx.x + threadIdx.x;
b[i] = scalar * c[i];
}
template <typename T>
__global__ void add(const T * a, const T * b, T * c)
{
const int i = blockDim.x * blockIdx.x + threadIdx.x;
c[i] = a[i] + b[i];
}
template <typename T>
__global__ void triad(T * a, const T * b, const T * c)
{
const T scalar = 3.0;
const int i = blockDim.x * blockIdx.x + threadIdx.x;
a[i] = b[i] + scalar * c[i];
}
int main(int argc, char *argv[])
{
// Print out run information
std::cout
<< "GPU-STREAM" << std::endl
<< "Version: " << VERSION_STRING << std::endl
<< "Implementation: CUDA" << std::endl;
parseArguments(argc, argv);
if (NTIMES < 2)
throw std::runtime_error("Chosen number of times is invalid, must be >= 2");
// Config grid size and group size for kernel launching
int gridSize;
if (groups) {
gridSize = groups * groupSize;
} else {
gridSize = ARRAY_SIZE;
}
float operationsPerWorkitem = (float)ARRAY_SIZE / (float)gridSize;
std::cout << "GridSize: " << gridSize << " work-items" << std::endl;
std::cout << "GroupSize: " << groupSize << " work-items" << std::endl;
std::cout << "Operations/Work-item: " << operationsPerWorkitem << std::endl;
if (groups) std::cout << "Using looper kernels:" << std::endl;
std::cout << "Precision: ";
if (useFloat) std::cout << "float";
else std::cout << "double";
std::cout << std::endl << std::endl;
std::cout << "Running kernels " << NTIMES << " times" << std::endl;
if (ARRAY_SIZE % 1024 != 0)
{
unsigned int OLD_ARRAY_SIZE = ARRAY_SIZE;
ARRAY_SIZE -= ARRAY_SIZE % 1024;
std::cout
<< "Warning: array size must divide 1024" << std::endl
<< "Resizing array from " << OLD_ARRAY_SIZE
<< " to " << ARRAY_SIZE << std::endl;
if (ARRAY_SIZE == 0)
throw std::runtime_error("Array size must be >= 1024");
}
// Get precision (used to reset later)
std::streamsize ss = std::cout.precision();
size_t DATATYPE_SIZE;
if (useFloat)
{
DATATYPE_SIZE = sizeof(float);
}
else
{
DATATYPE_SIZE = sizeof(double);
}
// Display number of bytes in array
std::cout << std::setprecision(1) << std::fixed
<< "Array size: " << ARRAY_SIZE*DATATYPE_SIZE/1024.0/1024.0 << " MB"
<< " (=" << ARRAY_SIZE*DATATYPE_SIZE/1024.0/1024.0/1024.0 << " GB)"
<< std::endl;
std::cout << "Total size: " << 3.0*ARRAY_SIZE*DATATYPE_SIZE/1024.0/1024.0 << " MB"
<< " (=" << 3.0*ARRAY_SIZE*DATATYPE_SIZE/1024.0/1024.0/1024.0 << " GB)"
<< std::endl;
// Reset precision
std::cout.precision(ss);
// Check device index is in range
int count;
cudaGetDeviceCount(&count);
check_cuda_error();
if (deviceIndex >= count)
throw std::runtime_error("Chosen device index is invalid");
cudaSetDevice(deviceIndex);
check_cuda_error();
// Print out device name
std::cout << "Using CUDA device " << getDeviceName(deviceIndex) << std::endl;
// Print out device CUDA driver version
std::cout << "Driver: " << getDriver() << std::endl;
// Check buffers fit on the device
cudaDeviceProp props;
cudaGetDeviceProperties(&props, deviceIndex);
if (props.totalGlobalMem < 3*DATATYPE_SIZE*ARRAY_SIZE)
throw std::runtime_error("Device does not have enough memory for all 3 buffers");
// Create host vectors
void * h_a = malloc(ARRAY_SIZE*DATATYPE_SIZE);
void * h_b = malloc(ARRAY_SIZE*DATATYPE_SIZE);
void * h_c = malloc(ARRAY_SIZE*DATATYPE_SIZE);
// Initialise arrays
for (unsigned int i = 0; i < ARRAY_SIZE; i++)
{
if (useFloat)
{
((float*)h_a)[i] = 1.0f;
((float*)h_b)[i] = 2.0f;
((float*)h_c)[i] = 0.0f;
}
else
{
((double*)h_a)[i] = 1.0;
((double*)h_b)[i] = 2.0;
((double*)h_c)[i] = 0.0;
}
}
// Create device buffers
void * d_a, * d_b, *d_c;
cudaMalloc(&d_a, ARRAY_SIZE*DATATYPE_SIZE);
check_cuda_error();
cudaMalloc(&d_b, ARRAY_SIZE*DATATYPE_SIZE);
check_cuda_error();
cudaMalloc(&d_c, ARRAY_SIZE*DATATYPE_SIZE);
check_cuda_error();
// Copy host memory to device
cudaMemcpy(d_a, h_a, ARRAY_SIZE*DATATYPE_SIZE, cudaMemcpyHostToDevice);
check_cuda_error();
cudaMemcpy(d_b, h_b, ARRAY_SIZE*DATATYPE_SIZE, cudaMemcpyHostToDevice);
check_cuda_error();
cudaMemcpy(d_c, h_c, ARRAY_SIZE*DATATYPE_SIZE, cudaMemcpyHostToDevice);
check_cuda_error();
std::cout << "d_a=" << (void*)d_a << std::endl;
std::cout << "d_b=" << (void*)d_b << std::endl;
std::cout << "d_c=" << (void*)d_c << std::endl;
// Make sure the copies are finished
cudaDeviceSynchronize();
check_cuda_error();
// List of times
std::vector< std::vector<double> > timings;
// Declare timers
std::chrono::high_resolution_clock::time_point t1, t2;
// Main loop
for (unsigned int k = 0; k < NTIMES; k++)
{
std::vector<double> times;
t1 = std::chrono::high_resolution_clock::now();
if (groups) {
if (useFloat)
copy_looper<float><<<gridSize,groupSize>>>((float*)d_a, (float*)d_c, ARRAY_SIZE);
else
copy_looper<double><<<gridSize,groupSize>>>((double*)d_a, (double*)d_c, ARRAY_SIZE);
} else {
if (useFloat)
copy<<<ARRAY_SIZE/1024, 1024>>>((float*)d_a, (float*)d_c);
else
copy<<<ARRAY_SIZE/1024, 1024>>>((double*)d_a, (double*)d_c);
}
check_cuda_error();
cudaDeviceSynchronize();
check_cuda_error();
t2 = std::chrono::high_resolution_clock::now();
times.push_back(std::chrono::duration_cast<std::chrono::duration<double> >(t2 - t1).count());
t1 = std::chrono::high_resolution_clock::now();
if (groups) {
if (useFloat)
mul_looper<float><<<gridSize,groupSize>>>((float*)d_b, (float*)d_c, ARRAY_SIZE);
else
mul_looper<double><<<gridSize,groupSize>>>((double*)d_b, (double*)d_c, ARRAY_SIZE);
} else {
if (useFloat)
mul<<<ARRAY_SIZE/1024, 1024>>>((float*)d_b, (float*)d_c);
else
mul<<<ARRAY_SIZE/1024, 1024>>>((double*)d_b, (double*)d_c);
}
check_cuda_error();
cudaDeviceSynchronize();
check_cuda_error();
t2 = std::chrono::high_resolution_clock::now();
times.push_back(std::chrono::duration_cast<std::chrono::duration<double> >(t2 - t1).count());
t1 = std::chrono::high_resolution_clock::now();
if (groups) {
if (useFloat)
add_looper<float><<<gridSize,groupSize>>>((float*)d_a, (float*)d_b, (float*)d_c, ARRAY_SIZE);
else
add_looper<double><<<gridSize,groupSize>>>((double*)d_a, (double*)d_b, (double*)d_c, ARRAY_SIZE);
} else {
if (useFloat)
add<<<ARRAY_SIZE/1024, 1024>>>((float*)d_a, (float*)d_b, (float*)d_c);
else
add<<<ARRAY_SIZE/1024, 1024>>>((double*)d_a, (double*)d_b, (double*)d_c);
}
check_cuda_error();
cudaDeviceSynchronize();
check_cuda_error();
t2 = std::chrono::high_resolution_clock::now();
times.push_back(std::chrono::duration_cast<std::chrono::duration<double> >(t2 - t1).count());
t1 = std::chrono::high_resolution_clock::now();
if (groups) {
if (useFloat)
triad_looper<float><<<gridSize,groupSize>>>((float*)d_a, (float*)d_b, (float*)d_c, ARRAY_SIZE);
else
triad_looper<double><<<gridSize,groupSize>>>((double*)d_a, (double*)d_b, (double*)d_c, ARRAY_SIZE);
} else {
if (useFloat)
triad<<<ARRAY_SIZE/1024, 1024>>>((float*)d_a, (float*)d_b, (float*)d_c);
else
triad<<<ARRAY_SIZE/1024, 1024>>>((double*)d_a, (double*)d_b, (double*)d_c);
}
check_cuda_error();
cudaDeviceSynchronize();
check_cuda_error();
t2 = std::chrono::high_resolution_clock::now();
times.push_back(std::chrono::duration_cast<std::chrono::duration<double> >(t2 - t1).count());
timings.push_back(times);
}
// Check solutions
cudaMemcpy(h_a, d_a, ARRAY_SIZE*DATATYPE_SIZE, cudaMemcpyDeviceToHost);
check_cuda_error();
cudaMemcpy(h_b, d_b, ARRAY_SIZE*DATATYPE_SIZE, cudaMemcpyDeviceToHost);
check_cuda_error();
cudaMemcpy(h_c, d_c, ARRAY_SIZE*DATATYPE_SIZE, cudaMemcpyDeviceToHost);
check_cuda_error();
if (useFloat)
{
check_solution<float>(h_a, h_b, h_c);
}
else
{
check_solution<double>(h_a, h_b, h_c);
}
// Crunch results
size_t sizes[4] = {
2 * DATATYPE_SIZE * ARRAY_SIZE,
2 * DATATYPE_SIZE * ARRAY_SIZE,
3 * DATATYPE_SIZE * ARRAY_SIZE,
3 * DATATYPE_SIZE * ARRAY_SIZE
};
double min[4] = {DBL_MAX, DBL_MAX, DBL_MAX, DBL_MAX};
double max[4] = {0.0, 0.0, 0.0, 0.0};
double avg[4] = {0.0, 0.0, 0.0, 0.0};
// Ignore first result
for (unsigned int i = 1; i < NTIMES; i++)
{
for (int j = 0; j < 4; j++)
{
avg[j] += timings[i][j];
min[j] = std::min(min[j], timings[i][j]);
max[j] = std::max(max[j], timings[i][j]);
}
}
for (int j = 0; j < 4; j++)
avg[j] /= (double)(NTIMES-1);
double geomean = 1.0;
for (int j = 0; j < 4; j++) {
geomean *= (sizes[j]/min[j]);
}
geomean = pow(geomean, 0.25);
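// Equivalently, geomean is the geometric mean of the four best bandwidths:
//   geomean = (BW_copy * BW_mul * BW_add * BW_triad)^(1/4), with BW_j = sizes[j]/min[j]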
// Display results
std::string labels[] = {"Copy", "Mul", "Add", "Triad"};
std::cout
<< std::left << std::setw(12) << "Function"
<< std::left << std::setw(12) << "MBytes/sec"
<< std::left << std::setw(12) << "Min (sec)"
<< std::left << std::setw(12) << "Max"
<< std::left << std::setw(12) << "Average"
<< std::endl;
for (int j = 0; j < 4; j++)
{
std::cout
<< std::left << std::setw(12) << labels[j]
<< std::left << std::setw(12) << std::setprecision(3) << 1.0E-06 * sizes[j]/min[j]
<< std::left << std::setw(12) << std::setprecision(5) << min[j]
<< std::left << std::setw(12) << std::setprecision(5) << max[j]
<< std::left << std::setw(12) << std::setprecision(5) << avg[j]
<< std::endl;
}
std::cout
<< std::left << std::setw(12) << "GEOMEAN"
<< std::left << std::setw(12) << std::setprecision(3) << 1.0E-06 * geomean
<< std::endl;
// Free host vectors
free(h_a);
free(h_b);
free(h_c);
// Free cuda buffers
cudaFree(d_a);
check_cuda_error();
cudaFree(d_b);
check_cuda_error();
cudaFree(d_c);
check_cuda_error();
}
std::string getDeviceName(int device)
{
struct cudaDeviceProp prop;
cudaGetDeviceProperties(&prop, device);
check_cuda_error();
return std::string(prop.name);
}
int getDriver(void)
{
int driver;
cudaDriverGetVersion(&driver);
check_cuda_error();
return driver;
}
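// Note: CUDA encodes the version as 1000*major + 10*minor, so a hypothetical
// helper could print it readably:
//   int major = driver / 1000, minor = (driver % 1000) / 10;  // 8000 -> 8.0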
void listDevices(void)
{
// Get number of devices
int count;
cudaGetDeviceCount(&count);
check_cuda_error();
// Print device names
if (count == 0)
{
std::cout << "No devices found." << std::endl;
}
else
{
std::cout << std::endl;
std::cout << "Devices:" << std::endl;
for (int i = 0; i < count; i++)
{
std::cout << i << ": " << getDeviceName(i) << std::endl;
check_cuda_error();
}
std::cout << std::endl;
}
}


@ -1,531 +0,0 @@
#include "hip_runtime.h"
/*=============================================================================
*------------------------------------------------------------------------------
* Copyright 2015: Tom Deakin, Simon McIntosh-Smith, University of Bristol HPC
* Based on John D. McCalpin's original STREAM benchmark for CPUs
*------------------------------------------------------------------------------
* License:
* 1. You are free to use this program and/or to redistribute
* this program.
* 2. You are free to modify this program for your own use,
* including commercial use, subject to the publication
* restrictions in item 3.
* 3. You are free to publish results obtained from running this
* program, or from works that you derive from this program,
* with the following limitations:
* 3a. In order to be referred to as "GPU-STREAM benchmark results",
* published results must be in conformance to the GPU-STREAM
* Run Rules published at
* http://github.com/UoB-HPC/GPU-STREAM/wiki/Run-Rules
* and incorporated herein by reference.
* The copyright holders retain the
* right to determine conformity with the Run Rules.
* 3b. Results based on modified source code or on runs not in
* accordance with the GPU-STREAM Run Rules must be clearly
* labelled whenever they are published. Examples of
* proper labelling include:
* "tuned GPU-STREAM benchmark results"
* "based on a variant of the GPU-STREAM benchmark code"
* Other comparable, clear and reasonable labelling is
* acceptable.
* 3c. Submission of results to the GPU-STREAM benchmark web site
* is encouraged, but not required.
* 4. Use of this program or creation of derived works based on this
* program constitutes acceptance of these licensing restrictions.
* 5. Absolutely no warranty is expressed or implied.
*-----------------------------------------*/
#include <iostream>
#include <fstream>
#include <vector>
#include <chrono>
#include <cfloat>
#include <cmath>
//#include <cuda.h>
#include "common.h"
std::string getDeviceName(int device);
int getDriver(void);
// Code to check HIP errors
void check_cuda_error(void)
{
hipError_t err = hipGetLastError();
if (err != hipSuccess)
{
std::cerr
<< "Error: "
<< hipGetErrorString(err)
<< std::endl;
exit(err);
}
}
// Looper functions place more work inside each work-item.
// The goal is to reduce the per-group dispatch overhead and to give more control over the order of memory operations.
template <typename T>
__global__ void
copy_looper(hipLaunchParm lp, const T * a, T * c, int ARRAY_SIZE)
{
int offset = (hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x);
int stride = hipBlockDim_x * hipGridDim_x;
for (int i=offset; i<ARRAY_SIZE; i+=stride) {
c[i] = a[i];
}
}
template <typename T>
__global__ void
mul_looper(hipLaunchParm lp, T * b, const T * c, int ARRAY_SIZE)
{
int offset = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;
int stride = hipBlockDim_x * hipGridDim_x;
const T scalar = 3.0;
for (int i=offset; i<ARRAY_SIZE; i+=stride) {
b[i] = scalar * c[i];
}
}
template <typename T>
__global__ void
add_looper(hipLaunchParm lp, const T * a, const T * b, T * c, int ARRAY_SIZE)
{
int offset = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;
int stride = hipBlockDim_x * hipGridDim_x;
for (int i=offset; i<ARRAY_SIZE; i+=stride) {
c[i] = a[i] + b[i];
}
}
template <typename T>
__global__ void
triad_looper(hipLaunchParm lp, T * a, const T * b, const T * c, int ARRAY_SIZE)
{
int offset = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;
int stride = hipBlockDim_x * hipGridDim_x;
const T scalar = 3.0;
for (int i=offset; i<ARRAY_SIZE; i+=stride) {
a[i] = b[i] + scalar * c[i];
}
}
template <typename T>
__global__ void
copy(hipLaunchParm lp, const T * a, T * c)
{
const int i = hipBlockDim_x * hipBlockIdx_x + hipThreadIdx_x;
c[i] = a[i];
}
template <typename T>
__global__ void
mul(hipLaunchParm lp, T * b, const T * c)
{
const T scalar = 3.0;
const int i = hipBlockDim_x * hipBlockIdx_x + hipThreadIdx_x;
b[i] = scalar * c[i];
}
template <typename T>
__global__ void
add(hipLaunchParm lp, const T * a, const T * b, T * c)
{
const int i = hipBlockDim_x * hipBlockIdx_x + hipThreadIdx_x;
c[i] = a[i] + b[i];
}
template <typename T>
__global__ void
triad(hipLaunchParm lp, T * a, const T * b, const T * c)
{
const T scalar = 3.0;
const int i = hipBlockDim_x * hipBlockIdx_x + hipThreadIdx_x;
a[i] = b[i] + scalar * c[i];
}
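// Note: hipLaunchParm was the mandatory first kernel parameter in early HIP
// releases; hipLaunchKernel(HIP_KERNEL_NAME(kernel), grid, block, shmem,
// stream, args...) supplies it implicitly at launch, as in main() below.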
int main(int argc, char *argv[])
{
// Print out run information
std::cout
<< "GPU-STREAM" << std::endl
<< "Version: " << VERSION_STRING << std::endl
<< "Implementation: HIP" << std::endl;
parseArguments(argc, argv);
if (NTIMES < 2)
throw std::runtime_error("Chosen number of times is invalid, must be >= 2");
// Config grid size and group size for kernel launching
int gridSize;
if (groups) {
gridSize = groups * groupSize;
} else {
gridSize = ARRAY_SIZE;
}
float operationsPerWorkitem = (float)ARRAY_SIZE / (float)gridSize;
std::cout << "GridSize: " << gridSize << " work-items" << std::endl;
std::cout << "GroupSize: " << groupSize << " work-items" << std::endl;
std::cout << "Operations/Work-item: " << operationsPerWorkitem << std::endl;
if (groups) std::cout << "Using looper kernels:" << std::endl;
std::cout << "Precision: ";
if (useFloat) std::cout << "float";
else std::cout << "double";
std::cout << std::endl << std::endl;
std::cout << "Running kernels " << NTIMES << " times" << std::endl;
if (ARRAY_SIZE % 1024 != 0)
{
unsigned int OLD_ARRAY_SIZE = ARRAY_SIZE;
ARRAY_SIZE -= ARRAY_SIZE % 1024;
std::cout
<< "Warning: array size must divide 1024" << std::endl
<< "Resizing array from " << OLD_ARRAY_SIZE
<< " to " << ARRAY_SIZE << std::endl;
if (ARRAY_SIZE == 0)
throw std::runtime_error("Array size must be >= 1024");
}
// Get precision (used to reset later)
std::streamsize ss = std::cout.precision();
size_t DATATYPE_SIZE;
if (useFloat)
{
DATATYPE_SIZE = sizeof(float);
}
else
{
DATATYPE_SIZE = sizeof(double);
}
// Display number of bytes in array
std::cout << std::setprecision(1) << std::fixed
<< "Array size: " << ARRAY_SIZE*DATATYPE_SIZE/1024.0/1024.0 << " MB"
<< " (=" << ARRAY_SIZE*DATATYPE_SIZE/1024.0/1024.0/1024.0 << " GB)"
<< " " << ARRAY_PAD_BYTES << " bytes padding"
<< std::endl;
std::cout << "Total size: " << 3.0*(ARRAY_SIZE*DATATYPE_SIZE + ARRAY_PAD_BYTES) /1024.0/1024.0 << " MB"
<< " (=" << 3.0*(ARRAY_SIZE*DATATYPE_SIZE + ARRAY_PAD_BYTES) /1024.0/1024.0/1024.0 << " GB)"
<< std::endl;
// Reset precision
std::cout.precision(ss);
// Check device index is in range
int count;
hipGetDeviceCount(&count);
check_cuda_error();
if (deviceIndex >= count)
throw std::runtime_error("Chosen device index is invalid");
hipSetDevice(deviceIndex);
check_cuda_error();
hipDeviceProp_t props;
hipGetDeviceProperties(&props, deviceIndex);
// Print out device name
std::cout << "Using HIP device " << getDeviceName(deviceIndex) << " (compute_units=" << props.multiProcessorCount << ")" << std::endl;
// Print out device HIP driver version
std::cout << "Driver: " << getDriver() << std::endl;
// Check buffers fit on the device
if (props.totalGlobalMem < 3*DATATYPE_SIZE*ARRAY_SIZE)
throw std::runtime_error("Device does not have enough memory for all 3 buffers");
//int cus = props.multiProcessorCount;
// Create host vectors
void * h_a = malloc(ARRAY_SIZE*DATATYPE_SIZE );
void * h_b = malloc(ARRAY_SIZE*DATATYPE_SIZE );
void * h_c = malloc(ARRAY_SIZE*DATATYPE_SIZE );
// Initialise arrays
for (unsigned int i = 0; i < ARRAY_SIZE; i++)
{
if (useFloat)
{
((float*)h_a)[i] = 1.0f;
((float*)h_b)[i] = 2.0f;
((float*)h_c)[i] = 0.0f;
}
else
{
((double*)h_a)[i] = 1.0;
((double*)h_b)[i] = 2.0;
((double*)h_c)[i] = 0.0;
}
}
// Create device buffers
char * d_a, * d_b, *d_c;
hipMalloc(&d_a, ARRAY_SIZE*DATATYPE_SIZE + ARRAY_PAD_BYTES);
check_cuda_error();
hipMalloc(&d_b, ARRAY_SIZE*DATATYPE_SIZE + ARRAY_PAD_BYTES);
d_b += ARRAY_PAD_BYTES;
check_cuda_error();
hipMalloc(&d_c, ARRAY_SIZE*DATATYPE_SIZE + ARRAY_PAD_BYTES);
d_c += ARRAY_PAD_BYTES;
check_cuda_error();
// Copy host memory to device
hipMemcpy(d_a, h_a, ARRAY_SIZE*DATATYPE_SIZE, hipMemcpyHostToDevice);
check_cuda_error();
hipMemcpy(d_b, h_b, ARRAY_SIZE*DATATYPE_SIZE, hipMemcpyHostToDevice);
check_cuda_error();
hipMemcpy(d_c, h_c, ARRAY_SIZE*DATATYPE_SIZE, hipMemcpyHostToDevice);
check_cuda_error();
std::cout << "d_a=" << (void*)d_a << std::endl;
std::cout << "d_b=" << (void*)d_b << std::endl;
std::cout << "d_c=" << (void*)d_c << std::endl;
// Make sure the copies are finished
hipDeviceSynchronize();
check_cuda_error();
// List of times
std::vector< std::vector<double> > timings;
// Declare timers
std::chrono::high_resolution_clock::time_point t1, t2;
// Main loop
for (unsigned int k = 0; k < NTIMES; k++)
{
std::vector<double> times;
t1 = std::chrono::high_resolution_clock::now();
if (groups) {
if (useFloat)
hipLaunchKernel(HIP_KERNEL_NAME(copy_looper<float>), dim3(gridSize), dim3(groupSize), 0, 0, (float*)d_a, (float*)d_c, ARRAY_SIZE);
else
hipLaunchKernel(HIP_KERNEL_NAME(copy_looper<double>), dim3(gridSize), dim3(groupSize), 0, 0, (double*)d_a, (double*)d_c, ARRAY_SIZE);
} else {
if (useFloat)
hipLaunchKernel(HIP_KERNEL_NAME(copy), dim3(ARRAY_SIZE/groupSize), dim3(groupSize), 0, 0, (float*)d_a, (float*)d_c);
else
hipLaunchKernel(HIP_KERNEL_NAME(copy), dim3(ARRAY_SIZE/groupSize), dim3(groupSize), 0, 0, (double*)d_a, (double*)d_c);
}
check_cuda_error();
hipDeviceSynchronize();
check_cuda_error();
t2 = std::chrono::high_resolution_clock::now();
times.push_back(std::chrono::duration_cast<std::chrono::duration<double> >(t2 - t1).count());
t1 = std::chrono::high_resolution_clock::now();
if (groups) {
if (useFloat)
hipLaunchKernel(HIP_KERNEL_NAME(mul_looper), dim3(gridSize), dim3(groupSize), 0, 0, (float*)d_b, (float*)d_c, ARRAY_SIZE);
else
hipLaunchKernel(HIP_KERNEL_NAME(mul_looper), dim3(gridSize), dim3(groupSize), 0, 0, (double*)d_b, (double*)d_c, ARRAY_SIZE);
} else {
if (useFloat)
hipLaunchKernel(HIP_KERNEL_NAME(mul), dim3(ARRAY_SIZE/groupSize), dim3(groupSize), 0, 0, (float*)d_b, (float*)d_c);
else
hipLaunchKernel(HIP_KERNEL_NAME(mul), dim3(ARRAY_SIZE/groupSize), dim3(groupSize), 0, 0, (double*)d_b, (double*)d_c);
}
check_cuda_error();
hipDeviceSynchronize();
check_cuda_error();
t2 = std::chrono::high_resolution_clock::now();
times.push_back(std::chrono::duration_cast<std::chrono::duration<double> >(t2 - t1).count());
t1 = std::chrono::high_resolution_clock::now();
if (groups) {
if (useFloat)
hipLaunchKernel(HIP_KERNEL_NAME(add_looper), dim3(gridSize), dim3(groupSize), 0, 0, (float*)d_a, (float*)d_b, (float*)d_c, ARRAY_SIZE);
else
hipLaunchKernel(HIP_KERNEL_NAME(add_looper), dim3(gridSize), dim3(groupSize), 0, 0, (double*)d_a, (double*)d_b, (double*)d_c, ARRAY_SIZE);
} else {
if (useFloat)
hipLaunchKernel(HIP_KERNEL_NAME(add), dim3(ARRAY_SIZE/groupSize), dim3(groupSize), 0, 0, (float*)d_a, (float*)d_b, (float*)d_c);
else
hipLaunchKernel(HIP_KERNEL_NAME(add), dim3(ARRAY_SIZE/groupSize), dim3(groupSize), 0, 0, (double*)d_a, (double*)d_b, (double*)d_c);
}
check_cuda_error();
hipDeviceSynchronize();
check_cuda_error();
t2 = std::chrono::high_resolution_clock::now();
times.push_back(std::chrono::duration_cast<std::chrono::duration<double> >(t2 - t1).count());
t1 = std::chrono::high_resolution_clock::now();
if (groups) {
if (useFloat)
hipLaunchKernel(HIP_KERNEL_NAME(triad_looper), dim3(gridSize), dim3(groupSize), 0, 0, (float*)d_a, (float*)d_b, (float*)d_c, ARRAY_SIZE);
else
hipLaunchKernel(HIP_KERNEL_NAME(triad_looper), dim3(gridSize), dim3(groupSize), 0, 0, (double*)d_a, (double*)d_b, (double*)d_c, ARRAY_SIZE);
} else {
if (useFloat)
hipLaunchKernel(HIP_KERNEL_NAME(triad), dim3(ARRAY_SIZE/groupSize), dim3(groupSize), 0, 0, (float*)d_a, (float*)d_b, (float*)d_c);
else
hipLaunchKernel(HIP_KERNEL_NAME(triad), dim3(ARRAY_SIZE/groupSize), dim3(groupSize), 0, 0, (double*)d_a, (double*)d_b, (double*)d_c);
}
check_cuda_error();
hipDeviceSynchronize();
check_cuda_error();
t2 = std::chrono::high_resolution_clock::now();
times.push_back(std::chrono::duration_cast<std::chrono::duration<double> >(t2 - t1).count());
timings.push_back(times);
}
// Check solutions
hipMemcpy(h_a, d_a, ARRAY_SIZE*DATATYPE_SIZE, hipMemcpyDeviceToHost);
check_cuda_error();
hipMemcpy(h_b, d_b, ARRAY_SIZE*DATATYPE_SIZE, hipMemcpyDeviceToHost);
check_cuda_error();
hipMemcpy(h_c, d_c, ARRAY_SIZE*DATATYPE_SIZE, hipMemcpyDeviceToHost);
check_cuda_error();
if (useFloat)
{
check_solution<float>(h_a, h_b, h_c);
}
else
{
check_solution<double>(h_a, h_b, h_c);
}
// Crunch results
size_t sizes[4] = {
2 * DATATYPE_SIZE * ARRAY_SIZE,
2 * DATATYPE_SIZE * ARRAY_SIZE,
3 * DATATYPE_SIZE * ARRAY_SIZE,
3 * DATATYPE_SIZE * ARRAY_SIZE
};
double min[4] = {DBL_MAX, DBL_MAX, DBL_MAX, DBL_MAX};
double max[4] = {0.0, 0.0, 0.0, 0.0};
double avg[4] = {0.0, 0.0, 0.0, 0.0};
// Ignore first result
for (unsigned int i = 1; i < NTIMES; i++)
{
for (int j = 0; j < 4; j++)
{
avg[j] += timings[i][j];
min[j] = std::min(min[j], timings[i][j]);
max[j] = std::max(max[j], timings[i][j]);
}
}
for (int j = 0; j < 4; j++) {
avg[j] /= (double)(NTIMES-1);
}
double geomean = 1.0;
for (int j = 0; j < 4; j++) {
geomean *= (sizes[j]/min[j]);
}
geomean = pow(geomean, 0.25);
// Display results
std::string labels[] = {"Copy", "Mul", "Add", "Triad"};
std::cout
<< std::left << std::setw(12) << "Function"
<< std::left << std::setw(12) << "MBytes/sec"
<< std::left << std::setw(12) << "Min (sec)"
<< std::left << std::setw(12) << "Max"
<< std::left << std::setw(12) << "Average"
<< std::endl;
for (int j = 0; j < 4; j++)
{
std::cout
<< std::left << std::setw(12) << labels[j]
<< std::left << std::setw(12) << std::setprecision(3) << 1.0E-06 * sizes[j]/min[j]
<< std::left << std::setw(12) << std::setprecision(5) << min[j]
<< std::left << std::setw(12) << std::setprecision(5) << max[j]
<< std::left << std::setw(12) << std::setprecision(5) << avg[j]
<< std::endl;
}
std::cout
<< std::left << std::setw(12) << "GEOMEAN"
<< std::left << std::setw(12) << std::setprecision(3) << 1.0E-06 * geomean
<< std::endl;
// Free host vectors
free(h_a);
free(h_b);
free(h_c);
// Free cuda buffers
hipFree(d_a);
check_cuda_error();
hipFree(d_b);
check_cuda_error();
hipFree(d_c);
check_cuda_error();
}
std::string getDeviceName(int device)
{
struct hipDeviceProp_t prop;
hipGetDeviceProperties(&prop, device);
check_cuda_error();
return std::string(prop.name);
}
int getDriver(void)
{
int driver;
hipDriverGetVersion(&driver);
check_cuda_error();
return driver;
}
void listDevices(void)
{
// Get number of devices
int count;
hipGetDeviceCount(&count);
check_cuda_error();
// Print device names
if (count == 0)
{
std::cout << "No devices found." << std::endl;
}
else
{
std::cout << std::endl;
std::cout << "Devices:" << std::endl;
for (int i = 0; i < count; i++)
{
std::cout << i << ": " << getDeviceName(i) << std::endl;
check_cuda_error();
}
std::cout << std::endl;
}
}

332
main.cpp Normal file

@ -0,0 +1,332 @@
// Copyright (c) 2015-16 Tom Deakin, Simon McIntosh-Smith,
// University of Bristol HPC
//
// For full license terms please see the LICENSE file distributed with this
// source code
#include <iostream>
#include <vector>
#include <numeric>
#include <cmath>
#include <limits>
#include <chrono>
#include <algorithm>
#include <iomanip>
#include <cstring>
#include "common.h"
#include "Stream.h"
#if defined(CUDA)
#include "CUDAStream.h"
#elif defined(OCL)
#include "OCLStream.h"
#elif defined(USE_RAJA)
#include "RAJAStream.hpp"
#elif defined(KOKKOS)
#include "KOKKOSStream.hpp"
#elif defined(ACC)
#include "ACCStream.h"
#elif defined(SYCL)
#include "SYCLStream.h"
#elif defined(OMP3)
#include "OMP3Stream.h"
#elif defined(OMP45)
#include "OMP45Stream.h"
#endif
// Default size of 2^25
unsigned int ARRAY_SIZE = 33554432;
unsigned int num_times = 100;
unsigned int deviceIndex = 0;
bool use_float = false;
template <typename T>
void check_solution(const unsigned int ntimes, std::vector<T>& a, std::vector<T>& b, std::vector<T>& c);
template <typename T>
void run();
void parseArguments(int argc, char *argv[]);
int main(int argc, char *argv[])
{
std::cout
<< "GPU-STREAM" << std::endl
<< "Version: " << VERSION_STRING << std::endl
<< "Implementation: " << IMPLEMENTATION_STRING << std::endl;
parseArguments(argc, argv);
// TODO: Fix SYCL to allow multiple template specializations
#ifndef SYCL
#ifndef KOKKOS
if (use_float)
run<float>();
else
#endif
#endif
run<double>();
}
template <typename T>
void run()
{
std::cout << "Running kernels " << num_times << " times" << std::endl;
if (sizeof(T) == sizeof(float))
std::cout << "Precision: float" << std::endl;
else
std::cout << "Precision: double" << std::endl;
// Create host vectors
std::vector<T> a(ARRAY_SIZE, 0.1);
std::vector<T> b(ARRAY_SIZE, 0.2);
std::vector<T> c(ARRAY_SIZE, 0.0);
std::streamsize ss = std::cout.precision();
std::cout << std::setprecision(1) << std::fixed
<< "Array size: " << ARRAY_SIZE*sizeof(T)*1.0E-6 << " MB"
<< " (=" << ARRAY_SIZE*sizeof(T)*1.0E-9 << " GB)" << std::endl;
std::cout << "Total size: " << 3.0*ARRAY_SIZE*sizeof(T)*1.0E-6 << " MB"
<< " (=" << 3.0*ARRAY_SIZE*sizeof(T)*1.0E-9 << " GB)" << std::endl;
std::cout.precision(ss);
Stream<T> *stream;
#if defined(CUDA)
// Use the CUDA implementation
stream = new CUDAStream<T>(ARRAY_SIZE, deviceIndex);
#elif defined(OCL)
// Use the OpenCL implementation
stream = new OCLStream<T>(ARRAY_SIZE, deviceIndex);
#elif defined(USE_RAJA)
// Use the RAJA implementation
stream = new RAJAStream<T>(ARRAY_SIZE, deviceIndex);
#elif defined(KOKKOS)
// Use the Kokkos implementation
stream = new KOKKOSStream<T>(ARRAY_SIZE, deviceIndex);
#elif defined(ACC)
// Use the OpenACC implementation
stream = new ACCStream<T>(ARRAY_SIZE, a.data(), b.data(), c.data(), deviceIndex);
#elif defined(SYCL)
// Use the SYCL implementation
stream = new SYCLStream<T>(ARRAY_SIZE, deviceIndex);
#elif defined(OMP3)
// Use the "reference" OpenMP 3 implementation
stream = new OMP3Stream<T>(ARRAY_SIZE, a.data(), b.data(), c.data());
#elif defined(OMP45)
// Use the "reference" OpenMP 3 implementation
stream = new OMP45Stream<T>(ARRAY_SIZE, a.data(), b.data(), c.data(), deviceIndex);
#endif
stream->write_arrays(a, b, c);
// List of times
std::vector<std::vector<double>> timings(4);
// Declare timers
std::chrono::high_resolution_clock::time_point t1, t2;
// Main loop
for (unsigned int k = 0; k < num_times; k++)
{
// Execute Copy
t1 = std::chrono::high_resolution_clock::now();
stream->copy();
t2 = std::chrono::high_resolution_clock::now();
timings[0].push_back(std::chrono::duration_cast<std::chrono::duration<double> >(t2 - t1).count());
// Execute Mul
t1 = std::chrono::high_resolution_clock::now();
stream->mul();
t2 = std::chrono::high_resolution_clock::now();
timings[1].push_back(std::chrono::duration_cast<std::chrono::duration<double> >(t2 - t1).count());
// Execute Add
t1 = std::chrono::high_resolution_clock::now();
stream->add();
t2 = std::chrono::high_resolution_clock::now();
timings[2].push_back(std::chrono::duration_cast<std::chrono::duration<double> >(t2 - t1).count());
// Execute Triad
t1 = std::chrono::high_resolution_clock::now();
stream->triad();
t2 = std::chrono::high_resolution_clock::now();
timings[3].push_back(std::chrono::duration_cast<std::chrono::duration<double> >(t2 - t1).count());
}
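// At this point timings[0..3] hold one time per iteration for Copy, Mul, Add
// and Triad respectively; the first (warm-up) iteration is discarded when the
// statistics are computed below.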
// Check solutions
stream->read_arrays(a, b, c);
check_solution<T>(num_times, a, b, c);
// Display timing results
std::cout
<< std::left << std::setw(12) << "Function"
<< std::left << std::setw(12) << "MBytes/sec"
<< std::left << std::setw(12) << "Min (sec)"
<< std::left << std::setw(12) << "Max"
<< std::left << std::setw(12) << "Average" << std::endl;
std::cout << std::fixed;
std::string labels[4] = {"Copy", "Mul", "Add", "Triad"};
size_t sizes[4] = {
2 * sizeof(T) * ARRAY_SIZE,
2 * sizeof(T) * ARRAY_SIZE,
3 * sizeof(T) * ARRAY_SIZE,
3 * sizeof(T) * ARRAY_SIZE
};
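// Copy and Mul each move two arrays (one read, one write) while Add and Triad
// move three (two reads, one write), hence the 2x and 3x byte counts above.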
for (int i = 0; i < 4; i++)
{
// Get min/max; ignore the first result
auto minmax = std::minmax_element(timings[i].begin()+1, timings[i].end());
// Calculate average; ignore the first result
double average = std::accumulate(timings[i].begin()+1, timings[i].end(), 0.0) / (double)(num_times - 1);
// Display results
std::cout
<< std::left << std::setw(12) << labels[i]
<< std::left << std::setw(12) << std::setprecision(3) << 1.0E-6 * sizes[i] / (*minmax.first)
<< std::left << std::setw(12) << std::setprecision(5) << *minmax.first
<< std::left << std::setw(12) << std::setprecision(5) << *minmax.second
<< std::left << std::setw(12) << std::setprecision(5) << average
<< std::endl;
}
delete stream;
}
template <typename T>
void check_solution(const unsigned int ntimes, std::vector<T>& a, std::vector<T>& b, std::vector<T>& c)
{
// Generate correct solution
T goldA = 0.1;
T goldB = 0.2;
T goldC = 0.0;
const T scalar = 0.3;
for (unsigned int i = 0; i < ntimes; i++)
{
// Do STREAM!
goldC = goldA;
goldB = scalar * goldC;
goldC = goldA + goldB;
goldA = goldB + scalar * goldC;
}
// Calculate the average error
double errA = std::accumulate(a.begin(), a.end(), 0.0, [&](double sum, const T val){ return sum + fabs(val - goldA); });
errA /= a.size();
double errB = std::accumulate(b.begin(), b.end(), 0.0, [&](double sum, const T val){ return sum + fabs(val - goldB); });
errB /= b.size();
double errC = std::accumulate(c.begin(), c.end(), 0.0, [&](double sum, const T val){ return sum + fabs(val - goldC); });
errC /= c.size();
double epsi = std::numeric_limits<T>::epsilon() * 100.0;
if (errA > epsi)
std::cerr
<< "Validation failed on a[]. Average error " << errA
<< std::endl;
if (errB > epsi)
std::cerr
<< "Validation failed on b[]. Average error " << errB
<< std::endl;
if (errC > epsi)
std::cerr
<< "Validation failed on c[]. Average error " << errC
<< std::endl;
}
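// The pass threshold of 100 epsilons is roughly 1.2e-5 for float and 2.2e-14
// for double, generous enough to absorb reordering of the floating-point
// operations on the device.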
int parseUInt(const char *str, unsigned int *output)
{
char *next;
*output = strtoul(str, &next, 10);
return !strlen(next);
}
void parseArguments(int argc, char *argv[])
{
for (int i = 1; i < argc; i++)
{
if (!std::string("--list").compare(argv[i]))
{
listDevices();
exit(EXIT_SUCCESS);
}
else if (!std::string("--device").compare(argv[i]))
{
if (++i >= argc || !parseUInt(argv[i], &deviceIndex))
{
std::cerr << "Invalid device index." << std::endl;
exit(EXIT_FAILURE);
}
}
else if (!std::string("--arraysize").compare(argv[i]) ||
!std::string("-s").compare(argv[i]))
{
if (++i >= argc || !parseUInt(argv[i], &ARRAY_SIZE))
{
std::cerr << "Invalid array size." << std::endl;
exit(EXIT_FAILURE);
}
}
else if (!std::string("--numtimes").compare(argv[i]) ||
!std::string("-n").compare(argv[i]))
{
if (++i >= argc || !parseUInt(argv[i], &num_times))
{
std::cerr << "Invalid number of times." << std::endl;
exit(EXIT_FAILURE);
}
if (num_times < 2)
{
std::cerr << "Number of times must be 2 or more" << std::endl;
exit(EXIT_FAILURE);
}
}
else if (!std::string("--float").compare(argv[i]))
{
use_float = true;
}
else if (!std::string("--help").compare(argv[i]) ||
!std::string("-h").compare(argv[i]))
{
std::cout << std::endl;
std::cout << "Usage: " << argv[0] << " [OPTIONS]" << std::endl << std::endl;
std::cout << "Options:" << std::endl;
std::cout << " -h --help Print the message" << std::endl;
std::cout << " --list List available devices" << std::endl;
std::cout << " --device INDEX Select device at INDEX" << std::endl;
std::cout << " -s --arraysize SIZE Use SIZE elements in the array" << std::endl;
std::cout << " -n --numtimes NUM Run the test NUM times (NUM >= 2)" << std::endl;
std::cout << " --float Use floats (rather than doubles)" << std::endl;
std::cout << std::endl;
exit(EXIT_SUCCESS);
}
else
{
std::cerr << "Unrecognized argument '" << argv[i] << "' (try '--help')"
<< std::endl;
exit(EXIT_FAILURE);
}
}
}
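For reference, a typical invocation of one of the resulting binaries (the device index and sizes here are illustrative) is:
./gpu-stream-cuda --device 0 --arraysize 33554432 --numtimes 100 --float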


@ -1,70 +0,0 @@
/*=============================================================================
*------------------------------------------------------------------------------
* Copyright 2015: Tom Deakin, Simon McIntosh-Smith, University of Bristol HPC
* Based on John D. McCalpin's original STREAM benchmark for CPUs
*------------------------------------------------------------------------------
* License:
* 1. You are free to use this program and/or to redistribute
* this program.
* 2. You are free to modify this program for your own use,
* including commercial use, subject to the publication
* restrictions in item 3.
* 3. You are free to publish results obtained from running this
* program, or from works that you derive from this program,
* with the following limitations:
* 3a. In order to be referred to as "GPU-STREAM benchmark results",
* published results must be in conformance to the GPU-STREAM
* Run Rules published at
* http://github.com/UoB-HPC/GPU-STREAM/wiki/Run-Rules
* and incorporated herein by reference.
* The copyright holders retain the
* right to determine conformity with the Run Rules.
* 3b. Results based on modified source code or on runs not in
* accordance with the GPU-STREAM Run Rules must be clearly
* labelled whenever they are published. Examples of
* proper labelling include:
* "tuned GPU-STREAM benchmark results"
* "based on a variant of the GPU-STREAM benchmark code"
* Other comparable, clear and reasonable labelling is
* acceptable.
* 3c. Submission of results to the GPU-STREAM benchmark web site
* is encouraged, but not required.
* 4. Use of this program or creation of derived works based on this
* program constitutes acceptance of these licensing restrictions.
* 5. Absolutely no warranty is expressed or implied.
*-----------------------------------------*/
#ifdef FLOAT
#define DATATYPE float
constant DATATYPE scalar = 3.0f;
#else
#pragma OPENCL EXTENSION cl_khr_fp64 : enable
#define DATATYPE double
constant DATATYPE scalar = 3.0;
#endif
kernel void copy(global const DATATYPE * restrict a, global DATATYPE * restrict c)
{
const size_t i = get_global_id(0);
c[i] = a[i];
}
kernel void mul(global DATATYPE * restrict b, global const DATATYPE * restrict c)
{
const size_t i = get_global_id(0);
b[i] = scalar * c[i];
}
kernel void add(global const DATATYPE * restrict a, global const DATATYPE * restrict b, global DATATYPE * restrict c)
{
const size_t i = get_global_id(0);
c[i] = a[i] + b[i];
}
kernel void triad(global DATATYPE * restrict a, global const DATATYPE * restrict b, global const DATATYPE * restrict c)
{
const size_t i = get_global_id(0);
a[i] = b[i] + scalar * c[i];
}
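// Note: the host driver selects DATATYPE at build time, passing "-DFLOAT" to
// program.build() when --float is requested; otherwise the double path above
// enables the cl_khr_fp64 extension.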


@ -1,488 +0,0 @@
/*=============================================================================
*------------------------------------------------------------------------------
* Copyright 2015: Tom Deakin, Simon McIntosh-Smith, University of Bristol HPC
* Based on John D. McCalpin's original STREAM benchmark for CPUs
*------------------------------------------------------------------------------
* License:
* 1. You are free to use this program and/or to redistribute
* this program.
* 2. You are free to modify this program for your own use,
* including commercial use, subject to the publication
* restrictions in item 3.
* 3. You are free to publish results obtained from running this
* program, or from works that you derive from this program,
* with the following limitations:
* 3a. In order to be referred to as "GPU-STREAM benchmark results",
* published results must be in conformance to the GPU-STREAM
* Run Rules published at
* http://github.com/UoB-HPC/GPU-STREAM/wiki/Run-Rules
* and incorporated herein by reference.
* The copyright holders retain the
* right to determine conformity with the Run Rules.
* 3b. Results based on modified source code or on runs not in
* accordance with the GPU-STREAM Run Rules must be clearly
* labelled whenever they are published. Examples of
* proper labelling include:
* "tuned GPU-STREAM benchmark results"
* "based on a variant of the GPU-STREAM benchmark code"
* Other comparable, clear and reasonable labelling is
* acceptable.
* 3c. Submission of results to the GPU-STREAM benchmark web site
* is encouraged, but not required.
* 4. Use of this program or creation of derived works based on this
* program constitutes acceptance of these licensing restrictions.
* 5. Absolutely no warranty is expressed or implied.
*-----------------------------------------*/
#include <iostream>
#include <fstream>
#include <vector>
#include <chrono>
#include <cfloat>
#include <cmath>
#define CL_HPP_ENABLE_EXCEPTIONS
#define CL_HPP_MINIMUM_OPENCL_VERSION 110
#define CL_HPP_TARGET_OPENCL_VERSION 110
#include "CL/cl2.hpp"
#include "common.h"
std::string getDeviceName(const cl::Device& device);
std::string getDeviceDriver(const cl::Device& device);
unsigned getDeviceList(std::vector<cl::Device>& devices);
// Print error and exit
void die(std::string msg, cl::Error& e)
{
std::cerr
<< "Error: "
<< msg
<< ": " << e.what()
<< "(" << e.err() << ")"
<< std::endl;
exit(e.err());
}
int main(int argc, char *argv[])
{
// Print out run information
std::cout
<< "GPU-STREAM" << std::endl
<< "Version: " << VERSION_STRING << std::endl
<< "Implementation: OpenCL" << std::endl;
std::string status;
try
{
parseArguments(argc, argv);
if (NTIMES < 2)
throw std::runtime_error("Chosen number of times is invalid, must be >= 2");
std::cout << "Precision: ";
if (useFloat) std::cout << "float";
else std::cout << "double";
std::cout << std::endl << std::endl;
std::cout << "Running kernels " << NTIMES << " times" << std::endl;
if (ARRAY_SIZE % 1024 != 0)
{
unsigned int OLD_ARRAY_SIZE = ARRAY_SIZE;
ARRAY_SIZE -= ARRAY_SIZE % 1024;
std::cout
<< "Warning: array size must divide 1024" << std::endl
<< "Resizing array from " << OLD_ARRAY_SIZE
<< " to " << ARRAY_SIZE << std::endl;
if (ARRAY_SIZE == 0)
throw std::runtime_error("Array size must be >= 1024");
}
// Get precision (used to reset later)
std::streamsize ss = std::cout.precision();
size_t DATATYPE_SIZE;
if (useFloat)
{
DATATYPE_SIZE = sizeof(float);
}
else
{
DATATYPE_SIZE = sizeof(double);
}
// Display number of bytes in array
std::cout << std::setprecision(1) << std::fixed
<< "Array size: " << ARRAY_SIZE*DATATYPE_SIZE/1024.0/1024.0 << " MB"
<< " (=" << ARRAY_SIZE*DATATYPE_SIZE/1024.0/1024.0/1024.0 << " GB)"
<< std::endl;
std::cout << "Total size: " << 3.0*ARRAY_SIZE*DATATYPE_SIZE/1024.0/1024.0 << " MB"
<< " (=" << 3.0*ARRAY_SIZE*DATATYPE_SIZE/1024.0/1024.0/1024.0 << " GB)"
<< std::endl;
// Reset precision
std::cout.precision(ss);
// Open the Kernel source
std::string kernels;
{
std::ifstream in("ocl-stream-kernels.cl");
if (!in.is_open())
throw std::runtime_error("Cannot open kernel file");
kernels = std::string (std::istreambuf_iterator<char>(in), (std::istreambuf_iterator<char>()));
}
// Setup OpenCL
// Get list of devices
std::vector<cl::Device> devices;
getDeviceList(devices);
// Check device index is in range
if (deviceIndex >= devices.size())
throw std::runtime_error("Chosen device index is invalid");
cl::Device device = devices[deviceIndex];
status = "Creating context";
cl::Context context(device);
status = "Creating queue";
cl::CommandQueue queue(context);
status = "Creating program";
cl::Program program(context, kernels);
// Print out device name
std::string name = getDeviceName(device);
std::cout << "Using OpenCL device " << name << std::endl;
// Print out OpenCL driver version for this device
std::string driver = getDeviceDriver(device);
std::cout << "Driver: " << driver << std::endl;
// Check device can do double precision if requested
if (!useFloat && !device.getInfo<CL_DEVICE_DOUBLE_FP_CONFIG>())
throw std::runtime_error("Device does not support double precision, please use --float");
// Check buffers fit on the device
status = "Getting device memory sizes";
cl_ulong totalmem = device.getInfo<CL_DEVICE_GLOBAL_MEM_SIZE>();
cl_ulong maxbuffer = device.getInfo<CL_DEVICE_MAX_MEM_ALLOC_SIZE>();
if (maxbuffer < DATATYPE_SIZE*ARRAY_SIZE)
throw std::runtime_error("Device cannot allocate a buffer big enough");
if (totalmem < 3*DATATYPE_SIZE*ARRAY_SIZE)
throw std::runtime_error("Device does not have enough memory for all 3 buffers");
try
{
std::string options = "";
if (useFloat)
options = "-DFLOAT";
program.build(options.c_str());
}
catch (cl::Error& e)
{
std::vector<cl::Device> devices = context.getInfo<CL_CONTEXT_DEVICES>();
std::string buildlog = program.getBuildInfo<CL_PROGRAM_BUILD_LOG>(devices[0]);
std::cerr
<< "Build error:"
<< buildlog
<< std::endl;
throw e;
}
status = "Making kernel copy";
auto copy = cl::KernelFunctor<cl::Buffer&, cl::Buffer&>(program, "copy");
status = "Making kernel mul";
auto mul = cl::KernelFunctor<cl::Buffer&, cl::Buffer&>(program, "mul");
status = "Making kernel add";
auto add = cl::KernelFunctor<cl::Buffer&, cl::Buffer&, cl::Buffer&>(program, "add");
status = "Making kernel triad";
auto triad = cl::KernelFunctor<cl::Buffer&, cl::Buffer&, cl::Buffer&>(program, "triad");
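// cl::KernelFunctor wraps clSetKernelArg and clEnqueueNDRangeKernel so that
// each kernel can be launched like an ordinary function, e.g.
//   copy(cl::EnqueueArgs(queue, cl::NDRange(ARRAY_SIZE)), d_a, d_c);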
// Create host vectors
void *h_a = malloc(ARRAY_SIZE * DATATYPE_SIZE);
void *h_b = malloc(ARRAY_SIZE * DATATYPE_SIZE);
void *h_c = malloc(ARRAY_SIZE * DATATYPE_SIZE);
// Initialise arrays
for (unsigned int i = 0; i < ARRAY_SIZE; i++)
{
if (useFloat)
{
((float*)h_a)[i] = 1.0f;
((float*)h_b)[i] = 2.0f;
((float*)h_c)[i] = 0.0f;
}
else
{
((double*)h_a)[i] = 1.0;
((double*)h_b)[i] = 2.0;
((double*)h_c)[i] = 0.0;
}
}
// Create device buffers
status = "Creating buffers";
cl::Buffer d_a(context, CL_MEM_READ_WRITE, DATATYPE_SIZE * ARRAY_SIZE);
cl::Buffer d_b(context, CL_MEM_READ_WRITE, DATATYPE_SIZE * ARRAY_SIZE);
cl::Buffer d_c(context, CL_MEM_READ_WRITE, DATATYPE_SIZE * ARRAY_SIZE);
// Copy host memory to device
status = "Copying buffers";
queue.enqueueWriteBuffer(d_a, CL_FALSE, 0, ARRAY_SIZE*DATATYPE_SIZE, h_a);
queue.enqueueWriteBuffer(d_b, CL_FALSE, 0, ARRAY_SIZE*DATATYPE_SIZE, h_b);
queue.enqueueWriteBuffer(d_c, CL_FALSE, 0, ARRAY_SIZE*DATATYPE_SIZE, h_c);
// Make sure the copies are finished
queue.finish();
// List of times
std::vector< std::vector<double> > timings;
// Declare timers
std::chrono::high_resolution_clock::time_point t1, t2;
// Main loop
for (unsigned int k = 0; k < NTIMES; k++)
{
status = "Executing copy";
std::vector<double> times;
t1 = std::chrono::high_resolution_clock::now();
copy(
cl::EnqueueArgs(
queue,
cl::NDRange(ARRAY_SIZE)),
d_a, d_c);
queue.finish();
t2 = std::chrono::high_resolution_clock::now();
times.push_back(std::chrono::duration_cast<std::chrono::duration<double> >(t2 - t1).count());
status = "Executing mul";
t1 = std::chrono::high_resolution_clock::now();
mul(
cl::EnqueueArgs(
queue,
cl::NDRange(ARRAY_SIZE)),
d_b, d_c);
queue.finish();
t2 = std::chrono::high_resolution_clock::now();
times.push_back(std::chrono::duration_cast<std::chrono::duration<double> >(t2 - t1).count());
status = "Executing add";
t1 = std::chrono::high_resolution_clock::now();
add(
cl::EnqueueArgs(
queue,
cl::NDRange(ARRAY_SIZE)),
d_a, d_b, d_c);
queue.finish();
t2 = std::chrono::high_resolution_clock::now();
times.push_back(std::chrono::duration_cast<std::chrono::duration<double> >(t2 - t1).count());
status = "Executing triad";
t1 = std::chrono::high_resolution_clock::now();
triad(
cl::EnqueueArgs(
queue,
cl::NDRange(ARRAY_SIZE)),
d_a, d_b, d_c);
queue.finish();
t2 = std::chrono::high_resolution_clock::now();
times.push_back(std::chrono::duration_cast<std::chrono::duration<double> >(t2 - t1).count());
timings.push_back(times);
}
// Check solutions
status = "Copying back buffers";
queue.enqueueReadBuffer(d_a, CL_FALSE, 0, ARRAY_SIZE*DATATYPE_SIZE, h_a);
queue.enqueueReadBuffer(d_b, CL_FALSE, 0, ARRAY_SIZE*DATATYPE_SIZE, h_b);
queue.enqueueReadBuffer(d_c, CL_FALSE, 0, ARRAY_SIZE*DATATYPE_SIZE, h_c);
queue.finish();
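// check_solution (defined elsewhere in this file) is assumed to replay the
// four operations on the host for NTIMES iterations and compare the device
// results against the expected values within a tolerance.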
if (useFloat)
{
check_solution<float>(h_a, h_b, h_c);
}
else
{
check_solution<double>(h_a, h_b, h_c);
}
// Crunch results
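// Bytes moved per kernel invocation: copy and mul touch two arrays
// (one read, one write); add and triad touch three (two reads, one write)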
size_t sizes[4] = {
2 * DATATYPE_SIZE * ARRAY_SIZE,
2 * DATATYPE_SIZE * ARRAY_SIZE,
3 * DATATYPE_SIZE * ARRAY_SIZE,
3 * DATATYPE_SIZE * ARRAY_SIZE
};
double min[4] = {DBL_MAX, DBL_MAX, DBL_MAX, DBL_MAX};
double max[4] = {0.0, 0.0, 0.0, 0.0};
double avg[4] = {0.0, 0.0, 0.0, 0.0};
// Ignore first result
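// (the first iteration typically pays one-off costs such as lazy allocation
// and data migration, so statistics start at i = 1)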
for (unsigned int i = 1; i < NTIMES; i++)
{
for (int j = 0; j < 4; j++)
{
avg[j] += timings[i][j];
min[j] = std::min(min[j], timings[i][j]);
max[j] = std::max(max[j], timings[i][j]);
}
}
for (int j = 0; j < 4; j++)
avg[j] /= (double)(NTIMES-1);
// Display results
std::string labels[] = {"Copy", "Mul", "Add", "Triad"};
std::cout
<< std::left << std::setw(12) << "Function"
<< std::left << std::setw(12) << "MBytes/sec"
<< std::left << std::setw(12) << "Min (sec)"
<< std::left << std::setw(12) << "Max"
<< std::left << std::setw(12) << "Average"
<< std::endl;
for (int j = 0; j < 4; j++)
{
std::cout
<< std::left << std::setw(12) << labels[j]
<< std::left << std::setw(12) << std::setprecision(3) << 1.0E-06 * sizes[j]/min[j]
<< std::left << std::setw(12) << std::setprecision(5) << min[j]
<< std::left << std::setw(12) << std::setprecision(5) << max[j]
<< std::left << std::setw(12) << std::setprecision(5) << avg[j]
<< std::endl;
}
// Free host vectors
free(h_a);
free(h_b);
free(h_c);
}
catch (cl::Error &e)
{
die(status, e);
}
catch (std::exception& e)
{
std::cerr
<< "Error: "
<< e.what()
<< std::endl;
exit(EXIT_FAILURE);
}
}
unsigned getDeviceList(std::vector<cl::Device>& devices)
{
// Get list of platforms
std::vector<cl::Platform> platforms;
try
{
cl::Platform::get(&platforms);
}
catch (cl::Error &e)
{
die("Getting platforms", e);
}
// Enumerate devices
for (unsigned int i = 0; i < platforms.size(); i++)
{
std::vector<cl::Device> plat_devices;
try
{
platforms[i].getDevices(CL_DEVICE_TYPE_ALL, &plat_devices);
}
catch (cl::Error &e)
{
die("Getting devices", e);
}
devices.insert(devices.end(), plat_devices.begin(), plat_devices.end());
}
return devices.size();
}
std::string getDeviceName(const cl::Device& device)
{
std::string name;
cl_device_info info = CL_DEVICE_NAME;
try
{
// Special case for AMD
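// (CL_DEVICE_BOARD_NAME_AMD, when available, reports the board's marketing
// name, which is more recognisable than the codename from CL_DEVICE_NAME)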
#ifdef CL_DEVICE_BOARD_NAME_AMD
device.getInfo(CL_DEVICE_VENDOR, &name);
if (strstr(name.c_str(), "Advanced Micro Devices"))
info = CL_DEVICE_BOARD_NAME_AMD;
#endif
device.getInfo(info, &name);
}
catch (cl::Error &e)
{
die("Getting device name", e);
}
return name;
}
std::string getDeviceDriver(const cl::Device& device)
{
std::string driver;
try
{
device.getInfo(CL_DRIVER_VERSION, &driver);
}
catch (cl::Error &e)
{
die("Getting device driver", e);
}
return driver;
}
void listDevices(void)
{
// Get list of devices
std::vector<cl::Device> devices;
getDeviceList(devices);
// Print device names
if (devices.size() == 0)
{
std::cout << "No devices found." << std::endl;
}
else
{
std::cout << std::endl;
std::cout << "Devices:" << std::endl;
for (unsigned i = 0; i < devices.size(); i++)
{
std::cout << i << ": " << getDeviceName(devices[i]) << std::endl;
}
std::cout << std::endl;
}
}

View File

@ -1,4 +0,0 @@
./gpu-stream-cuda
./gpu-stream-cuda --groups 64 --groupSize 256
./gpu-stream-cuda --float
./gpu-stream-cuda --float --groups 64 --groupSize 256

View File

@ -1,4 +0,0 @@
./gpu-stream-hip
./gpu-stream-hip --groups 64 --groupSize 256
./gpu-stream-hip --float
./gpu-stream-hip --float --groups 64 --groupSize 256