From 7621f867017e1c4e4a503af56744fb186698f360 Mon Sep 17 00:00:00 2001 From: Peter Steinbach Date: Tue, 3 Jan 2017 11:43:12 +0100 Subject: [PATCH] added pure HC gpu stream implementation --- CMakeLists.txt | 29 +++++++ HCStream.cpp | 213 +++++++++++++++++++++++++++++++++++++++++++++++++ HCStream.h | 44 ++++++++++ main.cpp | 6 ++ 4 files changed, 292 insertions(+) create mode 100644 HCStream.cpp create mode 100644 HCStream.h diff --git a/CMakeLists.txt b/CMakeLists.txt index efee733..5551c54 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -10,6 +10,15 @@ if(NOT DEFINED HIP_PATH) endif() set(CMAKE_MODULE_PATH "${HIP_PATH}/cmake" ${CMAKE_MODULE_PATH}) +if(NOT DEFINED HCC_PATH) + if(NOT DEFINED ENV{HCC_PATH}) + set(HCC_PATH "/opt/rocm/" CACHE PATH "Path to which HCC has been installed") + else() + set(HCC_PATH $ENV{HCC_PATH} CACHE PATH "Path to which HCC has been installed") + endif() +endif() + + set(CMAKE_CXX_STANDARD 11) set(CMAKE_CXX_STANDARD_REQUIRED ON) @@ -49,6 +58,26 @@ else() message("Skipping HIP...") endif() +#------------------------------------------------------------------------------- +# HCC +#------------------------------------------------------------------------------- +find_program(HCC_BINARY hcc HINTS ${HCC_PATH}/bin PATHS ${HCC_PATH}/bin) +if(EXISTS ${HCC_BINARY}) + #can the following be tied to the target only? 
+ set(CMAKE_CXX_COMPILER ${HCC_BINARY}) + # list(APPEND CMAKE_CXX_FLAGS -hc -I /usr/include/c++/v1 -std=c++11 -stdlib=libc++ -I/opt/rocm/hcc-lc/include) + # list(APPEND CMAKE_EXE_LINKER_FLAGS -hc -std=c++11 -L/opt/rocm/hcc-lc/lib -Wl,--rpath=/opt/rocm/hcc-lc/lib -lc++ -lc++abi -ldl -lpthread -Wl,--whole-archive -lmcwamp -Wl,--no-whole-archive -mcmodel=small ) + add_executable(gpu-stream-hc main.cpp HCStream.cpp) + set_target_properties(gpu-stream-hc PROPERTIES + COMPILE_FLAGS "-hc -I /usr/include/c++/v1 -std=c++11 -stdlib=libc++ -I/opt/rocm/hcc-lc/include" + LINK_FLAGS "-hc -std=c++11 -L/opt/rocm/hcc-lc/lib -Wl,--rpath=/opt/rocm/hcc-lc/lib -lc++ -lc++abi -ldl -lpthread -Wl,--whole-archive -lmcwamp -Wl,--no-whole-archive -mcmodel=small" + ) + target_compile_definitions(gpu-stream-hc PUBLIC HC) +else() + message("Skipping HC...") +endif() + + #------------------------------------------------------------------------------- # CUDA #------------------------------------------------------------------------------- diff --git a/HCStream.cpp b/HCStream.cpp new file mode 100644 index 0000000..0c55af3 --- /dev/null +++ b/HCStream.cpp @@ -0,0 +1,213 @@ +// Copyright (c) 2015-16 Peter Steinbach, MPI CBG Scientific Computing Facility +// +// For full license terms please see the LICENSE file distributed with this +// source code + + +#include +#include +#include + + +#include "HCStream.h" +//#include "hc.hpp" + +#define TBSIZE 1024 + +std::string getDeviceName(const hc::accelerator& _acc) +{ + std::wstring_convert, wchar_t> converter; + std::string value = converter.to_bytes(_acc.get_description()); + return value; +} + +void listDevices(void) +{ + // Get number of devices + std::vector accs = hc::accelerator::get_all(); + + // Print device names + if (accs.empty()) + { + std::cerr << "No devices found." 
<< std::endl; + } + else + { + std::cout << std::endl; + std::cout << "Devices:" << std::endl; + for (int i = 0; i < accs.size(); i++) + { + std::cout << i << ": " << getDeviceName(accs[i]) << std::endl; + } + std::cout << std::endl; + } +} + +// void check_error(void) +// { +// hipError_t err = hipGetLastError(); +// if (err != hipSuccess) +// { +// std::cerr << "Error: " << hipGetErrorString(err) << std::endl; +// exit(err); +// } +// } + +template +HCStream::HCStream(const unsigned int ARRAY_SIZE, const int device_index): + array_size(ARRAY_SIZE), + d_a(ARRAY_SIZE), + d_b(ARRAY_SIZE), + d_c(ARRAY_SIZE) +{ + + // The array size must be divisible by TBSIZE for kernel launches + if (ARRAY_SIZE % TBSIZE != 0) + { + std::stringstream ss; + ss << "Array size must be a multiple of " << TBSIZE; + throw std::runtime_error(ss.str()); + } + + // // Set device + std::vector accs = hc::accelerator::get_all(); + auto current = accs[device_index]; + + std::cout << "Using HC device " << getDeviceName(current) << std::endl; + + // // The array size must be divisible by TBSIZE for kernel launches + // if (ARRAY_SIZE % TBSIZE != 0) + // { + // std::stringstream ss; + // ss << "Array size must be a multiple of " << TBSIZE; + // throw std::runtime_error(ss.str()); + // } + + // // Set device + // int count; + // hipGetDeviceCount(&count); + // check_error(); + // if (device_index >= count) + // throw std::runtime_error("Invalid device index"); + // hipSetDevice(device_index); + // check_error(); + + // // Print out device information + // std::cout << "Using HIP device " << getDeviceName(device_index) << std::endl; + // std::cout << "Driver: " << getDeviceDriver(device_index) << std::endl; + + // array_size = ARRAY_SIZE; + + // // Check buffers fit on the device + // hipDeviceProp_t props; + // hipGetDeviceProperties(&props, 0); + // if (props.totalGlobalMem < 3*ARRAY_SIZE*sizeof(T)) + // throw std::runtime_error("Device does not have enough memory for all 3 buffers"); + + // // 
Create device buffers + // hipMalloc(&d_a, ARRAY_SIZE*sizeof(T)); + // check_error(); + // hipMalloc(&d_b, ARRAY_SIZE*sizeof(T)); + // check_error(); + // hipMalloc(&d_c, ARRAY_SIZE*sizeof(T)); + // check_error(); + + +} + + +template +HCStream::~HCStream() +{ +} + +template +void HCStream::write_arrays(const std::vector& a, const std::vector& b, const std::vector& c) +{ + hc::copy(a.cbegin(),a.cend(),d_a); + hc::copy(b.cbegin(),b.cend(),d_b); + hc::copy(c.cbegin(),c.cend(),d_c); +} + +template +void HCStream::read_arrays(std::vector& a, std::vector& b, std::vector& c) +{ + // Copy device memory to host + hc::copy(d_a,a.begin()); + hc::copy(d_b,b.begin()); + hc::copy(d_c,c.begin()); +} + + +template +void HCStream::copy() +{ + try{ + // launch a GPU kernel to compute the saxpy in parallel + hc::completion_future future_kernel = hc::parallel_for_each(hc::extent<1>(array_size) + , [&](hc::index<1> i) [[hc]] { + d_c[i] = d_a[i]; + }); + future_kernel.wait(); + } + catch(std::exception& e){ + std::cout << e.what() << std::endl; + throw; + } +} + +template +void HCStream::mul() +{ + const T scalar = 0.3; + try{ + // launch a GPU kernel to compute the saxpy in parallel + hc::completion_future future_kernel = hc::parallel_for_each(hc::extent<1>(array_size) + , [&](hc::index<1> i) [[hc]] { + d_b[i] = scalar*d_c[i]; + }); + future_kernel.wait(); + } + catch(std::exception& e){ + std::cout << e.what() << std::endl; + throw; + } +} + +template +void HCStream::add() +{ + try{ + // launch a GPU kernel to compute the saxpy in parallel + hc::completion_future future_kernel = hc::parallel_for_each(hc::extent<1>(array_size) + , [&](hc::index<1> i) [[hc]] { + d_c[i] = d_a[i]+d_b[i]; + }); + future_kernel.wait(); + } + catch(std::exception& e){ + std::cout << e.what() << std::endl; + throw; + } +} + +template +void HCStream::triad() +{ + const T scalar = 0.3; + try{ + // launch a GPU kernel to compute the saxpy in parallel + hc::completion_future future_kernel = 
hc::parallel_for_each(hc::extent<1>(array_size) + , [&](hc::index<1> i) [[hc]] { + d_a[i] = d_b[i] + scalar*d_c[i]; + }); + future_kernel.wait(); + } + catch(std::exception& e){ + std::cout << e.what() << std::endl; + throw; + } +} + +template class HCStream; +template class HCStream; diff --git a/HCStream.h b/HCStream.h new file mode 100644 index 0000000..a625be0 --- /dev/null +++ b/HCStream.h @@ -0,0 +1,44 @@ + +// Copyright (c) 2015-16 Tom Deakin, Simon McIntosh-Smith, +// University of Bristol HPC +// +// For full license terms please see the LICENSE file distributed with this +// source code + +#pragma once + +#include +#include +#include + +#include "Stream.h" +#include "hc.hpp" + +#define IMPLEMENTATION_STRING "HC" + +template +class HCStream : public Stream +{ +protected: + // Size of arrays + unsigned int array_size; + // Device side pointers to arrays + hc::array d_a; + hc::array d_b; + hc::array d_c; + + +public: + + HCStream(const unsigned int, const int); + ~HCStream(); + + virtual void copy() override; + virtual void add() override; + virtual void mul() override; + virtual void triad() override; + + virtual void write_arrays(const std::vector& a, const std::vector& b, const std::vector& c) override; + virtual void read_arrays(std::vector& a, std::vector& b, std::vector& c) override; + +}; diff --git a/main.cpp b/main.cpp index 6a15aa7..50ab648 100644 --- a/main.cpp +++ b/main.cpp @@ -22,6 +22,8 @@ #include "CUDAStream.h" #elif defined(HIP) #include "HIPStream.h" +#elif defined(HC) +#include "HCStream.h" #elif defined(OCL) #include "OCLStream.h" #elif defined(USE_RAJA) @@ -105,6 +107,10 @@ void run() // Use the HIP implementation stream = new HIPStream(ARRAY_SIZE, deviceIndex); +#elif defined(HC) + // Use the HC implementation + stream = new HCStream(ARRAY_SIZE, deviceIndex); + #elif defined(OCL) // Use the OpenCL implementation stream = new OCLStream(ARRAY_SIZE, deviceIndex);