Add support of HIP version of GPU-STREAM.

This commit was tested with HIP developer preview branch.
This commit is contained in:
pensun 2016-09-05 23:41:01 -05:00
parent 21f0bcfde4
commit a1f9d9ece7
4 changed files with 286 additions and 0 deletions

View File

@ -1,6 +1,16 @@
cmake_minimum_required(VERSION 3.2) cmake_minimum_required(VERSION 3.2)
if(NOT DEFINED HIP_PATH)
if(NOT DEFINED ENV{HIP_PATH})
set(HIP_PATH "/opt/rocm/hip" CACHE PATH "Path to which HIP has been installed")
else()
set(HIP_PATH $ENV{HIP_PATH} CACHE PATH "Path to which HIP has been installed")
endif()
endif()
set(CMAKE_MODULE_PATH "${HIP_PATH}/cmake" ${CMAKE_MODULE_PATH})
set(CMAKE_CXX_STANDARD 11) set(CMAKE_CXX_STANDARD 11)
set(CMAKE_CXX_STANDARD_REQUIRED ON) set(CMAKE_CXX_STANDARD_REQUIRED ON)
@ -27,6 +37,18 @@ if ("${CMAKE_CXX_COMPILER_ID}" MATCHES "Cray")
list(APPEND CMAKE_CXX_FLAGS -hstd=c++11) list(APPEND CMAKE_CXX_FLAGS -hstd=c++11)
endif () endif ()
#-------------------------------------------------------------------------------
# HIP
#-------------------------------------------------------------------------------
find_package(HIP QUIET)
if(${HIP_FOUND})
list(APPEND HIP_HIPCC_FLAGS --std=c++11)
hip_add_executable(gpu-stream-hip main.cpp HIPStream.cu)
target_compile_definitions(gpu-stream-hip PUBLIC HIP)
else()
message("Skipping HIP...")
endif()
#------------------------------------------------------------------------------- #-------------------------------------------------------------------------------
# CUDA # CUDA
#------------------------------------------------------------------------------- #-------------------------------------------------------------------------------

215
HIPStream.cu Normal file
View File

@ -0,0 +1,215 @@
#include "hip/hip_runtime.h"
// Copyright (c) 2015-16 Tom Deakin, Simon McIntosh-Smith,
// University of Bristol HPC
//
// For full license terms please see the LICENSE file distributed with this
// source code
#include "HIPStream.h"
#define TBSIZE 1024
void check_error(void)
{
hipError_t err = hipGetLastError();
if (err != hipSuccess)
{
std::cerr << "Error: " << hipGetErrorString(err) << std::endl;
exit(err);
}
}
template <class T>
HIPStream<T>::HIPStream(const unsigned int ARRAY_SIZE, const int device_index)
{
// The array size must be divisible by TBSIZE for kernel launches
if (ARRAY_SIZE % TBSIZE != 0)
{
std::stringstream ss;
ss << "Array size must be a multiple of " << TBSIZE;
throw std::runtime_error(ss.str());
}
// Set device
int count;
hipGetDeviceCount(&count);
check_error();
if (device_index >= count)
throw std::runtime_error("Invalid device index");
hipSetDevice(device_index);
check_error();
// Print out device information
std::cout << "Using HIP device " << getDeviceName(device_index) << std::endl;
std::cout << "Driver: " << getDeviceDriver(device_index) << std::endl;
array_size = ARRAY_SIZE;
// Check buffers fit on the device
hipDeviceProp_t props;
hipGetDeviceProperties(&props, 0);
if (props.totalGlobalMem < 3*ARRAY_SIZE*sizeof(T))
throw std::runtime_error("Device does not have enough memory for all 3 buffers");
// Create device buffers
hipMalloc(&d_a, ARRAY_SIZE*sizeof(T));
check_error();
hipMalloc(&d_b, ARRAY_SIZE*sizeof(T));
check_error();
hipMalloc(&d_c, ARRAY_SIZE*sizeof(T));
check_error();
}
template <class T>
HIPStream<T>::~HIPStream()
{
hipFree(d_a);
check_error();
hipFree(d_b);
check_error();
hipFree(d_c);
check_error();
}
template <class T>
void HIPStream<T>::write_arrays(const std::vector<T>& a, const std::vector<T>& b, const std::vector<T>& c)
{
// Copy host memory to device
hipMemcpy(d_a, a.data(), a.size()*sizeof(T), hipMemcpyHostToDevice);
check_error();
hipMemcpy(d_b, b.data(), b.size()*sizeof(T), hipMemcpyHostToDevice);
check_error();
hipMemcpy(d_c, c.data(), c.size()*sizeof(T), hipMemcpyHostToDevice);
check_error();
}
template <class T>
void HIPStream<T>::read_arrays(std::vector<T>& a, std::vector<T>& b, std::vector<T>& c)
{
// Copy device memory to host
hipMemcpy(a.data(), d_a, a.size()*sizeof(T), hipMemcpyDeviceToHost);
check_error();
hipMemcpy(b.data(), d_b, b.size()*sizeof(T), hipMemcpyDeviceToHost);
check_error();
hipMemcpy(c.data(), d_c, c.size()*sizeof(T), hipMemcpyDeviceToHost);
check_error();
}
template <typename T>
__global__ void copy_kernel(hipLaunchParm lp, const T * a, T * c)
{
const int i = hipBlockDim_x * hipBlockIdx_x + hipThreadIdx_x;
c[i] = a[i];
}
template <class T>
void HIPStream<T>::copy()
{
hipLaunchKernel(HIP_KERNEL_NAME(copy_kernel), dim3(array_size/TBSIZE), dim3(TBSIZE), 0, 0, d_a, d_c);
check_error();
hipDeviceSynchronize();
check_error();
}
template <typename T>
__global__ void mul_kernel(hipLaunchParm lp, T * b, const T * c)
{
const T scalar = 0.3;
const int i = hipBlockDim_x * hipBlockIdx_x + hipThreadIdx_x;
b[i] = scalar * c[i];
}
template <class T>
void HIPStream<T>::mul()
{
hipLaunchKernel(HIP_KERNEL_NAME(mul_kernel), dim3(array_size/TBSIZE), dim3(TBSIZE), 0, 0, d_b, d_c);
check_error();
hipDeviceSynchronize();
check_error();
}
template <typename T>
__global__ void add_kernel(hipLaunchParm lp, const T * a, const T * b, T * c)
{
const int i = hipBlockDim_x * hipBlockIdx_x + hipThreadIdx_x;
c[i] = a[i] + b[i];
}
template <class T>
void HIPStream<T>::add()
{
hipLaunchKernel(HIP_KERNEL_NAME(add_kernel), dim3(array_size/TBSIZE), dim3(TBSIZE), 0, 0, d_a, d_b, d_c);
check_error();
hipDeviceSynchronize();
check_error();
}
template <typename T>
__global__ void triad_kernel(hipLaunchParm lp, T * a, const T * b, const T * c)
{
const T scalar = 0.3;
const int i = hipBlockDim_x * hipBlockIdx_x + hipThreadIdx_x;
a[i] = b[i] + scalar * c[i];
}
template <class T>
void HIPStream<T>::triad()
{
hipLaunchKernel(HIP_KERNEL_NAME(triad_kernel), dim3(array_size/TBSIZE), dim3(TBSIZE), 0, 0, d_a, d_b, d_c);
check_error();
hipDeviceSynchronize();
check_error();
}
void listDevices(void)
{
// Get number of devices
int count;
hipGetDeviceCount(&count);
check_error();
// Print device names
if (count == 0)
{
std::cerr << "No devices found." << std::endl;
}
else
{
std::cout << std::endl;
std::cout << "Devices:" << std::endl;
for (int i = 0; i < count; i++)
{
std::cout << i << ": " << getDeviceName(i) << std::endl;
}
std::cout << std::endl;
}
}
std::string getDeviceName(const int device)
{
hipDeviceProp_t props;
hipGetDeviceProperties(&props, device);
check_error();
return std::string(props.name);
}
std::string getDeviceDriver(const int device)
{
hipSetDevice(device);
check_error();
int driver;
hipDriverGetVersion(&driver);
check_error();
return std::to_string(driver);
}
template class HIPStream<float>;
template class HIPStream<double>;

43
HIPStream.h Normal file
View File

@ -0,0 +1,43 @@
// Copyright (c) 2015-16 Tom Deakin, Simon McIntosh-Smith,
// University of Bristol HPC
//
// For full license terms please see the LICENSE file distributed with this
// source code
#pragma once
#include <iostream>
#include <stdexcept>
#include <sstream>
#include "Stream.h"
#define IMPLEMENTATION_STRING "HIP"
template <class T>
class HIPStream : public Stream<T>
{
protected:
// Size of arrays
unsigned int array_size;
// Device side pointers to arrays
T *d_a;
T *d_b;
T *d_c;
public:
HIPStream(const unsigned int, const int);
~HIPStream();
virtual void copy() override;
virtual void add() override;
virtual void mul() override;
virtual void triad() override;
virtual void write_arrays(const std::vector<T>& a, const std::vector<T>& b, const std::vector<T>& c) override;
virtual void read_arrays(std::vector<T>& a, std::vector<T>& b, std::vector<T>& c) override;
};

View File

@ -20,6 +20,8 @@
#if defined(CUDA) #if defined(CUDA)
#include "CUDAStream.h" #include "CUDAStream.h"
#elif defined(HIP)
#include "HIPStream.h"
#elif defined(OCL) #elif defined(OCL)
#include "OCLStream.h" #include "OCLStream.h"
#elif defined(USE_RAJA) #elif defined(USE_RAJA)
@ -100,6 +102,10 @@ void run()
// Use the CUDA implementation // Use the CUDA implementation
stream = new CUDAStream<T>(ARRAY_SIZE, deviceIndex); stream = new CUDAStream<T>(ARRAY_SIZE, deviceIndex);
#elif defined(HIP)
// Use the HIP implementation
stream = new HIPStream<T>(ARRAY_SIZE, deviceIndex);
#elif defined(OCL) #elif defined(OCL)
// Use the OpenCL implementation // Use the OpenCL implementation
stream = new OCLStream<T>(ARRAY_SIZE, deviceIndex); stream = new OCLStream<T>(ARRAY_SIZE, deviceIndex);