From fdeb20601fc33d76383b2f833cb902f80225d025 Mon Sep 17 00:00:00 2001 From: sunway513 Date: Mon, 14 Mar 2016 11:44:30 -0500 Subject: [PATCH] Pull request for HIP version --- Makefile | 18 +- README.md | 7 + hip-stream.cpp | 398 ++++++++++++++++++++++++++++ results/cuda/nvidia-gtx-titan_x.txt | 15 ++ results/hip/amd-fiji-nano.txt | 15 ++ results/hip/nvidia-gtx-titan_x.txt | 22 ++ 6 files changed, 473 insertions(+), 2 deletions(-) create mode 100644 hip-stream.cpp create mode 100644 results/cuda/nvidia-gtx-titan_x.txt create mode 100644 results/hip/amd-fiji-nano.txt create mode 100644 results/hip/nvidia-gtx-titan_x.txt diff --git a/Makefile b/Makefile index 4fb5f7a..c2bc9b2 100644 --- a/Makefile +++ b/Makefile @@ -6,7 +6,8 @@ ifeq ($(PLATFORM), Darwin) LDLIBS = -framework OpenCL endif -all: gpu-stream-ocl gpu-stream-cuda +all: gpu-stream-ocl gpu-stream-cuda gpu-stream-hip + gpu-stream-ocl: ocl-stream.cpp common.o Makefile $(CXX) $(CXXFLAGS) -Wno-deprecated-declarations common.o $< -o $@ $(LDLIBS) @@ -19,9 +20,22 @@ ifeq ($(shell which nvcc > /dev/null; echo $$?), 0) else $(error "Cannot find nvcc, please install CUDA toolkit") endif +HIP_PATH?=../../.. +HIPCC=$(HIP_PATH)/bin/hipcc + +hip-stream.o : hip-stream.cpp + $(HIPCC) $(CXXFLAGS) -c $< -o $@ + +gpu-stream-hip: hip-stream.o common.o Makefile +ifeq ($(shell which $(HIPCC) > /dev/null; echo $$?), 0) + $(HIPCC) $(CXXFLAGS) common.o $< -lm -o $@ +else + $(error "Cannot find $(HIPCC), please install HIP toolkit") +endif + .PHONY: clean clean: - rm -f gpu-stream-ocl gpu-stream-cuda *.o + rm -f gpu-stream-ocl gpu-stream-cuda gpu-stream-hip *.o diff --git a/README.md b/README.md index 61907e1..a2f6ab6 100644 --- a/README.md +++ b/README.md @@ -13,6 +13,13 @@ Build the OpenCL and CUDA binaries with `make` (CUDA version requires CUDA >= v6 Run the OpenCL version with `./gpu-stream-ocl` and the CUDA version with `./gpu-stream-cuda` +For HIP version, follow the instructions on the following blog to properly install ROCK and ROCR drivers: +http://gpuopen.com/getting-started-with-boltzmann-components-platforms-installation/ +Clone from the HIP repository in the following link: +https://github.com/GPUOpen-ProfessionalCompute-Tools/HIP + +Build the HIP binaries with make gpu-stream-hip, run it with './gpu-stream-hip' + Android ------- diff --git a/hip-stream.cpp b/hip-stream.cpp new file mode 100644 index 0000000..ddbcae4 --- /dev/null +++ b/hip-stream.cpp @@ -0,0 +1,398 @@ +#include "hip_runtime.h" +/*============================================================================= +*------------------------------------------------------------------------------ +* Copyright 2015: Tom Deakin, Simon McIntosh-Smith, University of Bristol HPC +* Based on John D. McCalpin’s original STREAM benchmark for CPUs +*------------------------------------------------------------------------------ +* License: +* 1. You are free to use this program and/or to redistribute +* this program. +* 2. You are free to modify this program for your own use, +* including commercial use, subject to the publication +* restrictions in item 3. +* 3. You are free to publish results obtained from running this +* program, or from works that you derive from this program, +* with the following limitations: +* 3a. In order to be referred to as "GPU-STREAM benchmark results", +* published results must be in conformance to the GPU-STREAM +* Run Rules published at +* http://github.com/UoB-HPC/GPU-STREAM/wiki/Run-Rules +* and incorporated herein by reference. +* The copyright holders retain the +* right to determine conformity with the Run Rules. +* 3b. Results based on modified source code or on runs not in +* accordance with the GPU-STREAM Run Rules must be clearly +* labelled whenever they are published. Examples of +* proper labelling include: +* "tuned GPU-STREAM benchmark results" +* "based on a variant of the GPU-STREAM benchmark code" +* Other comparable, clear and reasonable labelling is +* acceptable. +* 3c. Submission of results to the GPU-STREAM benchmark web site +* is encouraged, but not required. +* 4. Use of this program or creation of derived works based on this +* program constitutes acceptance of these licensing restrictions. +* 5. Absolutely no warranty is expressed or implied. +*———————————————————————————————————-----------------------------------------*/ + + +#include +#include +#include +#include +#include +#include + +#include +#include "common.h" + +std::string getDeviceName(int device); +int getDriver(void); + +// Code to check CUDA errors +void check_cuda_error(void) +{ + hipError_t err = hipGetLastError(); + if (err != hipSuccess) + { + std::cerr + << "Error: " + << hipGetErrorString(err) + << std::endl; + exit(err); + } +} + +template +__global__ void copy(hipLaunchParm lp, const T * a, T * c) +{ + const int i = hipBlockDim_x * hipBlockIdx_x + hipThreadIdx_x; + c[i] = a[i]; +} + +template +__global__ void mul(hipLaunchParm lp, T * b, const T * c) +{ + const T scalar = 3.0; + const int i = hipBlockDim_x * hipBlockIdx_x + hipThreadIdx_x; + b[i] = scalar * c[i]; +} + +template +__global__ void add(hipLaunchParm lp, const T * a, const T * b, T * c) +{ + const int i = hipBlockDim_x * hipBlockIdx_x + hipThreadIdx_x; + c[i] = a[i] + b[i]; +} + +template +__global__ void triad(hipLaunchParm lp, T * a, const T * b, const T * c) +{ + const T scalar = 3.0; + const int i = hipBlockDim_x * hipBlockIdx_x + hipThreadIdx_x; + a[i] = b[i] + scalar * c[i]; +} + +int main(int argc, char *argv[]) +{ + + // Print out run information + std::cout + << "GPU-STREAM" << std::endl + << "Version: " << VERSION_STRING << std::endl + << "Implementation: CUDA" << std::endl; + + parseArguments(argc, argv); + + if (NTIMES < 2) + throw std::runtime_error("Chosen number of times is invalid, must be >= 2"); + + std::cout << "Precision: "; + if (useFloat) std::cout << "float"; + else std::cout << "double"; + std::cout << std::endl << std::endl; + + std::cout << "Running kernels " << NTIMES << " times" << std::endl; + + if (ARRAY_SIZE % 1024 != 0) + { + unsigned int OLD_ARRAY_SIZE = ARRAY_SIZE; + ARRAY_SIZE -= ARRAY_SIZE % 1024; + std::cout + << "Warning: array size must divide 1024" << std::endl + << "Resizing array from " << OLD_ARRAY_SIZE + << " to " << ARRAY_SIZE << std::endl; + if (ARRAY_SIZE == 0) + throw std::runtime_error("Array size must be >= 1024"); + } + + // Get precision (used to reset later) + std::streamsize ss = std::cout.precision(); + + size_t DATATYPE_SIZE; + + if (useFloat) + { + DATATYPE_SIZE = sizeof(float); + } + else + { + DATATYPE_SIZE = sizeof(double); + } + + // Display number of bytes in array + std::cout << std::setprecision(1) << std::fixed + << "Array size: " << ARRAY_SIZE*DATATYPE_SIZE/1024.0/1024.0 << " MB" + << " (=" << ARRAY_SIZE*DATATYPE_SIZE/1024.0/1024.0/1024.0 << " GB)" + << std::endl; + std::cout << "Total size: " << 3.0*ARRAY_SIZE*DATATYPE_SIZE/1024.0/1024.0 << " MB" + << " (=" << 3.0*ARRAY_SIZE*DATATYPE_SIZE/1024.0/1024.0/1024.0 << " GB)" + << std::endl; + + // Reset precision + std::cout.precision(ss); + + // Check device index is in range + int count; + hipGetDeviceCount(&count); + check_cuda_error(); + if (deviceIndex >= count) + throw std::runtime_error("Chosen device index is invalid"); + hipSetDevice(deviceIndex); + check_cuda_error(); + + // Print out device name + std::cout << "Using CUDA device " << getDeviceName(deviceIndex) << std::endl; + + // Print out device CUDA driver version + std::cout << "Driver: " << getDriver() << std::endl; + + // Check buffers fit on the device + hipDeviceProp_t props; + hipGetDeviceProperties(&props, deviceIndex); + if (props.totalGlobalMem < 3*DATATYPE_SIZE*ARRAY_SIZE) + throw std::runtime_error("Device does not have enough memory for all 3 buffers"); + + // Create host vectors + void * h_a = malloc(ARRAY_SIZE*DATATYPE_SIZE); + void * h_b = malloc(ARRAY_SIZE*DATATYPE_SIZE); + void * h_c = malloc(ARRAY_SIZE*DATATYPE_SIZE); + + // Initilise arrays + for (unsigned int i = 0; i < ARRAY_SIZE; i++) + { + if (useFloat) + { + ((float*)h_a)[i] = 1.0f; + ((float*)h_b)[i] = 2.0f; + ((float*)h_c)[i] = 0.0f; + } + else + { + ((double*)h_a)[i] = 1.0; + ((double*)h_b)[i] = 2.0; + ((double*)h_c)[i] = 0.0; + } + } + + // Create device buffers + void * d_a, * d_b, *d_c; + hipMalloc(&d_a, ARRAY_SIZE*DATATYPE_SIZE); + check_cuda_error(); + hipMalloc(&d_b, ARRAY_SIZE*DATATYPE_SIZE); + check_cuda_error(); + hipMalloc(&d_c, ARRAY_SIZE*DATATYPE_SIZE); + check_cuda_error(); + + // Copy host memory to device + hipMemcpy(d_a, h_a, ARRAY_SIZE*DATATYPE_SIZE, hipMemcpyHostToDevice); + check_cuda_error(); + hipMemcpy(d_b, h_b, ARRAY_SIZE*DATATYPE_SIZE, hipMemcpyHostToDevice); + check_cuda_error(); + hipMemcpy(d_c, h_c, ARRAY_SIZE*DATATYPE_SIZE, hipMemcpyHostToDevice); + check_cuda_error(); + + // Make sure the copies are finished + hipDeviceSynchronize(); + check_cuda_error(); + + // List of times + std::vector< std::vector > timings; + + // Declare timers + std::chrono::high_resolution_clock::time_point t1, t2; + + // Main loop + for (unsigned int k = 0; k < NTIMES; k++) + { + std::vector times; + t1 = std::chrono::high_resolution_clock::now(); + if (useFloat) + hipLaunchKernel(HIP_KERNEL_NAME(copy), dim3(ARRAY_SIZE/1024), dim3(1024), 0, 0, (float*)d_a, (float*)d_c); + else + hipLaunchKernel(HIP_KERNEL_NAME(copy), dim3(ARRAY_SIZE/1024), dim3(1024), 0, 0, (double*)d_a, (double*)d_c); + check_cuda_error(); + hipDeviceSynchronize(); + check_cuda_error(); + t2 = std::chrono::high_resolution_clock::now(); + times.push_back(std::chrono::duration_cast >(t2 - t1).count()); + + + t1 = std::chrono::high_resolution_clock::now(); + if (useFloat) + hipLaunchKernel(HIP_KERNEL_NAME(mul), dim3(ARRAY_SIZE/1024), dim3(1024), 0, 0, (float*)d_b, (float*)d_c); + else + hipLaunchKernel(HIP_KERNEL_NAME(mul), dim3(ARRAY_SIZE/1024), dim3(1024), 0, 0, (double*)d_b, (double*)d_c); + check_cuda_error(); + hipDeviceSynchronize(); + check_cuda_error(); + t2 = std::chrono::high_resolution_clock::now(); + times.push_back(std::chrono::duration_cast >(t2 - t1).count()); + + + t1 = std::chrono::high_resolution_clock::now(); + if (useFloat) + hipLaunchKernel(HIP_KERNEL_NAME(add), dim3(ARRAY_SIZE/1024), dim3(1024), 0, 0, (float*)d_a, (float*)d_b, (float*)d_c); + else + hipLaunchKernel(HIP_KERNEL_NAME(add), dim3(ARRAY_SIZE/1024), dim3(1024), 0, 0, (double*)d_a, (double*)d_b, (double*)d_c); + check_cuda_error(); + hipDeviceSynchronize(); + check_cuda_error(); + t2 = std::chrono::high_resolution_clock::now(); + times.push_back(std::chrono::duration_cast >(t2 - t1).count()); + + + t1 = std::chrono::high_resolution_clock::now(); + if (useFloat) + hipLaunchKernel(HIP_KERNEL_NAME(triad), dim3(ARRAY_SIZE/1024), dim3(1024), 0, 0, (float*)d_a, (float*)d_b, (float*)d_c); + else + hipLaunchKernel(HIP_KERNEL_NAME(triad), dim3(ARRAY_SIZE/1024), dim3(1024), 0, 0, (double*)d_a, (double*)d_b, (double*)d_c); + check_cuda_error(); + hipDeviceSynchronize(); + check_cuda_error(); + t2 = std::chrono::high_resolution_clock::now(); + times.push_back(std::chrono::duration_cast >(t2 - t1).count()); + + timings.push_back(times); + + } + + // Check solutions + hipMemcpy(h_a, d_a, ARRAY_SIZE*DATATYPE_SIZE, hipMemcpyDeviceToHost); + check_cuda_error(); + hipMemcpy(h_b, d_b, ARRAY_SIZE*DATATYPE_SIZE, hipMemcpyDeviceToHost); + check_cuda_error(); + hipMemcpy(h_c, d_c, ARRAY_SIZE*DATATYPE_SIZE, hipMemcpyDeviceToHost); + check_cuda_error(); + + if (useFloat) + { + check_solution(h_a, h_b, h_c); + } + else + { + check_solution(h_a, h_b, h_c); + } + + // Crunch results + size_t sizes[4] = { + 2 * DATATYPE_SIZE * ARRAY_SIZE, + 2 * DATATYPE_SIZE * ARRAY_SIZE, + 3 * DATATYPE_SIZE * ARRAY_SIZE, + 3 * DATATYPE_SIZE * ARRAY_SIZE + }; + double min[4] = {DBL_MAX, DBL_MAX, DBL_MAX, DBL_MAX}; + double max[4] = {0.0, 0.0, 0.0, 0.0}; + double avg[4] = {0.0, 0.0, 0.0, 0.0}; + + // Ignore first result + for (unsigned int i = 1; i < NTIMES; i++) + { + for (int j = 0; j < 4; j++) + { + avg[j] += timings[i][j]; + min[j] = std::min(min[j], timings[i][j]); + max[j] = std::max(max[j], timings[i][j]); + } + } + + for (int j = 0; j < 4; j++) + avg[j] /= (double)(NTIMES-1); + + // Display results + std::string labels[] = {"Copy", "Mul", "Add", "Triad"}; + std::cout + << std::left << std::setw(12) << "Function" + << std::left << std::setw(12) << "MBytes/sec" + << std::left << std::setw(12) << "Min (sec)" + << std::left << std::setw(12) << "Max" + << std::left << std::setw(12) << "Average" + << std::endl; + + for (int j = 0; j < 4; j++) + { + std::cout + << std::left << std::setw(12) << labels[j] + << std::left << std::setw(12) << std::setprecision(3) << 1.0E-06 * sizes[j]/min[j] + << std::left << std::setw(12) << std::setprecision(5) << min[j] + << std::left << std::setw(12) << std::setprecision(5) << max[j] + << std::left << std::setw(12) << std::setprecision(5) << avg[j] + << std::endl; + } + + // Free host vectors + free(h_a); + free(h_b); + free(h_c); + + // Free cuda buffers + hipFree(d_a); + check_cuda_error(); + hipFree(d_b); + check_cuda_error(); + hipFree(d_c); + check_cuda_error(); + +} + +std::string getDeviceName(int device) +{ + struct hipDeviceProp_t prop; + hipGetDeviceProperties(&prop, device); + check_cuda_error(); + return std::string(prop.name); +} + +int getDriver(void) +{ + int driver; + hipDriverGetVersion(&driver); + check_cuda_error(); + return driver; +} + +void listDevices(void) +{ + // Get number of devices + int count; + hipGetDeviceCount(&count); + check_cuda_error(); + + // Print device names + if (count == 0) + { + std::cout << "No devices found." << std::endl; + } + else + { + std::cout << std::endl; + std::cout << "Devices:" << std::endl; + for (int i = 0; i < count; i++) + { + std::cout << i << ": " << getDeviceName(i) << std::endl; + check_cuda_error(); + } + std::cout << std::endl; + } +} + diff --git a/results/cuda/nvidia-gtx-titan_x.txt b/results/cuda/nvidia-gtx-titan_x.txt new file mode 100644 index 0000000..2d3b004 --- /dev/null +++ b/results/cuda/nvidia-gtx-titan_x.txt @@ -0,0 +1,15 @@ +GPU-STREAM +Version: 1.0 +Implementation: CUDA +Precision: double + +Running kernels 10 times +Array size: 400.0 MB (=0.4 GB) +Total size: 1200.0 MB (=1.2 GB) +Using CUDA device GeForce GTX TITAN X +Driver: 7050 +Function MBytes/sec Min (sec) Max Average +Copy 263155.587 0.00319 0.00319 0.00319 +Mul 262943.430 0.00319 0.00319 0.00319 +Add 268710.444 0.00468 0.00469 0.00469 +Triad 268957.305 0.00468 0.00469 0.00468 diff --git a/results/hip/amd-fiji-nano.txt b/results/hip/amd-fiji-nano.txt new file mode 100644 index 0000000..df40ac8 --- /dev/null +++ b/results/hip/amd-fiji-nano.txt @@ -0,0 +1,15 @@ +GPU-STREAM +Version: 1.0 +Implementation: CUDA +Precision: double + +Running kernels 10 times +Array size: 400.0 MB (=0.4 GB) +Total size: 1200.0 MB (=1.2 GB) +Using CUDA device Fiji +Driver: 4 +Function MBytes/sec Min (sec) Max Average +Copy 375822.410 0.00223 0.00225 0.00224 +Mul 375086.879 0.00224 0.00227 0.00224 +Add 425650.718 0.00296 0.00298 0.00297 +Triad 424710.113 0.00296 0.00298 0.00298 diff --git a/results/hip/nvidia-gtx-titan_x.txt b/results/hip/nvidia-gtx-titan_x.txt new file mode 100644 index 0000000..6bb94a3 --- /dev/null +++ b/results/hip/nvidia-gtx-titan_x.txt @@ -0,0 +1,22 @@ +GPU-STREAM +Version: 1.0 +Implementation: HIP +GridSize: 52428800 work-items +GroupSize: 1024 work-items +Operations/Work-item: 1 +Precision: double + +Running kernels 10 times +Array size: 400.0 MB (=0.4 GB) 0 bytes padding +Total size: 1200.0 MB (=1.2 GB) +Using HIP device GeForce GTX TITAN X (compute_units=24) +Driver: 4 +d_a=0x1306d80000 +d_b=0x131fd80000 +d_c=0x1338d80000 +Function MBytes/sec Min (sec) Max Average +Copy 263042.207 0.00319 0.00320 0.00319 +Mul 262972.033 0.00319 0.00320 0.00319 +Add 268732.653 0.00468 0.00469 0.00469 +Triad 268706.197 0.00468 0.00469 0.00469 +GEOMEAN 265847.929