From fdeb20601fc33d76383b2f833cb902f80225d025 Mon Sep 17 00:00:00 2001 From: sunway513 Date: Mon, 14 Mar 2016 11:44:30 -0500 Subject: [PATCH 01/10] Pull request for HIP version --- Makefile | 18 +- README.md | 7 + hip-stream.cpp | 398 ++++++++++++++++++++++++++++ results/cuda/nvidia-gtx-titan_x.txt | 15 ++ results/hip/amd-fiji-nano.txt | 15 ++ results/hip/nvidia-gtx-titan_x.txt | 22 ++ 6 files changed, 473 insertions(+), 2 deletions(-) create mode 100644 hip-stream.cpp create mode 100644 results/cuda/nvidia-gtx-titan_x.txt create mode 100644 results/hip/amd-fiji-nano.txt create mode 100644 results/hip/nvidia-gtx-titan_x.txt diff --git a/Makefile b/Makefile index 4fb5f7a..c2bc9b2 100644 --- a/Makefile +++ b/Makefile @@ -6,7 +6,8 @@ ifeq ($(PLATFORM), Darwin) LDLIBS = -framework OpenCL endif -all: gpu-stream-ocl gpu-stream-cuda +all: gpu-stream-ocl gpu-stream-cuda gpu-stream-hip + gpu-stream-ocl: ocl-stream.cpp common.o Makefile $(CXX) $(CXXFLAGS) -Wno-deprecated-declarations common.o $< -o $@ $(LDLIBS) @@ -19,9 +20,22 @@ ifeq ($(shell which nvcc > /dev/null; echo $$?), 0) else $(error "Cannot find nvcc, please install CUDA toolkit") endif +HIP_PATH?=../../.. +HIPCC=$(HIP_PATH)/bin/hipcc + +hip-stream.o : hip-stream.cpp + $(HIPCC) $(CXXFLAGS) -c $< -o $@ + +gpu-stream-hip: hip-stream.o common.o Makefile +ifeq ($(shell which $(HIPCC) > /dev/null; echo $$?), 0) + $(HIPCC) $(CXXFLAGS) common.o $< -lm -o $@ +else + $(error "Cannot find $(HIPCC), please install HIP toolkit") +endif + .PHONY: clean clean: - rm -f gpu-stream-ocl gpu-stream-cuda *.o + rm -f gpu-stream-ocl gpu-stream-cuda gpu-stream-hip *.o diff --git a/README.md b/README.md index 61907e1..a2f6ab6 100644 --- a/README.md +++ b/README.md @@ -13,6 +13,13 @@ Build the OpenCL and CUDA binaries with `make` (CUDA version requires CUDA >= v6 Run the OpenCL version with `./gpu-stream-ocl` and the CUDA version with `./gpu-stream-cuda` +For HIP version, follow the instructions on the following blog to properly install ROCK and ROCR drivers: +http://gpuopen.com/getting-started-with-boltzmann-components-platforms-installation/ +Clone from the HIP repository in the following link: +https://github.com/GPUOpen-ProfessionalCompute-Tools/HIP + +Build the HIP binaries with make gpu-stream-hip, run it with './gpu-stream-hip' + Android ------- diff --git a/hip-stream.cpp b/hip-stream.cpp new file mode 100644 index 0000000..ddbcae4 --- /dev/null +++ b/hip-stream.cpp @@ -0,0 +1,398 @@ +#include "hip_runtime.h" +/*============================================================================= +*------------------------------------------------------------------------------ +* Copyright 2015: Tom Deakin, Simon McIntosh-Smith, University of Bristol HPC +* Based on John D. McCalpin’s original STREAM benchmark for CPUs +*------------------------------------------------------------------------------ +* License: +* 1. You are free to use this program and/or to redistribute +* this program. +* 2. You are free to modify this program for your own use, +* including commercial use, subject to the publication +* restrictions in item 3. +* 3. You are free to publish results obtained from running this +* program, or from works that you derive from this program, +* with the following limitations: +* 3a. In order to be referred to as "GPU-STREAM benchmark results", +* published results must be in conformance to the GPU-STREAM +* Run Rules published at +* http://github.com/UoB-HPC/GPU-STREAM/wiki/Run-Rules +* and incorporated herein by reference. 
+* The copyright holders retain the +* right to determine conformity with the Run Rules. +* 3b. Results based on modified source code or on runs not in +* accordance with the GPU-STREAM Run Rules must be clearly +* labelled whenever they are published. Examples of +* proper labelling include: +* "tuned GPU-STREAM benchmark results" +* "based on a variant of the GPU-STREAM benchmark code" +* Other comparable, clear and reasonable labelling is +* acceptable. +* 3c. Submission of results to the GPU-STREAM benchmark web site +* is encouraged, but not required. +* 4. Use of this program or creation of derived works based on this +* program constitutes acceptance of these licensing restrictions. +* 5. Absolutely no warranty is expressed or implied. +*———————————————————————————————————-----------------------------------------*/ + + +#include +#include +#include +#include +#include +#include + +#include +#include "common.h" + +std::string getDeviceName(int device); +int getDriver(void); + +// Code to check CUDA errors +void check_cuda_error(void) +{ + hipError_t err = hipGetLastError(); + if (err != hipSuccess) + { + std::cerr + << "Error: " + << hipGetErrorString(err) + << std::endl; + exit(err); + } +} + +template +__global__ void copy(hipLaunchParm lp, const T * a, T * c) +{ + const int i = hipBlockDim_x * hipBlockIdx_x + hipThreadIdx_x; + c[i] = a[i]; +} + +template +__global__ void mul(hipLaunchParm lp, T * b, const T * c) +{ + const T scalar = 3.0; + const int i = hipBlockDim_x * hipBlockIdx_x + hipThreadIdx_x; + b[i] = scalar * c[i]; +} + +template +__global__ void add(hipLaunchParm lp, const T * a, const T * b, T * c) +{ + const int i = hipBlockDim_x * hipBlockIdx_x + hipThreadIdx_x; + c[i] = a[i] + b[i]; +} + +template +__global__ void triad(hipLaunchParm lp, T * a, const T * b, const T * c) +{ + const T scalar = 3.0; + const int i = hipBlockDim_x * hipBlockIdx_x + hipThreadIdx_x; + a[i] = b[i] + scalar * c[i]; +} + +int main(int argc, char *argv[]) +{ + + // Print out run information + std::cout + << "GPU-STREAM" << std::endl + << "Version: " << VERSION_STRING << std::endl + << "Implementation: CUDA" << std::endl; + + parseArguments(argc, argv); + + if (NTIMES < 2) + throw std::runtime_error("Chosen number of times is invalid, must be >= 2"); + + std::cout << "Precision: "; + if (useFloat) std::cout << "float"; + else std::cout << "double"; + std::cout << std::endl << std::endl; + + std::cout << "Running kernels " << NTIMES << " times" << std::endl; + + if (ARRAY_SIZE % 1024 != 0) + { + unsigned int OLD_ARRAY_SIZE = ARRAY_SIZE; + ARRAY_SIZE -= ARRAY_SIZE % 1024; + std::cout + << "Warning: array size must divide 1024" << std::endl + << "Resizing array from " << OLD_ARRAY_SIZE + << " to " << ARRAY_SIZE << std::endl; + if (ARRAY_SIZE == 0) + throw std::runtime_error("Array size must be >= 1024"); + } + + // Get precision (used to reset later) + std::streamsize ss = std::cout.precision(); + + size_t DATATYPE_SIZE; + + if (useFloat) + { + DATATYPE_SIZE = sizeof(float); + } + else + { + DATATYPE_SIZE = sizeof(double); + } + + // Display number of bytes in array + std::cout << std::setprecision(1) << std::fixed + << "Array size: " << ARRAY_SIZE*DATATYPE_SIZE/1024.0/1024.0 << " MB" + << " (=" << ARRAY_SIZE*DATATYPE_SIZE/1024.0/1024.0/1024.0 << " GB)" + << std::endl; + std::cout << "Total size: " << 3.0*ARRAY_SIZE*DATATYPE_SIZE/1024.0/1024.0 << " MB" + << " (=" << 3.0*ARRAY_SIZE*DATATYPE_SIZE/1024.0/1024.0/1024.0 << " GB)" + << std::endl; + + // Reset precision + std::cout.precision(ss); + + // 
Check device index is in range + int count; + hipGetDeviceCount(&count); + check_cuda_error(); + if (deviceIndex >= count) + throw std::runtime_error("Chosen device index is invalid"); + hipSetDevice(deviceIndex); + check_cuda_error(); + + // Print out device name + std::cout << "Using CUDA device " << getDeviceName(deviceIndex) << std::endl; + + // Print out device CUDA driver version + std::cout << "Driver: " << getDriver() << std::endl; + + // Check buffers fit on the device + hipDeviceProp_t props; + hipGetDeviceProperties(&props, deviceIndex); + if (props.totalGlobalMem < 3*DATATYPE_SIZE*ARRAY_SIZE) + throw std::runtime_error("Device does not have enough memory for all 3 buffers"); + + // Create host vectors + void * h_a = malloc(ARRAY_SIZE*DATATYPE_SIZE); + void * h_b = malloc(ARRAY_SIZE*DATATYPE_SIZE); + void * h_c = malloc(ARRAY_SIZE*DATATYPE_SIZE); + + // Initilise arrays + for (unsigned int i = 0; i < ARRAY_SIZE; i++) + { + if (useFloat) + { + ((float*)h_a)[i] = 1.0f; + ((float*)h_b)[i] = 2.0f; + ((float*)h_c)[i] = 0.0f; + } + else + { + ((double*)h_a)[i] = 1.0; + ((double*)h_b)[i] = 2.0; + ((double*)h_c)[i] = 0.0; + } + } + + // Create device buffers + void * d_a, * d_b, *d_c; + hipMalloc(&d_a, ARRAY_SIZE*DATATYPE_SIZE); + check_cuda_error(); + hipMalloc(&d_b, ARRAY_SIZE*DATATYPE_SIZE); + check_cuda_error(); + hipMalloc(&d_c, ARRAY_SIZE*DATATYPE_SIZE); + check_cuda_error(); + + // Copy host memory to device + hipMemcpy(d_a, h_a, ARRAY_SIZE*DATATYPE_SIZE, hipMemcpyHostToDevice); + check_cuda_error(); + hipMemcpy(d_b, h_b, ARRAY_SIZE*DATATYPE_SIZE, hipMemcpyHostToDevice); + check_cuda_error(); + hipMemcpy(d_c, h_c, ARRAY_SIZE*DATATYPE_SIZE, hipMemcpyHostToDevice); + check_cuda_error(); + + // Make sure the copies are finished + hipDeviceSynchronize(); + check_cuda_error(); + + // List of times + std::vector< std::vector > timings; + + // Declare timers + std::chrono::high_resolution_clock::time_point t1, t2; + + // Main loop + for (unsigned int k = 0; k < NTIMES; k++) + { + std::vector times; + t1 = std::chrono::high_resolution_clock::now(); + if (useFloat) + hipLaunchKernel(HIP_KERNEL_NAME(copy), dim3(ARRAY_SIZE/1024), dim3(1024), 0, 0, (float*)d_a, (float*)d_c); + else + hipLaunchKernel(HIP_KERNEL_NAME(copy), dim3(ARRAY_SIZE/1024), dim3(1024), 0, 0, (double*)d_a, (double*)d_c); + check_cuda_error(); + hipDeviceSynchronize(); + check_cuda_error(); + t2 = std::chrono::high_resolution_clock::now(); + times.push_back(std::chrono::duration_cast >(t2 - t1).count()); + + + t1 = std::chrono::high_resolution_clock::now(); + if (useFloat) + hipLaunchKernel(HIP_KERNEL_NAME(mul), dim3(ARRAY_SIZE/1024), dim3(1024), 0, 0, (float*)d_b, (float*)d_c); + else + hipLaunchKernel(HIP_KERNEL_NAME(mul), dim3(ARRAY_SIZE/1024), dim3(1024), 0, 0, (double*)d_b, (double*)d_c); + check_cuda_error(); + hipDeviceSynchronize(); + check_cuda_error(); + t2 = std::chrono::high_resolution_clock::now(); + times.push_back(std::chrono::duration_cast >(t2 - t1).count()); + + + t1 = std::chrono::high_resolution_clock::now(); + if (useFloat) + hipLaunchKernel(HIP_KERNEL_NAME(add), dim3(ARRAY_SIZE/1024), dim3(1024), 0, 0, (float*)d_a, (float*)d_b, (float*)d_c); + else + hipLaunchKernel(HIP_KERNEL_NAME(add), dim3(ARRAY_SIZE/1024), dim3(1024), 0, 0, (double*)d_a, (double*)d_b, (double*)d_c); + check_cuda_error(); + hipDeviceSynchronize(); + check_cuda_error(); + t2 = std::chrono::high_resolution_clock::now(); + times.push_back(std::chrono::duration_cast >(t2 - t1).count()); + + + t1 = 
std::chrono::high_resolution_clock::now(); + if (useFloat) + hipLaunchKernel(HIP_KERNEL_NAME(triad), dim3(ARRAY_SIZE/1024), dim3(1024), 0, 0, (float*)d_a, (float*)d_b, (float*)d_c); + else + hipLaunchKernel(HIP_KERNEL_NAME(triad), dim3(ARRAY_SIZE/1024), dim3(1024), 0, 0, (double*)d_a, (double*)d_b, (double*)d_c); + check_cuda_error(); + hipDeviceSynchronize(); + check_cuda_error(); + t2 = std::chrono::high_resolution_clock::now(); + times.push_back(std::chrono::duration_cast >(t2 - t1).count()); + + timings.push_back(times); + + } + + // Check solutions + hipMemcpy(h_a, d_a, ARRAY_SIZE*DATATYPE_SIZE, hipMemcpyDeviceToHost); + check_cuda_error(); + hipMemcpy(h_b, d_b, ARRAY_SIZE*DATATYPE_SIZE, hipMemcpyDeviceToHost); + check_cuda_error(); + hipMemcpy(h_c, d_c, ARRAY_SIZE*DATATYPE_SIZE, hipMemcpyDeviceToHost); + check_cuda_error(); + + if (useFloat) + { + check_solution(h_a, h_b, h_c); + } + else + { + check_solution(h_a, h_b, h_c); + } + + // Crunch results + size_t sizes[4] = { + 2 * DATATYPE_SIZE * ARRAY_SIZE, + 2 * DATATYPE_SIZE * ARRAY_SIZE, + 3 * DATATYPE_SIZE * ARRAY_SIZE, + 3 * DATATYPE_SIZE * ARRAY_SIZE + }; + double min[4] = {DBL_MAX, DBL_MAX, DBL_MAX, DBL_MAX}; + double max[4] = {0.0, 0.0, 0.0, 0.0}; + double avg[4] = {0.0, 0.0, 0.0, 0.0}; + + // Ignore first result + for (unsigned int i = 1; i < NTIMES; i++) + { + for (int j = 0; j < 4; j++) + { + avg[j] += timings[i][j]; + min[j] = std::min(min[j], timings[i][j]); + max[j] = std::max(max[j], timings[i][j]); + } + } + + for (int j = 0; j < 4; j++) + avg[j] /= (double)(NTIMES-1); + + // Display results + std::string labels[] = {"Copy", "Mul", "Add", "Triad"}; + std::cout + << std::left << std::setw(12) << "Function" + << std::left << std::setw(12) << "MBytes/sec" + << std::left << std::setw(12) << "Min (sec)" + << std::left << std::setw(12) << "Max" + << std::left << std::setw(12) << "Average" + << std::endl; + + for (int j = 0; j < 4; j++) + { + std::cout + << std::left << std::setw(12) << labels[j] + << std::left << std::setw(12) << std::setprecision(3) << 1.0E-06 * sizes[j]/min[j] + << std::left << std::setw(12) << std::setprecision(5) << min[j] + << std::left << std::setw(12) << std::setprecision(5) << max[j] + << std::left << std::setw(12) << std::setprecision(5) << avg[j] + << std::endl; + } + + // Free host vectors + free(h_a); + free(h_b); + free(h_c); + + // Free cuda buffers + hipFree(d_a); + check_cuda_error(); + hipFree(d_b); + check_cuda_error(); + hipFree(d_c); + check_cuda_error(); + +} + +std::string getDeviceName(int device) +{ + struct hipDeviceProp_t prop; + hipGetDeviceProperties(&prop, device); + check_cuda_error(); + return std::string(prop.name); +} + +int getDriver(void) +{ + int driver; + hipDriverGetVersion(&driver); + check_cuda_error(); + return driver; +} + +void listDevices(void) +{ + // Get number of devices + int count; + hipGetDeviceCount(&count); + check_cuda_error(); + + // Print device names + if (count == 0) + { + std::cout << "No devices found." 
<< std::endl;
+  }
+  else
+  {
+    std::cout << std::endl;
+    std::cout << "Devices:" << std::endl;
+    for (int i = 0; i < count; i++)
+    {
+      std::cout << i << ": " << getDeviceName(i) << std::endl;
+      check_cuda_error();
+    }
+    std::cout << std::endl;
+  }
+}
+
diff --git a/results/cuda/nvidia-gtx-titan_x.txt b/results/cuda/nvidia-gtx-titan_x.txt
new file mode 100644
index 0000000..2d3b004
--- /dev/null
+++ b/results/cuda/nvidia-gtx-titan_x.txt
@@ -0,0 +1,15 @@
+GPU-STREAM
+Version: 1.0
+Implementation: CUDA
+Precision: double
+
+Running kernels 10 times
+Array size: 400.0 MB (=0.4 GB)
+Total size: 1200.0 MB (=1.2 GB)
+Using CUDA device GeForce GTX TITAN X
+Driver: 7050
+Function    MBytes/sec  Min (sec)   Max         Average
+Copy        263155.587  0.00319     0.00319     0.00319
+Mul         262943.430  0.00319     0.00319     0.00319
+Add         268710.444  0.00468     0.00469     0.00469
+Triad       268957.305  0.00468     0.00469     0.00468
diff --git a/results/hip/amd-fiji-nano.txt b/results/hip/amd-fiji-nano.txt
new file mode 100644
index 0000000..df40ac8
--- /dev/null
+++ b/results/hip/amd-fiji-nano.txt
@@ -0,0 +1,15 @@
+GPU-STREAM
+Version: 1.0
+Implementation: CUDA
+Precision: double
+
+Running kernels 10 times
+Array size: 400.0 MB (=0.4 GB)
+Total size: 1200.0 MB (=1.2 GB)
+Using CUDA device Fiji
+Driver: 4
+Function    MBytes/sec  Min (sec)   Max         Average
+Copy        375822.410  0.00223     0.00225     0.00224
+Mul         375086.879  0.00224     0.00227     0.00224
+Add         425650.718  0.00296     0.00298     0.00297
+Triad       424710.113  0.00296     0.00298     0.00298
diff --git a/results/hip/nvidia-gtx-titan_x.txt b/results/hip/nvidia-gtx-titan_x.txt
new file mode 100644
index 0000000..6bb94a3
--- /dev/null
+++ b/results/hip/nvidia-gtx-titan_x.txt
@@ -0,0 +1,22 @@
+GPU-STREAM
+Version: 1.0
+Implementation: HIP
+GridSize: 52428800 work-items
+GroupSize: 1024 work-items
+Operations/Work-item: 1
+Precision: double
+
+Running kernels 10 times
+Array size: 400.0 MB (=0.4 GB) 0 bytes padding
+Total size: 1200.0 MB (=1.2 GB)
+Using HIP device GeForce GTX TITAN X (compute_units=24)
+Driver: 4
+d_a=0x1306d80000
+d_b=0x131fd80000
+d_c=0x1338d80000
+Function    MBytes/sec  Min (sec)   Max         Average
+Copy        263042.207  0.00319     0.00320     0.00319
+Mul         262972.033  0.00319     0.00320     0.00319
+Add         268732.653  0.00468     0.00469     0.00469
+Triad       268706.197  0.00468     0.00469     0.00469
+GEOMEAN     265847.929

From 11053798ff7dca2bd8eb7a8e5bacac10758b5159 Mon Sep 17 00:00:00 2001
From: sunway513
Date: Tue, 15 Mar 2016 07:56:32 -0500
Subject: [PATCH 02/10] Improved GPU-STREAM benchmark for HIP version:

1. Add optional looper kernels to take command line input for the number
   of groups and the groupSize
2. Add GEOMEAN value calculation of the kernels
3. Instructions on configuring the HIP environment in the README.md
4. Add results for HIP on FIJI Nano, TITAN X; CUDA on TITAN X
5. Run script to optionally run the HIP version with the groups and
   groupSize options
---
 README.md      |   4 +-
 common.cpp     |  50 ++++++++++++
 common.h       |   3 +
 hip-stream.cpp | 213 +++++++++++++++++++++++++++++++++++++++----------
 runhip.sh      |   4 +
 5 files changed, 233 insertions(+), 41 deletions(-)
 create mode 100755 runhip.sh

diff --git a/README.md b/README.md
index a2f6ab6..c776347 100644
--- a/README.md
+++ b/README.md
@@ -15,7 +15,9 @@ Run the OpenCL version with `./gpu-stream-ocl` and the CUDA version with `./gpu-
 For HIP version, follow the instructions on the following blog to properly install ROCK and ROCR drivers:
 http://gpuopen.com/getting-started-with-boltzmann-components-platforms-installation/
-Clone from the HIP repository in the following link:
+Install the HCC compiler:
+https://bitbucket.org/multicoreware/hcc/wiki/Home
+Install HIP:
 https://github.com/GPUOpen-ProfessionalCompute-Tools/HIP
 
 Build the HIP binaries with make gpu-stream-hip, run it with './gpu-stream-hip'

diff --git a/common.cpp b/common.cpp
index 781d70e..122376f 100644
--- a/common.cpp
+++ b/common.cpp
@@ -39,10 +39,13 @@
 // Default array size 50 * 2^20 (50*8 Mebibytes double precision)
 // Use binary powers of two so divides 1024
 unsigned int ARRAY_SIZE = 52428800;
+size_t ARRAY_PAD_BYTES = 0;
 unsigned int NTIMES = 10;
 
 bool useFloat = false;
+unsigned int groups = 0;
+unsigned int groupSize = 1024;
 
 unsigned int deviceIndex = 0;
 
@@ -53,6 +56,25 @@ int parseUInt(const char *str, unsigned int *output)
   return !strlen(next);
 }
 
+int parseSize(const char *str, size_t *output)
+{
+  char *next;
+  *output = strtoull(str, &next, 0);
+  int l = strlen(str);
+  if (l) {
+    char c = str[l-1]; // last char.
+    if ((c == 'k') || (c == 'K')) {
+      *output *= 1024;
+    }
+    if ((c == 'm') || (c == 'M')) {
+      *output *= (1024*1024);
+    }
+
+  }
+  return !strlen(next);
+}
+
+
 void parseArguments(int argc, char *argv[])
 {
   for (int i = 1; i < argc; i++)
@@ -86,6 +108,31 @@ void parseArguments(int argc, char *argv[])
         exit(1);
       }
     }
+    else if (!strcmp(argv[i], "--groups"))
+    {
+      if (++i >= argc || !parseUInt(argv[i], &groups))
+      {
+        std::cout << "Invalid group number" << std::endl;
+        exit(1);
+      }
+    }
+    else if (!strcmp(argv[i], "--groupSize"))
+    {
+      if (++i >= argc || !parseUInt(argv[i], &groupSize))
+      {
+        std::cout << "Invalid group size" << std::endl;
+        exit(1);
+      }
+    }
+    else if (!strcmp(argv[i], "--pad"))
+    {
+      if (++i >= argc || !parseSize(argv[i], &ARRAY_PAD_BYTES))
+      {
+        std::cout << "Invalid size" << std::endl;
+        exit(1);
+      }
+
+    }
     else if (!strcmp(argv[i], "--float"))
     {
       useFloat = true;
@@ -101,6 +148,9 @@ void parseArguments(int argc, char *argv[])
       std::cout << " --device INDEX Select device at INDEX" << std::endl;
       std::cout << " -s --arraysize SIZE Use SIZE elements in the array" << std::endl;
       std::cout << " -n --numtimes NUM Run the test NUM times (NUM >= 2)" << std::endl;
+      std::cout << " --groups Set number of groups to launch - each work-item processes multiple array items" << std::endl;
+      std::cout << " --groupSize Set size of each group (default 1024)" << std::endl;
+      std::cout << " --pad Add additional array padding. 
Can use trailing K (KB) or M (MB)" << std::endl; std::cout << " --float Use floats (rather than doubles)" << std::endl; std::cout << std::endl; exit(0); diff --git a/common.h b/common.h index a4dd886..9cf61d7 100644 --- a/common.h +++ b/common.h @@ -48,8 +48,11 @@ extern void parseArguments(int argc, char *argv[]); extern void listDevices(void); extern unsigned int ARRAY_SIZE; +extern size_t ARRAY_PAD_BYTES; extern unsigned int NTIMES; +extern unsigned int groups; +extern unsigned int groupSize; extern bool useFloat; extern unsigned int deviceIndex; diff --git a/hip-stream.cpp b/hip-stream.cpp index ddbcae4..6984b44 100644 --- a/hip-stream.cpp +++ b/hip-stream.cpp @@ -43,7 +43,7 @@ #include #include -#include +//#include #include "common.h" std::string getDeviceName(int device); @@ -63,15 +63,75 @@ void check_cuda_error(void) } } + + +// looper function place more work inside each work item. +// Goal is reduce the dispatch overhead for each group, and also give more controlover the order of memory operations +template +__global__ void +copy_looper(hipLaunchParm lp, const T * a, T * c, int ARRAY_SIZE) +{ + int offset = (hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x)*CLUMP_SIZE; + int stride = hipBlockDim_x * hipGridDim_x * CLUMP_SIZE; + + for (int i=offset; i -__global__ void copy(hipLaunchParm lp, const T * a, T * c) +__global__ void +mul_looper(hipLaunchParm lp, T * b, const T * c, int ARRAY_SIZE) +{ + int offset = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x; + int stride = hipBlockDim_x * hipGridDim_x; + const T scalar = 3.0; + + for (int i=offset; i +__global__ void +add_looper(hipLaunchParm lp, const T * a, const T * b, T * c, int ARRAY_SIZE) +{ + int offset = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x; + int stride = hipBlockDim_x * hipGridDim_x; + + for (int i=offset; i +__global__ void +triad_looper(hipLaunchParm lp, T * a, const T * b, const T * c, int ARRAY_SIZE) +{ + int offset = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x; + int stride = hipBlockDim_x * hipGridDim_x; + const T scalar = 3.0; + + for (int i=offset; i +__global__ void +copy(hipLaunchParm lp, const T * a, T * c) { const int i = hipBlockDim_x * hipBlockIdx_x + hipThreadIdx_x; c[i] = a[i]; } + template -__global__ void mul(hipLaunchParm lp, T * b, const T * c) +__global__ void +mul(hipLaunchParm lp, T * b, const T * c) { const T scalar = 3.0; const int i = hipBlockDim_x * hipBlockIdx_x + hipThreadIdx_x; @@ -79,14 +139,16 @@ __global__ void mul(hipLaunchParm lp, T * b, const T * c) } template -__global__ void add(hipLaunchParm lp, const T * a, const T * b, T * c) +__global__ void +add(hipLaunchParm lp, const T * a, const T * b, T * c) { const int i = hipBlockDim_x * hipBlockIdx_x + hipThreadIdx_x; c[i] = a[i] + b[i]; } template -__global__ void triad(hipLaunchParm lp, T * a, const T * b, const T * c) +__global__ void +triad(hipLaunchParm lp, T * a, const T * b, const T * c) { const T scalar = 3.0; const int i = hipBlockDim_x * hipBlockIdx_x + hipThreadIdx_x; @@ -100,13 +162,27 @@ int main(int argc, char *argv[]) std::cout << "GPU-STREAM" << std::endl << "Version: " << VERSION_STRING << std::endl - << "Implementation: CUDA" << std::endl; + << "Implementation: HIP" << std::endl; parseArguments(argc, argv); if (NTIMES < 2) throw std::runtime_error("Chosen number of times is invalid, must be >= 2"); + // Config grid size and group size for kernel launching + int gridSize; + if (groups) { + gridSize = groups * groupSize; + } else { + gridSize = ARRAY_SIZE; + } + + float operationsPerWorkitem = (float)ARRAY_SIZE / 
(float)gridSize; + std::cout << "GridSize: " << gridSize << " work-items" << std::endl; + std::cout << "GroupSize: " << groupSize << " work-items" << std::endl; + std::cout << "Operations/Work-item: " << operationsPerWorkitem << std::endl; + if (groups) std::cout << "Using looper kernels:" << std::endl; + std::cout << "Precision: "; if (useFloat) std::cout << "float"; else std::cout << "double"; @@ -144,9 +220,10 @@ int main(int argc, char *argv[]) std::cout << std::setprecision(1) << std::fixed << "Array size: " << ARRAY_SIZE*DATATYPE_SIZE/1024.0/1024.0 << " MB" << " (=" << ARRAY_SIZE*DATATYPE_SIZE/1024.0/1024.0/1024.0 << " GB)" + << " " << ARRAY_PAD_BYTES << " bytes padding" << std::endl; - std::cout << "Total size: " << 3.0*ARRAY_SIZE*DATATYPE_SIZE/1024.0/1024.0 << " MB" - << " (=" << 3.0*ARRAY_SIZE*DATATYPE_SIZE/1024.0/1024.0/1024.0 << " GB)" + std::cout << "Total size: " << 3.0*(ARRAY_SIZE*DATATYPE_SIZE + ARRAY_PAD_BYTES) /1024.0/1024.0 << " MB" + << " (=" << 3.0*(ARRAY_SIZE*DATATYPE_SIZE + ARRAY_PAD_BYTES) /1024.0/1024.0/1024.0 << " GB)" << std::endl; // Reset precision @@ -161,24 +238,31 @@ int main(int argc, char *argv[]) hipSetDevice(deviceIndex); check_cuda_error(); - // Print out device name - std::cout << "Using CUDA device " << getDeviceName(deviceIndex) << std::endl; - // Print out device CUDA driver version - std::cout << "Driver: " << getDriver() << std::endl; - - // Check buffers fit on the device hipDeviceProp_t props; hipGetDeviceProperties(&props, deviceIndex); + + // Print out device name + std::cout << "Using HIP device " << getDeviceName(deviceIndex) << " (compute_units=" << props.multiProcessorCount << ")" << std::endl; + + // Print out device HIP driver version + std::cout << "Driver: " << getDriver() << std::endl; + + + + + // Check buffers fit on the device if (props.totalGlobalMem < 3*DATATYPE_SIZE*ARRAY_SIZE) throw std::runtime_error("Device does not have enough memory for all 3 buffers"); - // Create host vectors - void * h_a = malloc(ARRAY_SIZE*DATATYPE_SIZE); - void * h_b = malloc(ARRAY_SIZE*DATATYPE_SIZE); - void * h_c = malloc(ARRAY_SIZE*DATATYPE_SIZE); + //int cus = props.multiProcessorCount; - // Initilise arrays + // Create host vectors + void * h_a = malloc(ARRAY_SIZE*DATATYPE_SIZE ); + void * h_b = malloc(ARRAY_SIZE*DATATYPE_SIZE ); + void * h_c = malloc(ARRAY_SIZE*DATATYPE_SIZE ); + + // Initialise arrays for (unsigned int i = 0; i < ARRAY_SIZE; i++) { if (useFloat) @@ -196,12 +280,14 @@ int main(int argc, char *argv[]) } // Create device buffers - void * d_a, * d_b, *d_c; - hipMalloc(&d_a, ARRAY_SIZE*DATATYPE_SIZE); + char * d_a, * d_b, *d_c; + hipMalloc(&d_a, ARRAY_SIZE*DATATYPE_SIZE + ARRAY_PAD_BYTES); check_cuda_error(); - hipMalloc(&d_b, ARRAY_SIZE*DATATYPE_SIZE); + hipMalloc(&d_b, ARRAY_SIZE*DATATYPE_SIZE + ARRAY_PAD_BYTES); + d_b += ARRAY_PAD_BYTES; check_cuda_error(); - hipMalloc(&d_c, ARRAY_SIZE*DATATYPE_SIZE); + hipMalloc(&d_c, ARRAY_SIZE*DATATYPE_SIZE + ARRAY_PAD_BYTES); + d_c += ARRAY_PAD_BYTES; check_cuda_error(); // Copy host memory to device @@ -212,10 +298,17 @@ int main(int argc, char *argv[]) hipMemcpy(d_c, h_c, ARRAY_SIZE*DATATYPE_SIZE, hipMemcpyHostToDevice); check_cuda_error(); + + std::cout << "d_a=" << (void*)d_a << std::endl; + std::cout << "d_b=" << (void*)d_b << std::endl; + std::cout << "d_c=" << (void*)d_c << std::endl; + // Make sure the copies are finished hipDeviceSynchronize(); check_cuda_error(); + + // List of times std::vector< std::vector > timings; @@ -227,10 +320,17 @@ int main(int argc, char *argv[]) { 
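    // Each pass of this loop times the four kernels separately: the host
    // clock (std::chrono) is read before a launch, hipDeviceSynchronize()
    // forces the asynchronous kernel to complete inside the measured
    // interval, and the elapsed seconds are appended to this iteration's
    // "times" vector in kernel order (copy, mul, add, triad).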
std::vector times; t1 = std::chrono::high_resolution_clock::now(); - if (useFloat) - hipLaunchKernel(HIP_KERNEL_NAME(copy), dim3(ARRAY_SIZE/1024), dim3(1024), 0, 0, (float*)d_a, (float*)d_c); - else - hipLaunchKernel(HIP_KERNEL_NAME(copy), dim3(ARRAY_SIZE/1024), dim3(1024), 0, 0, (double*)d_a, (double*)d_c); + if (groups) { + if (useFloat) + hipLaunchKernel(HIP_KERNEL_NAME(copy_looper), dim3(gridSize), dim3(groupSize), 0, 0, (float*)d_a, (float*)d_c, ARRAY_SIZE); + else + hipLaunchKernel(HIP_KERNEL_NAME(copy_looper), dim3(gridSize), dim3(groupSize), 0, 0, (double*)d_a, (double*)d_c, ARRAY_SIZE); + } else { + if (useFloat) + hipLaunchKernel(HIP_KERNEL_NAME(copy), dim3(ARRAY_SIZE/groupSize), dim3(groupSize), 0, 0, (float*)d_a, (float*)d_c); + else + hipLaunchKernel(HIP_KERNEL_NAME(copy), dim3(ARRAY_SIZE/groupSize), dim3(groupSize), 0, 0, (double*)d_a, (double*)d_c); + } check_cuda_error(); hipDeviceSynchronize(); check_cuda_error(); @@ -239,10 +339,17 @@ int main(int argc, char *argv[]) t1 = std::chrono::high_resolution_clock::now(); - if (useFloat) - hipLaunchKernel(HIP_KERNEL_NAME(mul), dim3(ARRAY_SIZE/1024), dim3(1024), 0, 0, (float*)d_b, (float*)d_c); - else - hipLaunchKernel(HIP_KERNEL_NAME(mul), dim3(ARRAY_SIZE/1024), dim3(1024), 0, 0, (double*)d_b, (double*)d_c); + if (groups) { + if (useFloat) + hipLaunchKernel(HIP_KERNEL_NAME(mul_looper), dim3(gridSize), dim3(groupSize), 0, 0, (float*)d_b, (float*)d_c, ARRAY_SIZE); + else + hipLaunchKernel(HIP_KERNEL_NAME(mul_looper), dim3(gridSize), dim3(groupSize), 0, 0, (double*)d_b, (double*)d_c, ARRAY_SIZE); + } else { + if (useFloat) + hipLaunchKernel(HIP_KERNEL_NAME(mul), dim3(ARRAY_SIZE/groupSize), dim3(groupSize), 0, 0, (float*)d_b, (float*)d_c); + else + hipLaunchKernel(HIP_KERNEL_NAME(mul), dim3(ARRAY_SIZE/groupSize), dim3(groupSize), 0, 0, (double*)d_b, (double*)d_c); + } check_cuda_error(); hipDeviceSynchronize(); check_cuda_error(); @@ -251,10 +358,17 @@ int main(int argc, char *argv[]) t1 = std::chrono::high_resolution_clock::now(); - if (useFloat) - hipLaunchKernel(HIP_KERNEL_NAME(add), dim3(ARRAY_SIZE/1024), dim3(1024), 0, 0, (float*)d_a, (float*)d_b, (float*)d_c); - else - hipLaunchKernel(HIP_KERNEL_NAME(add), dim3(ARRAY_SIZE/1024), dim3(1024), 0, 0, (double*)d_a, (double*)d_b, (double*)d_c); + if (groups) { + if (useFloat) + hipLaunchKernel(HIP_KERNEL_NAME(add_looper), dim3(gridSize), dim3(groupSize), 0, 0, (float*)d_a, (float*)d_b, (float*)d_c, ARRAY_SIZE); + else + hipLaunchKernel(HIP_KERNEL_NAME(add_looper), dim3(gridSize), dim3(groupSize), 0, 0, (double*)d_a, (double*)d_b, (double*)d_c, ARRAY_SIZE); + } else { + if (useFloat) + hipLaunchKernel(HIP_KERNEL_NAME(add), dim3(ARRAY_SIZE/groupSize), dim3(groupSize), 0, 0, (float*)d_a, (float*)d_b, (float*)d_c); + else + hipLaunchKernel(HIP_KERNEL_NAME(add), dim3(ARRAY_SIZE/groupSize), dim3(groupSize), 0, 0, (double*)d_a, (double*)d_b, (double*)d_c); + } check_cuda_error(); hipDeviceSynchronize(); check_cuda_error(); @@ -263,10 +377,18 @@ int main(int argc, char *argv[]) t1 = std::chrono::high_resolution_clock::now(); - if (useFloat) - hipLaunchKernel(HIP_KERNEL_NAME(triad), dim3(ARRAY_SIZE/1024), dim3(1024), 0, 0, (float*)d_a, (float*)d_b, (float*)d_c); - else - hipLaunchKernel(HIP_KERNEL_NAME(triad), dim3(ARRAY_SIZE/1024), dim3(1024), 0, 0, (double*)d_a, (double*)d_b, (double*)d_c); + if (groups) { + if (useFloat) + hipLaunchKernel(HIP_KERNEL_NAME(triad_looper), dim3(gridSize), dim3(groupSize), 0, 0, (float*)d_a, (float*)d_b, (float*)d_c, ARRAY_SIZE); + else + 
hipLaunchKernel(HIP_KERNEL_NAME(triad_looper), dim3(gridSize), dim3(groupSize), 0, 0, (double*)d_a, (double*)d_b, (double*)d_c, ARRAY_SIZE);
+    } else {
+      if (useFloat)
+        hipLaunchKernel(HIP_KERNEL_NAME(triad), dim3(ARRAY_SIZE/groupSize), dim3(groupSize), 0, 0, (float*)d_a, (float*)d_b, (float*)d_c);
+      else
+        hipLaunchKernel(HIP_KERNEL_NAME(triad), dim3(ARRAY_SIZE/groupSize), dim3(groupSize), 0, 0, (double*)d_a, (double*)d_b, (double*)d_c);
+    }
+
     check_cuda_error();
     hipDeviceSynchronize();
     check_cuda_error();
@@ -316,8 +438,15 @@ int main(int argc, char *argv[])
     }
   }
 
-  for (int j = 0; j < 4; j++)
+  for (int j = 0; j < 4; j++) {
     avg[j] /= (double)(NTIMES-1);
+  }
+
+  double geomean = 1.0;
+  for (int j = 0; j < 4; j++) {
+    geomean *= (sizes[j]/min[j]);
+  }
+  geomean = pow(geomean, 0.25);
 
   // Display results
   std::string labels[] = {"Copy", "Mul", "Add", "Triad"};
@@ -339,6 +468,10 @@ int main(int argc, char *argv[])
       << std::left << std::setw(12) << std::setprecision(5) << avg[j]
       << std::endl;
   }
+  std::cout
+    << std::left << std::setw(12) << "GEOMEAN"
+    << std::left << std::setw(12) << std::setprecision(3) << 1.0E-06 * geomean
+    << std::endl;
 
   // Free host vectors
   free(h_a);

diff --git a/runhip.sh b/runhip.sh
new file mode 100755
index 0000000..b84e970
--- /dev/null
+++ b/runhip.sh
@@ -0,0 +1,4 @@
+./gpu-stream-hip
+./gpu-stream-hip --groups 64 --groupSize 256
+./gpu-stream-hip --float
+./gpu-stream-hip --float --groups 64 --groupSize 256

From 89fec9c8d2186c698018ba07bdea5d3875a28d0d Mon Sep 17 00:00:00 2001
From: pensun
Date: Wed, 23 Mar 2016 05:26:34 -0500
Subject: [PATCH 03/10] Remove results submission for separate commits

---
 results/cuda/nvidia-gtx-titan_x.txt | 15 ---------------
 results/hip/amd-fiji-nano.txt       | 15 ---------------
 results/hip/nvidia-gtx-titan_x.txt  | 22 ----------------------
 3 files changed, 52 deletions(-)
 delete mode 100644 results/cuda/nvidia-gtx-titan_x.txt
 delete mode 100644 results/hip/amd-fiji-nano.txt
 delete mode 100644 results/hip/nvidia-gtx-titan_x.txt

diff --git a/results/cuda/nvidia-gtx-titan_x.txt b/results/cuda/nvidia-gtx-titan_x.txt
deleted file mode 100644
index 2d3b004..0000000
--- a/results/cuda/nvidia-gtx-titan_x.txt
+++ /dev/null
@@ -1,15 +0,0 @@
-GPU-STREAM
-Version: 1.0
-Implementation: CUDA
-Precision: double
-
-Running kernels 10 times
-Array size: 400.0 MB (=0.4 GB)
-Total size: 1200.0 MB (=1.2 GB)
-Using CUDA device GeForce GTX TITAN X
-Driver: 7050
-Function    MBytes/sec  Min (sec)   Max         Average
-Copy        263155.587  0.00319     0.00319     0.00319
-Mul         262943.430  0.00319     0.00319     0.00319
-Add         268710.444  0.00468     0.00469     0.00469
-Triad       268957.305  0.00468     0.00469     0.00468
diff --git a/results/hip/amd-fiji-nano.txt b/results/hip/amd-fiji-nano.txt
deleted file mode 100644
index df40ac8..0000000
--- a/results/hip/amd-fiji-nano.txt
+++ /dev/null
@@ -1,15 +0,0 @@
-GPU-STREAM
-Version: 1.0
-Implementation: CUDA
-Precision: double
-
-Running kernels 10 times
-Array size: 400.0 MB (=0.4 GB)
-Total size: 1200.0 MB (=1.2 GB)
-Using CUDA device Fiji
-Driver: 4
-Function    MBytes/sec  Min (sec)   Max         Average
-Copy        375822.410  0.00223     0.00225     0.00224
-Mul         375086.879  0.00224     0.00227     0.00224
-Add         425650.718  0.00296     0.00298     0.00297
-Triad       424710.113  0.00296     0.00298     0.00298
diff --git a/results/hip/nvidia-gtx-titan_x.txt b/results/hip/nvidia-gtx-titan_x.txt
deleted file mode 100644
index 6bb94a3..0000000
--- a/results/hip/nvidia-gtx-titan_x.txt
+++ /dev/null
@@ -1,22 +0,0 @@
-GPU-STREAM
-Version: 1.0
-Implementation: HIP
-GridSize: 52428800 work-items
-GroupSize: 1024 work-items
-Operations/Work-item: 1 -Precision: double - -Running kernels 10 times -Array size: 400.0 MB (=0.4 GB) 0 bytes padding -Total size: 1200.0 MB (=1.2 GB) -Using HIP device GeForce GTX TITAN X (compute_units=24) -Driver: 4 -d_a=0x1306d80000 -d_b=0x131fd80000 -d_c=0x1338d80000 -Function MBytes/sec Min (sec) Max Average -Copy 263042.207 0.00319 0.00320 0.00319 -Mul 262972.033 0.00319 0.00320 0.00319 -Add 268732.653 0.00468 0.00469 0.00469 -Triad 268706.197 0.00468 0.00469 0.00469 -GEOMEAN 265847.929 From 8e9ab4d20a20a31d483eb583d9bd038756d37cad Mon Sep 17 00:00:00 2001 From: pensun Date: Wed, 23 Mar 2016 05:29:10 -0500 Subject: [PATCH 04/10] Submit results for NV Titan X with CUDA, AMD FIJI NANO and NV Titan X with HIP --- results/cuda/nvidia-gtx-titan_x.txt | 15 +++++++++++++++ results/hip/amd-fiji-nano.txt | 15 +++++++++++++++ results/hip/nvidia-gtx-titan_x.txt | 22 ++++++++++++++++++++++ 3 files changed, 52 insertions(+) create mode 100644 results/cuda/nvidia-gtx-titan_x.txt create mode 100644 results/hip/amd-fiji-nano.txt create mode 100644 results/hip/nvidia-gtx-titan_x.txt diff --git a/results/cuda/nvidia-gtx-titan_x.txt b/results/cuda/nvidia-gtx-titan_x.txt new file mode 100644 index 0000000..2d3b004 --- /dev/null +++ b/results/cuda/nvidia-gtx-titan_x.txt @@ -0,0 +1,15 @@ +GPU-STREAM +Version: 1.0 +Implementation: CUDA +Precision: double + +Running kernels 10 times +Array size: 400.0 MB (=0.4 GB) +Total size: 1200.0 MB (=1.2 GB) +Using CUDA device GeForce GTX TITAN X +Driver: 7050 +Function MBytes/sec Min (sec) Max Average +Copy 263155.587 0.00319 0.00319 0.00319 +Mul 262943.430 0.00319 0.00319 0.00319 +Add 268710.444 0.00468 0.00469 0.00469 +Triad 268957.305 0.00468 0.00469 0.00468 diff --git a/results/hip/amd-fiji-nano.txt b/results/hip/amd-fiji-nano.txt new file mode 100644 index 0000000..df40ac8 --- /dev/null +++ b/results/hip/amd-fiji-nano.txt @@ -0,0 +1,15 @@ +GPU-STREAM +Version: 1.0 +Implementation: CUDA +Precision: double + +Running kernels 10 times +Array size: 400.0 MB (=0.4 GB) +Total size: 1200.0 MB (=1.2 GB) +Using CUDA device Fiji +Driver: 4 +Function MBytes/sec Min (sec) Max Average +Copy 375822.410 0.00223 0.00225 0.00224 +Mul 375086.879 0.00224 0.00227 0.00224 +Add 425650.718 0.00296 0.00298 0.00297 +Triad 424710.113 0.00296 0.00298 0.00298 diff --git a/results/hip/nvidia-gtx-titan_x.txt b/results/hip/nvidia-gtx-titan_x.txt new file mode 100644 index 0000000..6bb94a3 --- /dev/null +++ b/results/hip/nvidia-gtx-titan_x.txt @@ -0,0 +1,22 @@ +GPU-STREAM +Version: 1.0 +Implementation: HIP +GridSize: 52428800 work-items +GroupSize: 1024 work-items +Operations/Work-item: 1 +Precision: double + +Running kernels 10 times +Array size: 400.0 MB (=0.4 GB) 0 bytes padding +Total size: 1200.0 MB (=1.2 GB) +Using HIP device GeForce GTX TITAN X (compute_units=24) +Driver: 4 +d_a=0x1306d80000 +d_b=0x131fd80000 +d_c=0x1338d80000 +Function MBytes/sec Min (sec) Max Average +Copy 263042.207 0.00319 0.00320 0.00319 +Mul 262972.033 0.00319 0.00320 0.00319 +Add 268732.653 0.00468 0.00469 0.00469 +Triad 268706.197 0.00468 0.00469 0.00469 +GEOMEAN 265847.929 From 207701219a6346bf7f833f9f463fc9f93a6c3971 Mon Sep 17 00:00:00 2001 From: pensun Date: Sun, 3 Apr 2016 06:49:56 -0500 Subject: [PATCH 05/10] Add looper optimization for cuda-stream.cu, remove result files --- cuda-stream.cu | 142 ++++++++++++++++++++++++---- results/cuda/nvidia-gtx-titan_x.txt | 15 --- results/hip/amd-fiji-nano.txt | 15 --- results/hip/nvidia-gtx-titan_x.txt | 22 ----- runcuda.sh | 4 + 5 files changed, 130 insertions(+), 
68 deletions(-) delete mode 100644 results/cuda/nvidia-gtx-titan_x.txt delete mode 100644 results/hip/amd-fiji-nano.txt delete mode 100644 results/hip/nvidia-gtx-titan_x.txt create mode 100755 runcuda.sh diff --git a/cuda-stream.cu b/cuda-stream.cu index 2ab3adb..2049eb0 100644 --- a/cuda-stream.cu +++ b/cuda-stream.cu @@ -62,6 +62,59 @@ void check_cuda_error(void) } } +// looper function place more work inside each work item. +// Goal is reduce the dispatch overhead for each group, and also give more controlover the order of memory operations +template +__global__ void +copy_looper(const T * a, T * c, int ARRAY_SIZE) +{ + int offset = (blockDim.x * blockIdx.x + threadIdx.x)*CLUMP_SIZE; + int stride = blockDim.x * gridDim.x * CLUMP_SIZE; + + for (int i=offset; i +__global__ void +mul_looper(T * b, const T * c, int ARRAY_SIZE) +{ + int offset = blockDim.x * blockIdx.x + threadIdx.x; + int stride = blockDim.x * gridDim.x; + const T scalar = 3.0; + + for (int i=offset; i +__global__ void +add_looper(const T * a, const T * b, T * c, int ARRAY_SIZE) +{ + int offset = blockDim.x * blockIdx.x + threadIdx.x; + int stride = blockDim.x * gridDim.x; + + for (int i=offset; i +__global__ void +triad_looper( T * a, const T * b, const T * c, int ARRAY_SIZE) +{ + int offset = blockDim.x * blockIdx.x + threadIdx.x; + int stride = blockDim.x * gridDim.x; + const T scalar = 3.0; + + for (int i=offset; i __global__ void copy(const T * a, T * c) { @@ -106,6 +159,20 @@ int main(int argc, char *argv[]) if (NTIMES < 2) throw std::runtime_error("Chosen number of times is invalid, must be >= 2"); + // Config grid size and group size for kernel launching + int gridSize; + if (groups) { + gridSize = groups * groupSize; + } else { + gridSize = ARRAY_SIZE; + } + + float operationsPerWorkitem = (float)ARRAY_SIZE / (float)gridSize; + std::cout << "GridSize: " << gridSize << " work-items" << std::endl; + std::cout << "GroupSize: " << groupSize << " work-items" << std::endl; + std::cout << "Operations/Work-item: " << operationsPerWorkitem << std::endl; + if (groups) std::cout << "Using looper kernels:" << std::endl; + std::cout << "Precision: "; if (useFloat) std::cout << "float"; else std::cout << "double"; @@ -211,6 +278,10 @@ int main(int argc, char *argv[]) cudaMemcpy(d_c, h_c, ARRAY_SIZE*DATATYPE_SIZE, cudaMemcpyHostToDevice); check_cuda_error(); + std::cout << "d_a=" << (void*)d_a << std::endl; + std::cout << "d_b=" << (void*)d_b << std::endl; + std::cout << "d_c=" << (void*)d_c << std::endl; + // Make sure the copies are finished cudaDeviceSynchronize(); check_cuda_error(); @@ -226,10 +297,18 @@ int main(int argc, char *argv[]) { std::vector times; t1 = std::chrono::high_resolution_clock::now(); - if (useFloat) - copy<<>>((float*)d_a, (float*)d_c); - else - copy<<>>((double*)d_a, (double*)d_c); + if (groups) { + if (useFloat) + copy_looper<<>>((float*)d_a, (float*)d_c, ARRAY_SIZE); + else + copy_looper<<>>((double*)d_a, (double*)d_c, ARRAY_SIZE); + } else { + if (useFloat) + copy<<>>((float*)d_a, (float*)d_c); + else + copy<<>>((double*)d_a, (double*)d_c); + } + check_cuda_error(); cudaDeviceSynchronize(); check_cuda_error(); @@ -238,10 +317,17 @@ int main(int argc, char *argv[]) t1 = std::chrono::high_resolution_clock::now(); - if (useFloat) - mul<<>>((float*)d_b, (float*)d_c); - else - mul<<>>((double*)d_b, (double*)d_c); + if (groups) { + if (useFloat) + mul_looper<<>>((float*)d_b, (float*)d_c, ARRAY_SIZE); + else + mul_looper<<>>((double*)d_b, (double*)d_c, ARRAY_SIZE); + } else { + if (useFloat) + 
mul<<>>((float*)d_b, (float*)d_c); + else + mul<<>>((double*)d_b, (double*)d_c); + } check_cuda_error(); cudaDeviceSynchronize(); check_cuda_error(); @@ -250,10 +336,17 @@ int main(int argc, char *argv[]) t1 = std::chrono::high_resolution_clock::now(); - if (useFloat) - add<<>>((float*)d_a, (float*)d_b, (float*)d_c); - else - add<<>>((double*)d_a, (double*)d_b, (double*)d_c); + if (groups) { + if (useFloat) + add_looper<<>>((float*)d_a, (float*)d_b, (float*)d_c, ARRAY_SIZE); + else + add_looper<<>>((double*)d_a, (double*)d_b, (double*)d_c, ARRAY_SIZE); + } else { + if (useFloat) + add<<>>((float*)d_a, (float*)d_b, (float*)d_c); + else + add<<>>((double*)d_a, (double*)d_b, (double*)d_c); + } check_cuda_error(); cudaDeviceSynchronize(); check_cuda_error(); @@ -262,10 +355,17 @@ int main(int argc, char *argv[]) t1 = std::chrono::high_resolution_clock::now(); - if (useFloat) - triad<<>>((float*)d_a, (float*)d_b, (float*)d_c); - else - triad<<>>((double*)d_a, (double*)d_b, (double*)d_c); + if (groups) { + if (useFloat) + triad_looper<<>>((float*)d_a, (float*)d_b, (float*)d_c, ARRAY_SIZE); + else + triad_looper<<>>((double*)d_a, (double*)d_b, (double*)d_c, ARRAY_SIZE); + } else { + if (useFloat) + triad<<>>((float*)d_a, (float*)d_b, (float*)d_c); + else + triad<<>>((double*)d_a, (double*)d_b, (double*)d_c); + } check_cuda_error(); cudaDeviceSynchronize(); check_cuda_error(); @@ -318,6 +418,12 @@ int main(int argc, char *argv[]) for (int j = 0; j < 4; j++) avg[j] /= (double)(NTIMES-1); + double geomean = 1.0; + for (int j = 0; j < 4; j++) { + geomean *= (sizes[j]/min[j]); + } + geomean = pow(geomean, 0.25); + // Display results std::string labels[] = {"Copy", "Mul", "Add", "Triad"}; std::cout @@ -338,6 +444,10 @@ int main(int argc, char *argv[]) << std::left << std::setw(12) << std::setprecision(5) << avg[j] << std::endl; } + std::cout + << std::left << std::setw(12) << "GEOMEAN" + << std::left << std::setw(12) << std::setprecision(3) << 1.0E-06 * geomean + << std::endl; // Free host vectors free(h_a); diff --git a/results/cuda/nvidia-gtx-titan_x.txt b/results/cuda/nvidia-gtx-titan_x.txt deleted file mode 100644 index 2d3b004..0000000 --- a/results/cuda/nvidia-gtx-titan_x.txt +++ /dev/null @@ -1,15 +0,0 @@ -GPU-STREAM -Version: 1.0 -Implementation: CUDA -Precision: double - -Running kernels 10 times -Array size: 400.0 MB (=0.4 GB) -Total size: 1200.0 MB (=1.2 GB) -Using CUDA device GeForce GTX TITAN X -Driver: 7050 -Function MBytes/sec Min (sec) Max Average -Copy 263155.587 0.00319 0.00319 0.00319 -Mul 262943.430 0.00319 0.00319 0.00319 -Add 268710.444 0.00468 0.00469 0.00469 -Triad 268957.305 0.00468 0.00469 0.00468 diff --git a/results/hip/amd-fiji-nano.txt b/results/hip/amd-fiji-nano.txt deleted file mode 100644 index df40ac8..0000000 --- a/results/hip/amd-fiji-nano.txt +++ /dev/null @@ -1,15 +0,0 @@ -GPU-STREAM -Version: 1.0 -Implementation: CUDA -Precision: double - -Running kernels 10 times -Array size: 400.0 MB (=0.4 GB) -Total size: 1200.0 MB (=1.2 GB) -Using CUDA device Fiji -Driver: 4 -Function MBytes/sec Min (sec) Max Average -Copy 375822.410 0.00223 0.00225 0.00224 -Mul 375086.879 0.00224 0.00227 0.00224 -Add 425650.718 0.00296 0.00298 0.00297 -Triad 424710.113 0.00296 0.00298 0.00298 diff --git a/results/hip/nvidia-gtx-titan_x.txt b/results/hip/nvidia-gtx-titan_x.txt deleted file mode 100644 index 6bb94a3..0000000 --- a/results/hip/nvidia-gtx-titan_x.txt +++ /dev/null @@ -1,22 +0,0 @@ -GPU-STREAM -Version: 1.0 -Implementation: HIP -GridSize: 52428800 work-items -GroupSize: 1024 
work-items -Operations/Work-item: 1 -Precision: double - -Running kernels 10 times -Array size: 400.0 MB (=0.4 GB) 0 bytes padding -Total size: 1200.0 MB (=1.2 GB) -Using HIP device GeForce GTX TITAN X (compute_units=24) -Driver: 4 -d_a=0x1306d80000 -d_b=0x131fd80000 -d_c=0x1338d80000 -Function MBytes/sec Min (sec) Max Average -Copy 263042.207 0.00319 0.00320 0.00319 -Mul 262972.033 0.00319 0.00320 0.00319 -Add 268732.653 0.00468 0.00469 0.00469 -Triad 268706.197 0.00468 0.00469 0.00469 -GEOMEAN 265847.929 diff --git a/runcuda.sh b/runcuda.sh new file mode 100755 index 0000000..7acf5c1 --- /dev/null +++ b/runcuda.sh @@ -0,0 +1,4 @@ +./gpu-stream-cuda +./gpu-stream-cuda --groups 64 --groupSize 256 +./gpu-stream-cuda --float +./gpu-stream-cuda --float --groups 64 --groupSize 256 From d73917ec85f8ef4a0231c8278adc9fc6fdec9f6c Mon Sep 17 00:00:00 2001 From: pensun Date: Sun, 3 Apr 2016 06:50:53 -0500 Subject: [PATCH 06/10] Add cuda results for titan x device. --- results/cuda/nvidia-gtx-titan_x.txt | 22 ++++++++++++++++++++++ 1 file changed, 22 insertions(+) create mode 100644 results/cuda/nvidia-gtx-titan_x.txt diff --git a/results/cuda/nvidia-gtx-titan_x.txt b/results/cuda/nvidia-gtx-titan_x.txt new file mode 100644 index 0000000..6bb94a3 --- /dev/null +++ b/results/cuda/nvidia-gtx-titan_x.txt @@ -0,0 +1,22 @@ +GPU-STREAM +Version: 1.0 +Implementation: HIP +GridSize: 52428800 work-items +GroupSize: 1024 work-items +Operations/Work-item: 1 +Precision: double + +Running kernels 10 times +Array size: 400.0 MB (=0.4 GB) 0 bytes padding +Total size: 1200.0 MB (=1.2 GB) +Using HIP device GeForce GTX TITAN X (compute_units=24) +Driver: 4 +d_a=0x1306d80000 +d_b=0x131fd80000 +d_c=0x1338d80000 +Function MBytes/sec Min (sec) Max Average +Copy 263042.207 0.00319 0.00320 0.00319 +Mul 262972.033 0.00319 0.00320 0.00319 +Add 268732.653 0.00468 0.00469 0.00469 +Triad 268706.197 0.00468 0.00469 0.00469 +GEOMEAN 265847.929 From ef48e0448ae88920779c99e2d841bf3e4b10abdc Mon Sep 17 00:00:00 2001 From: pensun Date: Sun, 3 Apr 2016 06:51:51 -0500 Subject: [PATCH 07/10] Add result of hip on amd FIJI Nano. --- results/hip/amd-fiji-nano.txt | 15 +++++++++++++++ 1 file changed, 15 insertions(+) create mode 100644 results/hip/amd-fiji-nano.txt diff --git a/results/hip/amd-fiji-nano.txt b/results/hip/amd-fiji-nano.txt new file mode 100644 index 0000000..df40ac8 --- /dev/null +++ b/results/hip/amd-fiji-nano.txt @@ -0,0 +1,15 @@ +GPU-STREAM +Version: 1.0 +Implementation: CUDA +Precision: double + +Running kernels 10 times +Array size: 400.0 MB (=0.4 GB) +Total size: 1200.0 MB (=1.2 GB) +Using CUDA device Fiji +Driver: 4 +Function MBytes/sec Min (sec) Max Average +Copy 375822.410 0.00223 0.00225 0.00224 +Mul 375086.879 0.00224 0.00227 0.00224 +Add 425650.718 0.00296 0.00298 0.00297 +Triad 424710.113 0.00296 0.00298 0.00298 From e16123222d1bc98e918ca2ed109766b6c0e5fe7d Mon Sep 17 00:00:00 2001 From: pensun Date: Sun, 3 Apr 2016 06:52:31 -0500 Subject: [PATCH 08/10] Add results of HIP on Nvidia Titan X device. 
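
The MBytes/sec column in these result files follows the STREAM accounting:
bytes moved divided by the fastest kernel time, scaled by 1.0E-06. Copy and
Mul read one array and write one, while Add and Triad read two arrays and
write one. A minimal sketch of that arithmetic, using the array size and the
Copy/Add minimum times from the table below (the variable names here are
illustrative, not part of the benchmark source):

    #include <cstdio>
    int main()
    {
        const double N = 52428800;  // elements, as reported in the results
        const double B = 8;         // sizeof(double)
        // Copy moves 2 arrays per pass; Add moves 3.
        std::printf("Copy: %.0f MBytes/sec\n", 1.0E-06 * 2 * N * B / 0.00319);
        std::printf("Add:  %.0f MBytes/sec\n", 1.0E-06 * 3 * N * B / 0.00468);
        return 0;
    }

This prints roughly 262966 and 268866, matching the Copy and Add rows below
to within rounding of the reported minimum times.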
---
 results/hip/nvidia-gtx-titan_x.txt | 22 ++++++++++++++++++++++
 1 file changed, 22 insertions(+)
 create mode 100644 results/hip/nvidia-gtx-titan_x.txt

diff --git a/results/hip/nvidia-gtx-titan_x.txt b/results/hip/nvidia-gtx-titan_x.txt
new file mode 100644
index 0000000..6bb94a3
--- /dev/null
+++ b/results/hip/nvidia-gtx-titan_x.txt
@@ -0,0 +1,22 @@
+GPU-STREAM
+Version: 1.0
+Implementation: HIP
+GridSize: 52428800 work-items
+GroupSize: 1024 work-items
+Operations/Work-item: 1
+Precision: double
+
+Running kernels 10 times
+Array size: 400.0 MB (=0.4 GB) 0 bytes padding
+Total size: 1200.0 MB (=1.2 GB)
+Using HIP device GeForce GTX TITAN X (compute_units=24)
+Driver: 4
+d_a=0x1306d80000
+d_b=0x131fd80000
+d_c=0x1338d80000
+Function    MBytes/sec  Min (sec)   Max         Average
+Copy        263042.207  0.00319     0.00320     0.00319
+Mul         262972.033  0.00319     0.00320     0.00319
+Add         268732.653  0.00468     0.00469     0.00469
+Triad       268706.197  0.00468     0.00469     0.00469
+GEOMEAN     265847.929

From 998985240110f5d402d8bee81dc16bd7a3935eee Mon Sep 17 00:00:00 2001
From: pensun
Date: Tue, 26 Apr 2016 14:10:32 -0500
Subject: [PATCH 09/10] Remove CLUMP_SIZE options; update warning message
 regarding rounding errors on float that do not apply to the HIP version

---
 common.cpp     |  4 ++--
 cuda-stream.cu | 10 +++++-----
 hip-stream.cpp | 10 +++++-----
 3 files changed, 12 insertions(+), 12 deletions(-)

diff --git a/common.cpp b/common.cpp
index 122376f..cb94e9e 100644
--- a/common.cpp
+++ b/common.cpp
@@ -131,12 +131,12 @@ void parseArguments(int argc, char *argv[])
         std::cout << "Invalid size" << std::endl;
         exit(1);
       }
-      
+
     }
     else if (!strcmp(argv[i], "--float"))
     {
       useFloat = true;
-      std::cout << "Warning: If number of iterations set >= 8, expect rounding errors with single precision" << std::endl;
+      std::cout << "Warning: If number of iterations set >= 8, expect rounding errors with single precision on CUDA version" << std::endl;
     }
     else if (!strcmp(argv[i], "--help") || !strcmp(argv[i], "-h"))
     {

diff --git a/cuda-stream.cu b/cuda-stream.cu
index 2049eb0..ea067fb 100644
--- a/cuda-stream.cu
+++ b/cuda-stream.cu
@@ -64,12 +64,12 @@ void check_cuda_error(void)
 
 // looper function place more work inside each work item.
 // Goal is reduce the dispatch overhead for each group, and also give more controlover the order of memory operations
-template <typename T, int CLUMP_SIZE>
+template <typename T>
 __global__ void
 copy_looper(const T * a, T * c, int ARRAY_SIZE)
 {
-    int offset = (blockDim.x * blockIdx.x + threadIdx.x)*CLUMP_SIZE;
-    int stride = blockDim.x * gridDim.x * CLUMP_SIZE;
+    int offset = (blockDim.x * blockIdx.x + threadIdx.x);
+    int stride = blockDim.x * gridDim.x;
 
     for (int i=offset; i<ARRAY_SIZE; i+=stride) {
         c[i] = a[i];
     }
 }
@@ -297,10 +297,10 @@ int main(int argc, char *argv[])
     if (groups) {
       if (useFloat)
-        copy_looper<float, 1><<<gridSize, groupSize>>>((float*)d_a, (float*)d_c, ARRAY_SIZE);
+        copy_looper<<<gridSize, groupSize>>>((float*)d_a, (float*)d_c, ARRAY_SIZE);
       else
-        copy_looper<double, 1><<<gridSize, groupSize>>>((double*)d_a, (double*)d_c, ARRAY_SIZE);
+        copy_looper<<<gridSize, groupSize>>>((double*)d_a, (double*)d_c, ARRAY_SIZE);
     } else {
       if (useFloat)
         copy<<<ARRAY_SIZE/groupSize, groupSize>>>((float*)d_a, (float*)d_c);

diff --git a/hip-stream.cpp b/hip-stream.cpp
index 6984b44..d5fc133 100644
--- a/hip-stream.cpp
+++ b/hip-stream.cpp
@@ -67,12 +67,12 @@ void check_cuda_error(void)
 
 // looper function place more work inside each work item.
 // Goal is reduce the dispatch overhead for each group, and also give more controlover the order of memory operations
-template <typename T, int CLUMP_SIZE>
+template <typename T>
 __global__ void
 copy_looper(hipLaunchParm lp, const T * a, T * c, int ARRAY_SIZE)
 {
-    int offset = (hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x)*CLUMP_SIZE;
-    int stride = hipBlockDim_x * hipGridDim_x * CLUMP_SIZE;
+    int offset = (hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x);
+    int stride = hipBlockDim_x * hipGridDim_x;
 
     for (int i=offset; i<ARRAY_SIZE; i+=stride) {
         c[i] = a[i];
     }
 }
@@ -320,10 +320,10 @@ int main(int argc, char *argv[])
     if (groups) {
       if (useFloat)
-        hipLaunchKernel(HIP_KERNEL_NAME(copy_looper<float, 1>), dim3(gridSize), dim3(groupSize), 0, 0, (float*)d_a, (float*)d_c, ARRAY_SIZE);
+        hipLaunchKernel(HIP_KERNEL_NAME(copy_looper<float>), dim3(gridSize), dim3(groupSize), 0, 0, (float*)d_a, (float*)d_c, ARRAY_SIZE);
       else
-        hipLaunchKernel(HIP_KERNEL_NAME(copy_looper<double, 1>), dim3(gridSize), dim3(groupSize), 0, 0, (double*)d_a, (double*)d_c, ARRAY_SIZE);
+        hipLaunchKernel(HIP_KERNEL_NAME(copy_looper<double>), dim3(gridSize), dim3(groupSize), 0, 0, (double*)d_a, (double*)d_c, ARRAY_SIZE);
     } else {
       if (useFloat)
         hipLaunchKernel(HIP_KERNEL_NAME(copy), dim3(ARRAY_SIZE/groupSize), dim3(groupSize), 0, 0, (float*)d_a, (float*)d_c);

From a8ebdc143882b618c5c99f993b0db5b12fdf615f Mon Sep 17 00:00:00 2001
From: pensun
Date: Tue, 26 Apr 2016 14:21:52 -0500
Subject: [PATCH 10/10] Change the warning, stating that the rounding error on
 float does not apply to AMD devices

---
 common.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/common.cpp b/common.cpp
index cb94e9e..3a6c56f 100644
--- a/common.cpp
+++ b/common.cpp
@@ -136,7 +136,7 @@ void parseArguments(int argc, char *argv[])
     else if (!strcmp(argv[i], "--float"))
     {
       useFloat = true;
-      std::cout << "Warning: If number of iterations set >= 8, expect rounding errors with single precision on CUDA version" << std::endl;
+      std::cout << "Warning: If number of iterations set >= 8, expect rounding errors with single precision; this does not apply to AMD devices" << std::endl;
     }
     else if (!strcmp(argv[i], "--help") || !strcmp(argv[i], "-h"))
     {
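
The looper kernels added in patches 02 and 05 all share one indexing scheme:
work-item tid starts at offset tid and strides by the total number of
work-items until it passes ARRAY_SIZE, so a groups x groupSize launch covers
every element exactly once with no overlap. A host-side sketch of that
property, assuming the launch yields groups * groupSize work-items in total
(the sizes below are illustrative, not the benchmark defaults):

    #include <cassert>
    #include <vector>
    int main()
    {
        const int ARRAY_SIZE = 1 << 20;
        const int groups = 64, groupSize = 256;  // as in runhip.sh/runcuda.sh
        const int workItems = groups * groupSize;
        std::vector<int> hits(ARRAY_SIZE, 0);
        for (int tid = 0; tid < workItems; ++tid)      // simulate each work-item
            for (int i = tid; i < ARRAY_SIZE; i += workItems)
                ++hits[i];                             // one strided "looper" access
        for (int h : hits) assert(h == 1);             // full, non-overlapping coverage
        return 0;
    }

Because each index i maps to the unique work-item tid = i % workItems, the
stride loop partitions the array regardless of whether ARRAY_SIZE divides
evenly, which is what lets the looper path drop the divisibility requirement
of the one-element-per-thread kernels.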