diff --git a/Makefile b/Makefile
index 4fb5f7a..c2bc9b2 100644
--- a/Makefile
+++ b/Makefile
@@ -6,7 +6,8 @@ ifeq ($(PLATFORM), Darwin)
 LDLIBS = -framework OpenCL
 endif
 
-all: gpu-stream-ocl gpu-stream-cuda
+all: gpu-stream-ocl gpu-stream-cuda gpu-stream-hip
+
 gpu-stream-ocl: ocl-stream.cpp common.o Makefile
 	$(CXX) $(CXXFLAGS) -Wno-deprecated-declarations common.o $< -o $@ $(LDLIBS)
 
@@ -19,9 +20,22 @@ ifeq ($(shell which nvcc > /dev/null; echo $$?), 0)
 else
 	$(error "Cannot find nvcc, please install CUDA toolkit")
 endif
+HIP_PATH?=../../..
+HIPCC=$(HIP_PATH)/bin/hipcc
+
+hip-stream.o : hip-stream.cpp
+	$(HIPCC) $(CXXFLAGS) -c $< -o $@
+
+gpu-stream-hip: hip-stream.o common.o Makefile
+ifeq ($(shell which $(HIPCC) > /dev/null; echo $$?), 0)
+	$(HIPCC) $(CXXFLAGS) common.o $< -lm -o $@
+else
+	$(error "Cannot find $(HIPCC), please install HIP toolkit")
+endif
+
 .PHONY: clean
 clean:
-	rm -f gpu-stream-ocl gpu-stream-cuda *.o
+	rm -f gpu-stream-ocl gpu-stream-cuda gpu-stream-hip *.o
diff --git a/README.md b/README.md
index 5a84b50..f54874e 100644
--- a/README.md
+++ b/README.md
@@ -13,6 +13,15 @@
 Build the OpenCL and CUDA binaries with `make` (CUDA version requires CUDA >= v6)
 
 Run the OpenCL version with `./gpu-stream-ocl` and the CUDA version with `./gpu-stream-cuda`
 
+For the HIP version, follow the instructions in the blog post below to install the ROCK and ROCR drivers:
+http://gpuopen.com/getting-started-with-boltzmann-components-platforms-installation/
+Install the HCC compiler:
+https://bitbucket.org/multicoreware/hcc/wiki/Home
+Install HIP:
+https://github.com/GPUOpen-ProfessionalCompute-Tools/HIP
+
+Build the HIP binary with `make gpu-stream-hip` and run it with `./gpu-stream-hip`
+
 Android
 -------
diff --git a/common.cpp b/common.cpp
index 781d70e..3a6c56f 100644
--- a/common.cpp
+++ b/common.cpp
@@ -39,10 +39,13 @@
 // Default array size 50 * 2^20 (50*8 Mebibytes double precision)
 // Use binary powers of two so divides 1024
 unsigned int ARRAY_SIZE = 52428800;
+size_t ARRAY_PAD_BYTES = 0;
 unsigned int NTIMES = 10;
 
 bool useFloat = false;
 
+unsigned int groups = 0;
+unsigned int groupSize = 1024;
 unsigned int deviceIndex = 0;
 
@@ -53,6 +56,25 @@ int parseUInt(const char *str, unsigned int *output)
   return !strlen(next);
 }
 
+int parseSize(const char *str, size_t *output)
+{
+  char *next;
+  *output = strtoull(str, &next, 0);
+  int l = strlen(str);
+  if (l) {
+    char c = str[l-1]; // last character
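+    // Scale by a binary suffix: a trailing 'k'/'K' multiplies the parsed value
+    // by 1024 and 'm'/'M' by 1024*1024 (e.g. "64K" -> 65536).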
+    if ((c == 'k') || (c == 'K')) {
+      *output *= 1024;
+    }
+    if ((c == 'm') || (c == 'M')) {
+      *output *= (1024*1024);
+    }
+
+  }
+  return !strlen(next);
+}
+
+
 void parseArguments(int argc, char *argv[])
 {
   for (int i = 1; i < argc; i++)
@@ -86,10 +108,35 @@ void parseArguments(int argc, char *argv[])
         exit(1);
       }
     }
+    else if (!strcmp(argv[i], "--groups"))
+    {
+      if (++i >= argc || !parseUInt(argv[i], &groups))
+      {
+        std::cout << "Invalid group number" << std::endl;
+        exit(1);
+      }
+    }
+    else if (!strcmp(argv[i], "--groupSize"))
+    {
+      if (++i >= argc || !parseUInt(argv[i], &groupSize))
+      {
+        std::cout << "Invalid group size" << std::endl;
+        exit(1);
+      }
+    }
+    else if (!strcmp(argv[i], "--pad"))
+    {
+      if (++i >= argc || !parseSize(argv[i], &ARRAY_PAD_BYTES))
+      {
+        std::cout << "Invalid pad size" << std::endl;
+        exit(1);
+      }
+    }
     else if (!strcmp(argv[i], "--float"))
     {
       useFloat = true;
-      std::cout << "Warning: If number of iterations set >= 8, expect rounding errors with single precision" << std::endl;
+      std::cout << "Warning: If the number of iterations is set >= 8, expect rounding errors with single precision (does not apply to AMD devices)" << std::endl;
     }
     else if (!strcmp(argv[i], "--help") || !strcmp(argv[i], "-h"))
     {
@@ -101,6 +148,9 @@ void parseArguments(int argc, char *argv[])
       std::cout << "      --device     INDEX   Select device at INDEX" << std::endl;
       std::cout << "  -s  --arraysize  SIZE    Use SIZE elements in the array" << std::endl;
       std::cout << "  -n  --numtimes   NUM     Run the test NUM times (NUM >= 2)" << std::endl;
+      std::cout << "      --groups     NUM     Set the number of groups to launch; each work-item processes multiple array elements" << std::endl;
+      std::cout << "      --groupSize  SIZE    Set the size of each group (default 1024)" << std::endl;
+      std::cout << "      --pad        BYTES   Add additional array padding; accepts a trailing K (KiB) or M (MiB)" << std::endl;
       std::cout << "      --float              Use floats (rather than doubles)" << std::endl;
       std::cout << std::endl;
       exit(0);
diff --git a/common.h b/common.h
index a4dd886..9cf61d7 100644
--- a/common.h
+++ b/common.h
@@ -48,8 +48,11 @@ extern void parseArguments(int argc, char *argv[]);
 extern void listDevices(void);
 
 extern unsigned int ARRAY_SIZE;
+extern size_t ARRAY_PAD_BYTES;
 extern unsigned int NTIMES;
 
+extern unsigned int groups;
+extern unsigned int groupSize;
 extern bool useFloat;
 
 extern unsigned int deviceIndex;
diff --git a/cuda-stream.cu b/cuda-stream.cu
index 2ab3adb..ea067fb 100644
--- a/cuda-stream.cu
+++ b/cuda-stream.cu
@@ -62,6 +62,59 @@ void check_cuda_error(void)
   }
 }
 
+// Looper functions place more work inside each work-item.
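+// Each work-item walks the array with a grid-stride loop (stride = blockDim.x * gridDim.x),
+// so the kernels remain correct for any grid/group configuration.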
+// The goal is to reduce the dispatch overhead for each group and to give more control over the order of memory operations.
+template <typename T>
+__global__ void
+copy_looper(const T * a, T * c, int ARRAY_SIZE)
+{
+    int offset = (blockDim.x * blockIdx.x + threadIdx.x);
+    int stride = blockDim.x * gridDim.x;
+
+    for (int i=offset; i<ARRAY_SIZE; i+=stride) {
+        c[i] = a[i];
+    }
+}
+
+template <typename T>
+__global__ void
+mul_looper(T * b, const T * c, int ARRAY_SIZE)
+{
+    int offset = blockDim.x * blockIdx.x + threadIdx.x;
+    int stride = blockDim.x * gridDim.x;
+    const T scalar = 3.0;
+
+    for (int i=offset; i<ARRAY_SIZE; i+=stride) {
+        b[i] = scalar * c[i];
+    }
+}
+
+template <typename T>
+__global__ void
+add_looper(const T * a, const T * b, T * c, int ARRAY_SIZE)
+{
+    int offset = blockDim.x * blockIdx.x + threadIdx.x;
+    int stride = blockDim.x * gridDim.x;
+
+    for (int i=offset; i<ARRAY_SIZE; i+=stride) {
+        c[i] = a[i] + b[i];
+    }
+}
+
+template <typename T>
+__global__ void
+triad_looper(T * a, const T * b, const T * c, int ARRAY_SIZE)
+{
+    int offset = blockDim.x * blockIdx.x + threadIdx.x;
+    int stride = blockDim.x * gridDim.x;
+    const T scalar = 3.0;
+
+    for (int i=offset; i<ARRAY_SIZE; i+=stride) {
+        a[i] = b[i] + scalar * c[i];
+    }
+}
+
 template <typename T>
 __global__ void copy(const T * a, T * c)
 {
@@ -106,6 +159,20 @@ int main(int argc, char *argv[])
   if (NTIMES < 2)
     throw std::runtime_error("Chosen number of times is invalid, must be >= 2");
 
+  // Configure grid size and group size for kernel launches
+  int gridSize;
+  if (groups) {
+    gridSize = groups * groupSize;
+  } else {
+    gridSize = ARRAY_SIZE;
+  }
+
+  float operationsPerWorkitem = (float)ARRAY_SIZE / (float)gridSize;
+  std::cout << "GridSize: " << gridSize << " work-items" << std::endl;
+  std::cout << "GroupSize: " << groupSize << " work-items" << std::endl;
+  std::cout << "Operations/Work-item: " << operationsPerWorkitem << std::endl;
+  if (groups) std::cout << "Using looper kernels" << std::endl;
+
   std::cout << "Precision: ";
   if (useFloat) std::cout << "float";
   else std::cout << "double";
@@ -211,6 +278,10 @@ int main(int argc, char *argv[])
   cudaMemcpy(d_c, h_c, ARRAY_SIZE*DATATYPE_SIZE, cudaMemcpyHostToDevice);
   check_cuda_error();
 
+  std::cout << "d_a=" << (void*)d_a << std::endl;
+  std::cout << "d_b=" << (void*)d_b << std::endl;
+  std::cout << "d_c=" << (void*)d_c << std::endl;
+
   // Make sure the copies are finished
   cudaDeviceSynchronize();
   check_cuda_error();
@@ -226,10 +297,18 @@ int main(int argc, char *argv[])
   {
     std::vector<double> times;
     t1 = std::chrono::high_resolution_clock::now();
-    if (useFloat)
-      copy<<<ARRAY_SIZE/1024, 1024>>>((float*)d_a, (float*)d_c);
-    else
-      copy<<<ARRAY_SIZE/1024, 1024>>>((double*)d_a, (double*)d_c);
+    if (groups) {
+      if (useFloat)
+        copy_looper<<<gridSize, groupSize>>>((float*)d_a, (float*)d_c, ARRAY_SIZE);
+      else
+        copy_looper<<<gridSize, groupSize>>>((double*)d_a, (double*)d_c, ARRAY_SIZE);
+    } else {
+      if (useFloat)
+        copy<<<ARRAY_SIZE/groupSize, groupSize>>>((float*)d_a, (float*)d_c);
+      else
+        copy<<<ARRAY_SIZE/groupSize, groupSize>>>((double*)d_a, (double*)d_c);
+    }
+
     check_cuda_error();
     cudaDeviceSynchronize();
     check_cuda_error();
@@ -238,10 +317,17 @@ int main(int argc, char *argv[])
 
     t1 = std::chrono::high_resolution_clock::now();
-    if (useFloat)
-      mul<<<ARRAY_SIZE/1024, 1024>>>((float*)d_b, (float*)d_c);
-    else
-      mul<<<ARRAY_SIZE/1024, 1024>>>((double*)d_b, (double*)d_c);
+    if (groups) {
+      if (useFloat)
+        mul_looper<<<gridSize, groupSize>>>((float*)d_b, (float*)d_c, ARRAY_SIZE);
+      else
+        mul_looper<<<gridSize, groupSize>>>((double*)d_b, (double*)d_c, ARRAY_SIZE);
+    } else {
+      if (useFloat)
+        mul<<<ARRAY_SIZE/groupSize, groupSize>>>((float*)d_b, (float*)d_c);
+      else
+        mul<<<ARRAY_SIZE/groupSize, groupSize>>>((double*)d_b, (double*)d_c);
+    }
     check_cuda_error();
     cudaDeviceSynchronize();
     check_cuda_error();
@@ -250,10 +336,17 @@ int main(int argc, char *argv[])
 
     t1 = std::chrono::high_resolution_clock::now();
-    if (useFloat)
-      add<<<ARRAY_SIZE/1024, 1024>>>((float*)d_a, (float*)d_b, (float*)d_c);
-    else
-      add<<<ARRAY_SIZE/1024, 1024>>>((double*)d_a, (double*)d_b, (double*)d_c);
+    if (groups) {
+      if (useFloat)
+        add_looper<<<gridSize, groupSize>>>((float*)d_a, (float*)d_b, (float*)d_c, ARRAY_SIZE);
+      else
+        add_looper<<<gridSize, groupSize>>>((double*)d_a, (double*)d_b, (double*)d_c, ARRAY_SIZE);
+    } else {
+      if (useFloat)
+        add<<<ARRAY_SIZE/groupSize, groupSize>>>((float*)d_a, (float*)d_b, (float*)d_c);
+      else
+        add<<<ARRAY_SIZE/groupSize, groupSize>>>((double*)d_a, (double*)d_b, (double*)d_c);
+    }
     check_cuda_error();
     cudaDeviceSynchronize();
     check_cuda_error();
@@ -262,10 +355,17 @@ int main(int argc, char *argv[])
 
     t1 = std::chrono::high_resolution_clock::now();
-    if (useFloat)
-      triad<<<ARRAY_SIZE/1024, 1024>>>((float*)d_a, (float*)d_b, (float*)d_c);
-    else
-      triad<<<ARRAY_SIZE/1024, 1024>>>((double*)d_a, (double*)d_b, (double*)d_c);
+    if (groups) {
+      if (useFloat)
+        triad_looper<<<gridSize, groupSize>>>((float*)d_a, (float*)d_b, (float*)d_c, ARRAY_SIZE);
+      else
+        triad_looper<<<gridSize, groupSize>>>((double*)d_a, (double*)d_b, (double*)d_c, ARRAY_SIZE);
+    } else {
+      if (useFloat)
+        triad<<<ARRAY_SIZE/groupSize, groupSize>>>((float*)d_a, (float*)d_b, (float*)d_c);
+      else
+        triad<<<ARRAY_SIZE/groupSize, groupSize>>>((double*)d_a, (double*)d_b, (double*)d_c);
+    }
     check_cuda_error();
     cudaDeviceSynchronize();
     check_cuda_error();
@@ -318,6 +418,12 @@ int main(int argc, char *argv[])
   for (int j = 0; j < 4; j++)
     avg[j] /= (double)(NTIMES-1);
 
+  double geomean = 1.0;
+  for (int j = 0; j < 4; j++) {
+    geomean *= (sizes[j]/min[j]);
+  }
+  geomean = pow(geomean, 0.25);
+
   // Display results
   std::string labels[] = {"Copy", "Mul", "Add", "Triad"};
   std::cout
@@ -338,6 +444,10 @@ int main(int argc, char *argv[])
       << std::left << std::setw(12) << std::setprecision(5) << avg[j]
       << std::endl;
   }
+  std::cout
+    << std::left << std::setw(12) << "GEOMEAN"
+    << std::left << std::setw(12) << std::setprecision(3) << 1.0E-06 * geomean
+    << std::endl;
 
   // Free host vectors
   free(h_a);
diff --git a/hip-stream.cpp b/hip-stream.cpp
new file mode 100644
index 0000000..d5fc133
--- /dev/null
+++ b/hip-stream.cpp
@@ -0,0 +1,531 @@
+#include "hip_runtime.h"
+/*=============================================================================
+*------------------------------------------------------------------------------
+* Copyright 2015: Tom Deakin, Simon McIntosh-Smith, University of Bristol HPC
+* Based on John D. McCalpin's original STREAM benchmark for CPUs
+*------------------------------------------------------------------------------
+* License:
+*   1. You are free to use this program and/or to redistribute
+*      this program.
+*   2. You are free to modify this program for your own use,
+*      including commercial use, subject to the publication
+*      restrictions in item 3.
+*   3. You are free to publish results obtained from running this
+*      program, or from works that you derive from this program,
+*      with the following limitations:
+*      3a. In order to be referred to as "GPU-STREAM benchmark results",
+*          published results must be in conformance to the GPU-STREAM
+*          Run Rules published at
+*          http://github.com/UoB-HPC/GPU-STREAM/wiki/Run-Rules
+*          and incorporated herein by reference.
+*          The copyright holders retain the
+*          right to determine conformity with the Run Rules.
+*      3b. Results based on modified source code or on runs not in
+*          accordance with the GPU-STREAM Run Rules must be clearly
+*          labelled whenever they are published. Examples of
+*          proper labelling include:
+*          "tuned GPU-STREAM benchmark results"
+*          "based on a variant of the GPU-STREAM benchmark code"
+*          Other comparable, clear and reasonable labelling is
+*          acceptable.
+*      3c. Submission of results to the GPU-STREAM benchmark web site
+*          is encouraged, but not required.
+*   4. Use of this program or creation of derived works based on this
+*      program constitutes acceptance of these licensing restrictions.
+*   5. Absolutely no warranty is expressed or implied.
+*———————————————————————————————————-----------------------------------------*/ + + +#include +#include +#include +#include +#include +#include + +//#include +#include "common.h" + +std::string getDeviceName(int device); +int getDriver(void); + +// Code to check CUDA errors +void check_cuda_error(void) +{ + hipError_t err = hipGetLastError(); + if (err != hipSuccess) + { + std::cerr + << "Error: " + << hipGetErrorString(err) + << std::endl; + exit(err); + } +} + + + +// looper function place more work inside each work item. +// Goal is reduce the dispatch overhead for each group, and also give more controlover the order of memory operations +template +__global__ void +copy_looper(hipLaunchParm lp, const T * a, T * c, int ARRAY_SIZE) +{ + int offset = (hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x); + int stride = hipBlockDim_x * hipGridDim_x; + + for (int i=offset; i +__global__ void +mul_looper(hipLaunchParm lp, T * b, const T * c, int ARRAY_SIZE) +{ + int offset = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x; + int stride = hipBlockDim_x * hipGridDim_x; + const T scalar = 3.0; + + for (int i=offset; i +__global__ void +add_looper(hipLaunchParm lp, const T * a, const T * b, T * c, int ARRAY_SIZE) +{ + int offset = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x; + int stride = hipBlockDim_x * hipGridDim_x; + + for (int i=offset; i +__global__ void +triad_looper(hipLaunchParm lp, T * a, const T * b, const T * c, int ARRAY_SIZE) +{ + int offset = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x; + int stride = hipBlockDim_x * hipGridDim_x; + const T scalar = 3.0; + + for (int i=offset; i +__global__ void +copy(hipLaunchParm lp, const T * a, T * c) +{ + const int i = hipBlockDim_x * hipBlockIdx_x + hipThreadIdx_x; + c[i] = a[i]; +} + + +template +__global__ void +mul(hipLaunchParm lp, T * b, const T * c) +{ + const T scalar = 3.0; + const int i = hipBlockDim_x * hipBlockIdx_x + hipThreadIdx_x; + b[i] = scalar * c[i]; +} + +template +__global__ void +add(hipLaunchParm lp, const T * a, const T * b, T * c) +{ + const int i = hipBlockDim_x * hipBlockIdx_x + hipThreadIdx_x; + c[i] = a[i] + b[i]; +} + +template +__global__ void +triad(hipLaunchParm lp, T * a, const T * b, const T * c) +{ + const T scalar = 3.0; + const int i = hipBlockDim_x * hipBlockIdx_x + hipThreadIdx_x; + a[i] = b[i] + scalar * c[i]; +} + +int main(int argc, char *argv[]) +{ + + // Print out run information + std::cout + << "GPU-STREAM" << std::endl + << "Version: " << VERSION_STRING << std::endl + << "Implementation: HIP" << std::endl; + + parseArguments(argc, argv); + + if (NTIMES < 2) + throw std::runtime_error("Chosen number of times is invalid, must be >= 2"); + + // Config grid size and group size for kernel launching + int gridSize; + if (groups) { + gridSize = groups * groupSize; + } else { + gridSize = ARRAY_SIZE; + } + + float operationsPerWorkitem = (float)ARRAY_SIZE / (float)gridSize; + std::cout << "GridSize: " << gridSize << " work-items" << std::endl; + std::cout << "GroupSize: " << groupSize << " work-items" << std::endl; + std::cout << "Operations/Work-item: " << operationsPerWorkitem << std::endl; + if (groups) std::cout << "Using looper kernels:" << std::endl; + + std::cout << "Precision: "; + if (useFloat) std::cout << "float"; + else std::cout << "double"; + std::cout << std::endl << std::endl; + + std::cout << "Running kernels " << NTIMES << " times" << std::endl; + + if (ARRAY_SIZE % 1024 != 0) + { + unsigned int OLD_ARRAY_SIZE = ARRAY_SIZE; + ARRAY_SIZE -= ARRAY_SIZE % 1024; + std::cout + << "Warning: 
+      << "Warning: array size must be a multiple of 1024" << std::endl
+      << "Resizing array from " << OLD_ARRAY_SIZE
+      << " to " << ARRAY_SIZE << std::endl;
+    if (ARRAY_SIZE == 0)
+      throw std::runtime_error("Array size must be >= 1024");
+  }
+
+  // Get precision (used to reset later)
+  std::streamsize ss = std::cout.precision();
+
+  size_t DATATYPE_SIZE;
+
+  if (useFloat)
+  {
+    DATATYPE_SIZE = sizeof(float);
+  }
+  else
+  {
+    DATATYPE_SIZE = sizeof(double);
+  }
+
+  // Display number of bytes in array
+  std::cout << std::setprecision(1) << std::fixed
+    << "Array size: " << ARRAY_SIZE*DATATYPE_SIZE/1024.0/1024.0 << " MB"
+    << " (=" << ARRAY_SIZE*DATATYPE_SIZE/1024.0/1024.0/1024.0 << " GB)"
+    << " " << ARRAY_PAD_BYTES << " bytes padding"
+    << std::endl;
+  std::cout << "Total size: " << 3.0*(ARRAY_SIZE*DATATYPE_SIZE + ARRAY_PAD_BYTES)/1024.0/1024.0 << " MB"
+    << " (=" << 3.0*(ARRAY_SIZE*DATATYPE_SIZE + ARRAY_PAD_BYTES)/1024.0/1024.0/1024.0 << " GB)"
+    << std::endl;
+
+  // Reset precision
+  std::cout.precision(ss);
+
+  // Check device index is in range
+  int count;
+  hipGetDeviceCount(&count);
+  check_cuda_error();
+  if (deviceIndex >= count)
+    throw std::runtime_error("Chosen device index is invalid");
+  hipSetDevice(deviceIndex);
+  check_cuda_error();
+
+
+  hipDeviceProp_t props;
+  hipGetDeviceProperties(&props, deviceIndex);
+
+  // Print out device name
+  std::cout << "Using HIP device " << getDeviceName(deviceIndex) << " (compute_units=" << props.multiProcessorCount << ")" << std::endl;
+
+  // Print out device HIP driver version
+  std::cout << "Driver: " << getDriver() << std::endl;
+
+
+
+
+  // Check buffers fit on the device
+  if (props.totalGlobalMem < 3*DATATYPE_SIZE*ARRAY_SIZE)
+    throw std::runtime_error("Device does not have enough memory for all 3 buffers");
+
+  //int cus = props.multiProcessorCount;
+
+  // Create host vectors
+  void * h_a = malloc(ARRAY_SIZE*DATATYPE_SIZE);
+  void * h_b = malloc(ARRAY_SIZE*DATATYPE_SIZE);
+  void * h_c = malloc(ARRAY_SIZE*DATATYPE_SIZE);
+
+  // Initialise arrays
+  for (unsigned int i = 0; i < ARRAY_SIZE; i++)
+  {
+    if (useFloat)
+    {
+      ((float*)h_a)[i] = 1.0f;
+      ((float*)h_b)[i] = 2.0f;
+      ((float*)h_c)[i] = 0.0f;
+    }
+    else
+    {
+      ((double*)h_a)[i] = 1.0;
+      ((double*)h_b)[i] = 2.0;
+      ((double*)h_c)[i] = 0.0;
+    }
+  }
+
+  // Create device buffers
+  char * d_a, * d_b, *d_c;
+  hipMalloc(&d_a, ARRAY_SIZE*DATATYPE_SIZE + ARRAY_PAD_BYTES);
+  check_cuda_error();
+  hipMalloc(&d_b, ARRAY_SIZE*DATATYPE_SIZE + ARRAY_PAD_BYTES);
+  d_b += ARRAY_PAD_BYTES;
+  check_cuda_error();
+  hipMalloc(&d_c, ARRAY_SIZE*DATATYPE_SIZE + ARRAY_PAD_BYTES);
+  d_c += ARRAY_PAD_BYTES;
+  check_cuda_error();
+
+  // Copy host memory to device
+  hipMemcpy(d_a, h_a, ARRAY_SIZE*DATATYPE_SIZE, hipMemcpyHostToDevice);
+  check_cuda_error();
+  hipMemcpy(d_b, h_b, ARRAY_SIZE*DATATYPE_SIZE, hipMemcpyHostToDevice);
+  check_cuda_error();
+  hipMemcpy(d_c, h_c, ARRAY_SIZE*DATATYPE_SIZE, hipMemcpyHostToDevice);
+  check_cuda_error();
+
+
+  std::cout << "d_a=" << (void*)d_a << std::endl;
+  std::cout << "d_b=" << (void*)d_b << std::endl;
+  std::cout << "d_c=" << (void*)d_c << std::endl;
+
+  // Make sure the copies are finished
+  hipDeviceSynchronize();
+  check_cuda_error();
+
+
+
+  // List of times
+  std::vector< std::vector<double> > timings;
+
+  // Declare timers
+  std::chrono::high_resolution_clock::time_point t1, t2;
+
+  // Main loop
+  for (unsigned int k = 0; k < NTIMES; k++)
+  {
+    std::vector<double> times;
+    t1 = std::chrono::high_resolution_clock::now();
+    if (groups) {
+      if (useFloat)
+        hipLaunchKernel(HIP_KERNEL_NAME(copy_looper), dim3(gridSize), dim3(groupSize), 0, 0, (float*)d_a, (float*)d_c, ARRAY_SIZE);
+      else
+        hipLaunchKernel(HIP_KERNEL_NAME(copy_looper), dim3(gridSize), dim3(groupSize), 0, 0, (double*)d_a, (double*)d_c, ARRAY_SIZE);
+    } else {
+      if (useFloat)
+        hipLaunchKernel(HIP_KERNEL_NAME(copy), dim3(ARRAY_SIZE/groupSize), dim3(groupSize), 0, 0, (float*)d_a, (float*)d_c);
+      else
+        hipLaunchKernel(HIP_KERNEL_NAME(copy), dim3(ARRAY_SIZE/groupSize), dim3(groupSize), 0, 0, (double*)d_a, (double*)d_c);
+    }
+    check_cuda_error();
+    hipDeviceSynchronize();
+    check_cuda_error();
+    t2 = std::chrono::high_resolution_clock::now();
+    times.push_back(std::chrono::duration_cast<std::chrono::duration<double> >(t2 - t1).count());
+
+
+    t1 = std::chrono::high_resolution_clock::now();
+    if (groups) {
+      if (useFloat)
+        hipLaunchKernel(HIP_KERNEL_NAME(mul_looper), dim3(gridSize), dim3(groupSize), 0, 0, (float*)d_b, (float*)d_c, ARRAY_SIZE);
+      else
+        hipLaunchKernel(HIP_KERNEL_NAME(mul_looper), dim3(gridSize), dim3(groupSize), 0, 0, (double*)d_b, (double*)d_c, ARRAY_SIZE);
+    } else {
+      if (useFloat)
+        hipLaunchKernel(HIP_KERNEL_NAME(mul), dim3(ARRAY_SIZE/groupSize), dim3(groupSize), 0, 0, (float*)d_b, (float*)d_c);
+      else
+        hipLaunchKernel(HIP_KERNEL_NAME(mul), dim3(ARRAY_SIZE/groupSize), dim3(groupSize), 0, 0, (double*)d_b, (double*)d_c);
+    }
+    check_cuda_error();
+    hipDeviceSynchronize();
+    check_cuda_error();
+    t2 = std::chrono::high_resolution_clock::now();
+    times.push_back(std::chrono::duration_cast<std::chrono::duration<double> >(t2 - t1).count());
+
+
+    t1 = std::chrono::high_resolution_clock::now();
+    if (groups) {
+      if (useFloat)
+        hipLaunchKernel(HIP_KERNEL_NAME(add_looper), dim3(gridSize), dim3(groupSize), 0, 0, (float*)d_a, (float*)d_b, (float*)d_c, ARRAY_SIZE);
+      else
+        hipLaunchKernel(HIP_KERNEL_NAME(add_looper), dim3(gridSize), dim3(groupSize), 0, 0, (double*)d_a, (double*)d_b, (double*)d_c, ARRAY_SIZE);
+    } else {
+      if (useFloat)
+        hipLaunchKernel(HIP_KERNEL_NAME(add), dim3(ARRAY_SIZE/groupSize), dim3(groupSize), 0, 0, (float*)d_a, (float*)d_b, (float*)d_c);
+      else
+        hipLaunchKernel(HIP_KERNEL_NAME(add), dim3(ARRAY_SIZE/groupSize), dim3(groupSize), 0, 0, (double*)d_a, (double*)d_b, (double*)d_c);
+    }
+    check_cuda_error();
+    hipDeviceSynchronize();
+    check_cuda_error();
+    t2 = std::chrono::high_resolution_clock::now();
+    times.push_back(std::chrono::duration_cast<std::chrono::duration<double> >(t2 - t1).count());
+
+
+    t1 = std::chrono::high_resolution_clock::now();
+    if (groups) {
+      if (useFloat)
+        hipLaunchKernel(HIP_KERNEL_NAME(triad_looper), dim3(gridSize), dim3(groupSize), 0, 0, (float*)d_a, (float*)d_b, (float*)d_c, ARRAY_SIZE);
+      else
+        hipLaunchKernel(HIP_KERNEL_NAME(triad_looper), dim3(gridSize), dim3(groupSize), 0, 0, (double*)d_a, (double*)d_b, (double*)d_c, ARRAY_SIZE);
+    } else {
+      if (useFloat)
+        hipLaunchKernel(HIP_KERNEL_NAME(triad), dim3(ARRAY_SIZE/groupSize), dim3(groupSize), 0, 0, (float*)d_a, (float*)d_b, (float*)d_c);
+      else
+        hipLaunchKernel(HIP_KERNEL_NAME(triad), dim3(ARRAY_SIZE/groupSize), dim3(groupSize), 0, 0, (double*)d_a, (double*)d_b, (double*)d_c);
+    }
+
+    check_cuda_error();
+    hipDeviceSynchronize();
+    check_cuda_error();
+    t2 = std::chrono::high_resolution_clock::now();
+    times.push_back(std::chrono::duration_cast<std::chrono::duration<double> >(t2 - t1).count());
+
+    timings.push_back(times);
+
+  }
+
+  // Check solutions
+  hipMemcpy(h_a, d_a, ARRAY_SIZE*DATATYPE_SIZE, hipMemcpyDeviceToHost);
+  check_cuda_error();
+  hipMemcpy(h_b, d_b, ARRAY_SIZE*DATATYPE_SIZE, hipMemcpyDeviceToHost);
+  check_cuda_error();
+  hipMemcpy(h_c, d_c, ARRAY_SIZE*DATATYPE_SIZE, hipMemcpyDeviceToHost);
+  check_cuda_error();
+
+  if (useFloat)
+  {
+    check_solution<float>(h_a, h_b, h_c);
+  }
+  else
+  {
+    check_solution<double>(h_a, h_b, h_c);
+  }
+
+  // Crunch results
+  size_t sizes[4] = {
+    2 * DATATYPE_SIZE * ARRAY_SIZE,
+    2 * DATATYPE_SIZE * ARRAY_SIZE,
+    3 * DATATYPE_SIZE * ARRAY_SIZE,
+    3 * DATATYPE_SIZE * ARRAY_SIZE
+  };
+  double min[4] = {DBL_MAX, DBL_MAX, DBL_MAX, DBL_MAX};
+  double max[4] = {0.0, 0.0, 0.0, 0.0};
+  double avg[4] = {0.0, 0.0, 0.0, 0.0};
+
+  // Ignore first result
+  for (unsigned int i = 1; i < NTIMES; i++)
+  {
+    for (int j = 0; j < 4; j++)
+    {
+      avg[j] += timings[i][j];
+      min[j] = std::min(min[j], timings[i][j]);
+      max[j] = std::max(max[j], timings[i][j]);
+    }
+  }
+
+  for (int j = 0; j < 4; j++) {
+    avg[j] /= (double)(NTIMES-1);
+  }
+
+  double geomean = 1.0;
+  for (int j = 0; j < 4; j++) {
+    geomean *= (sizes[j]/min[j]);
+  }
+  geomean = pow(geomean, 0.25);
+
+  // Display results
+  std::string labels[] = {"Copy", "Mul", "Add", "Triad"};
+  std::cout
+    << std::left << std::setw(12) << "Function"
+    << std::left << std::setw(12) << "MBytes/sec"
+    << std::left << std::setw(12) << "Min (sec)"
+    << std::left << std::setw(12) << "Max"
+    << std::left << std::setw(12) << "Average"
+    << std::endl;
+
+  for (int j = 0; j < 4; j++)
+  {
+    std::cout
+      << std::left << std::setw(12) << labels[j]
+      << std::left << std::setw(12) << std::setprecision(3) << 1.0E-06 * sizes[j]/min[j]
+      << std::left << std::setw(12) << std::setprecision(5) << min[j]
+      << std::left << std::setw(12) << std::setprecision(5) << max[j]
+      << std::left << std::setw(12) << std::setprecision(5) << avg[j]
+      << std::endl;
+  }
+  std::cout
+    << std::left << std::setw(12) << "GEOMEAN"
+    << std::left << std::setw(12) << std::setprecision(3) << 1.0E-06 * geomean
+    << std::endl;
+
+  // Free host vectors
+  free(h_a);
+  free(h_b);
+  free(h_c);
+
+  // Free device buffers (rewind the padding offset applied at allocation)
+  hipFree(d_a);
+  check_cuda_error();
+  hipFree(d_b - ARRAY_PAD_BYTES);
+  check_cuda_error();
+  hipFree(d_c - ARRAY_PAD_BYTES);
+  check_cuda_error();
+
+}
+
+std::string getDeviceName(int device)
+{
+  struct hipDeviceProp_t prop;
+  hipGetDeviceProperties(&prop, device);
+  check_cuda_error();
+  return std::string(prop.name);
+}
+
+int getDriver(void)
+{
+  int driver;
+  hipDriverGetVersion(&driver);
+  check_cuda_error();
+  return driver;
+}
+
+void listDevices(void)
+{
+  // Get number of devices
+  int count;
+  hipGetDeviceCount(&count);
+  check_cuda_error();
+
+  // Print device names
+  if (count == 0)
+  {
+    std::cout << "No devices found."
+              << std::endl;
+  }
+  else
+  {
+    std::cout << std::endl;
+    std::cout << "Devices:" << std::endl;
+    for (int i = 0; i < count; i++)
+    {
+      std::cout << i << ": " << getDeviceName(i) << std::endl;
+      check_cuda_error();
+    }
+    std::cout << std::endl;
+  }
+}
+
diff --git a/results/cuda/nvidia-gtx-titan_x.txt b/results/cuda/nvidia-gtx-titan_x.txt
new file mode 100644
index 0000000..6bb94a3
--- /dev/null
+++ b/results/cuda/nvidia-gtx-titan_x.txt
@@ -0,0 +1,22 @@
+GPU-STREAM
+Version: 1.0
+Implementation: HIP
+GridSize: 52428800 work-items
+GroupSize: 1024 work-items
+Operations/Work-item: 1
+Precision: double
+
+Running kernels 10 times
+Array size: 400.0 MB (=0.4 GB) 0 bytes padding
+Total size: 1200.0 MB (=1.2 GB)
+Using HIP device GeForce GTX TITAN X (compute_units=24)
+Driver: 4
+d_a=0x1306d80000
+d_b=0x131fd80000
+d_c=0x1338d80000
+Function    MBytes/sec  Min (sec)   Max         Average
+Copy        263042.207  0.00319     0.00320     0.00319
+Mul         262972.033  0.00319     0.00320     0.00319
+Add         268732.653  0.00468     0.00469     0.00469
+Triad       268706.197  0.00468     0.00469     0.00469
+GEOMEAN     265847.929
diff --git a/results/hip/amd-fiji-nano.txt b/results/hip/amd-fiji-nano.txt
new file mode 100644
index 0000000..df40ac8
--- /dev/null
+++ b/results/hip/amd-fiji-nano.txt
@@ -0,0 +1,15 @@
+GPU-STREAM
+Version: 1.0
+Implementation: CUDA
+Precision: double
+
+Running kernels 10 times
+Array size: 400.0 MB (=0.4 GB)
+Total size: 1200.0 MB (=1.2 GB)
+Using CUDA device Fiji
+Driver: 4
+Function    MBytes/sec  Min (sec)   Max         Average
+Copy        375822.410  0.00223     0.00225     0.00224
+Mul         375086.879  0.00224     0.00227     0.00224
+Add         425650.718  0.00296     0.00298     0.00297
+Triad       424710.113  0.00296     0.00298     0.00298
diff --git a/results/hip/nvidia-gtx-titan_x.txt b/results/hip/nvidia-gtx-titan_x.txt
new file mode 100644
index 0000000..6bb94a3
--- /dev/null
+++ b/results/hip/nvidia-gtx-titan_x.txt
@@ -0,0 +1,22 @@
+GPU-STREAM
+Version: 1.0
+Implementation: HIP
+GridSize: 52428800 work-items
+GroupSize: 1024 work-items
+Operations/Work-item: 1
+Precision: double
+
+Running kernels 10 times
+Array size: 400.0 MB (=0.4 GB) 0 bytes padding
+Total size: 1200.0 MB (=1.2 GB)
+Using HIP device GeForce GTX TITAN X (compute_units=24)
+Driver: 4
+d_a=0x1306d80000
+d_b=0x131fd80000
+d_c=0x1338d80000
+Function    MBytes/sec  Min (sec)   Max         Average
+Copy        263042.207  0.00319     0.00320     0.00319
+Mul         262972.033  0.00319     0.00320     0.00319
+Add         268732.653  0.00468     0.00469     0.00469
+Triad       268706.197  0.00468     0.00469     0.00469
+GEOMEAN     265847.929
diff --git a/runcuda.sh b/runcuda.sh
new file mode 100755
index 0000000..7acf5c1
--- /dev/null
+++ b/runcuda.sh
@@ -0,0 +1,4 @@
+./gpu-stream-cuda
+./gpu-stream-cuda --groups 64 --groupSize 256
+./gpu-stream-cuda --float
+./gpu-stream-cuda --float --groups 64 --groupSize 256
diff --git a/runhip.sh b/runhip.sh
new file mode 100755
index 0000000..b84e970
--- /dev/null
+++ b/runhip.sh
@@ -0,0 +1,6 @@
+./gpu-stream-hip
+./gpu-stream-hip --groups 64 --groupSize 256
+./gpu-stream-hip --float
+./gpu-stream-hip --float --groups 64 --groupSize 256
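+# Illustrative extra run exercising the new --pad option (sizes are examples only):
+./gpu-stream-hip --pad 256K --groups 64 --groupSize 256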