From 11053798ff7dca2bd8eb7a8e5bacac10758b5159 Mon Sep 17 00:00:00 2001
From: sunway513
Date: Tue, 15 Mar 2016 07:56:32 -0500
Subject: [PATCH] Improve the GPU-STREAM benchmark for the HIP version:

1. Add optional looper kernels that take the number of groups and the group size from the command line
2. Add a GEOMEAN value computed across the kernels
3. Add instructions on configuring the HIP environment to README.md
4. Add results for HIP on FIJI Nano and TITAN X; CUDA on TITAN X
5. Add a run script to optionally run the HIP version with the --groups and --groupSize options
---
 README.md      |   4 +-
 common.cpp     |  50 ++++++++++++
 common.h       |   3 +
 hip-stream.cpp | 213 +++++++++++++++++++++++++++++++++++++++----------
 runhip.sh      |   4 +
 5 files changed, 233 insertions(+), 41 deletions(-)
 create mode 100755 runhip.sh

diff --git a/README.md b/README.md
index a2f6ab6..c776347 100644
--- a/README.md
+++ b/README.md
@@ -15,7 +15,9 @@ Run the OpenCL version with `./gpu-stream-ocl` and the CUDA version with `./gpu-
 
 For HIP version, follow the instructions on the following blog to properly install ROCK and ROCR drivers:
 http://gpuopen.com/getting-started-with-boltzmann-components-platforms-installation/
-Clone from the HIP repository in the following link:
+Install the HCC compiler:
+https://bitbucket.org/multicoreware/hcc/wiki/Home
+Install HIP:
 https://github.com/GPUOpen-ProfessionalCompute-Tools/HIP
 
 Build the HIP binaries with make gpu-stream-hip, run it with './gpu-stream-hip'
diff --git a/common.cpp b/common.cpp
index 781d70e..122376f 100644
--- a/common.cpp
+++ b/common.cpp
@@ -39,10 +39,13 @@
 // Default array size 50 * 2^20 (50*8 Mebibytes double precision)
 // Use binary powers of two so divides 1024
 unsigned int ARRAY_SIZE = 52428800;
+size_t ARRAY_PAD_BYTES = 0;
 unsigned int NTIMES = 10;
 
 bool useFloat = false;
+unsigned int groups = 0;
+unsigned int groupSize = 1024;
 
 unsigned int deviceIndex = 0;
 
@@ -53,6 +56,25 @@ int parseUInt(const char *str, unsigned int *output)
   return !strlen(next);
 }
 
+int parseSize(const char *str, size_t *output)
+{
+  char *next;
+  *output = strtoull(str, &next, 0);
+  int l = strlen(str);
+  if (l) {
+    char c = str[l-1]; // last char.
+    if ((c == 'k') || (c == 'K')) {
+      *output *= 1024;
+    }
+    if ((c == 'm') || (c == 'M')) {
+      *output *= (1024*1024);
+    }
+
+  }
+  return !strlen(next);
+}
+
+
 void parseArguments(int argc, char *argv[])
 {
   for (int i = 1; i < argc; i++)
@@ -86,6 +108,31 @@ void parseArguments(int argc, char *argv[])
         exit(1);
       }
     }
+    else if (!strcmp(argv[i], "--groups"))
+    {
+      if (++i >= argc || !parseUInt(argv[i], &groups))
+      {
+        std::cout << "Invalid group number" << std::endl;
+        exit(1);
+      }
+    }
+    else if (!strcmp(argv[i], "--groupSize"))
+    {
+      if (++i >= argc || !parseUInt(argv[i], &groupSize))
+      {
+        std::cout << "Invalid group size" << std::endl;
+        exit(1);
+      }
+    }
+    else if (!strcmp(argv[i], "--pad"))
+    {
+      if (++i >= argc || !parseSize(argv[i], &ARRAY_PAD_BYTES))
+      {
+        std::cout << "Invalid size" << std::endl;
+        exit(1);
+      }
+
+    }
     else if (!strcmp(argv[i], "--float"))
     {
       useFloat = true;
@@ -101,6 +148,9 @@ void parseArguments(int argc, char *argv[])
       std::cout << " --device INDEX Select device at INDEX" << std::endl;
       std::cout << " -s --arraysize SIZE Use SIZE elements in the array" << std::endl;
       std::cout << " -n --numtimes NUM Run the test NUM times (NUM >= 2)" << std::endl;
+      std::cout << " --groups Set the number of groups to launch - each work-item processes multiple array items" << std::endl;
+      std::cout << " --groupSize Set the size of each group (default 1024)" << std::endl;
+      std::cout << " --pad Add additional array padding. Can use a trailing K (KiB) or M (MiB)" << std::endl;
       std::cout << " --float Use floats (rather than doubles)" << std::endl;
       std::cout << std::endl;
       exit(0);
diff --git a/common.h b/common.h
index a4dd886..9cf61d7 100644
--- a/common.h
+++ b/common.h
@@ -48,8 +48,11 @@ extern void parseArguments(int argc, char *argv[]);
 extern void listDevices(void);
 
 extern unsigned int ARRAY_SIZE;
+extern size_t ARRAY_PAD_BYTES;
 extern unsigned int NTIMES;
 
+extern unsigned int groups;
+extern unsigned int groupSize;
 extern bool useFloat;
 
 extern unsigned int deviceIndex;
diff --git a/hip-stream.cpp b/hip-stream.cpp
index ddbcae4..6984b44 100644
--- a/hip-stream.cpp
+++ b/hip-stream.cpp
@@ -43,7 +43,7 @@
 #include
 #include
-#include
+//#include
 #include "common.h"
 
 std::string getDeviceName(int device);
@@ -63,15 +63,75 @@ void check_cuda_error(void)
   }
 }
 
+
+
+// Looper kernels place more work inside each work-item.
+// The goal is to reduce the dispatch overhead for each group and to give more control over the ordering of memory operations.
+template
+__global__ void
+copy_looper(hipLaunchParm lp, const T * a, T * c, int ARRAY_SIZE)
+{
+  int offset = (hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x)*CLUMP_SIZE;
+  int stride = hipBlockDim_x * hipGridDim_x * CLUMP_SIZE;
+
+  for (int i=offset; i
-__global__ void copy(hipLaunchParm lp, const T * a, T * c)
+__global__ void
+mul_looper(hipLaunchParm lp, T * b, const T * c, int ARRAY_SIZE)
+{
+  int offset = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;
+  int stride = hipBlockDim_x * hipGridDim_x;
+  const T scalar = 3.0;
+
+  for (int i=offset; i
+__global__ void
+add_looper(hipLaunchParm lp, const T * a, const T * b, T * c, int ARRAY_SIZE)
+{
+  int offset = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;
+  int stride = hipBlockDim_x * hipGridDim_x;
+
+  for (int i=offset; i
+__global__ void
+triad_looper(hipLaunchParm lp, T * a, const T * b, const T * c, int ARRAY_SIZE)
+{
+  int offset = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;
+  int stride = hipBlockDim_x * hipGridDim_x;
+  const T scalar = 3.0;
+
+  for (int i=offset; i
+__global__ void
+copy(hipLaunchParm lp, const T * a, T * c)
 {
   const int i = hipBlockDim_x * hipBlockIdx_x + hipThreadIdx_x;
   c[i] = a[i];
 }
+
 template
-__global__ void mul(hipLaunchParm lp, T * b, const T * c)
+__global__ void
+mul(hipLaunchParm lp, T * b, const T * c)
 {
   const T scalar = 3.0;
   const int i = hipBlockDim_x * hipBlockIdx_x + hipThreadIdx_x;
@@ -79,14 +139,16 @@ __global__ void mul(hipLaunchParm lp, T * b, const T * c)
 }
 
 template
-__global__ void add(hipLaunchParm lp, const T * a, const T * b, T * c)
+__global__ void
+add(hipLaunchParm lp, const T * a, const T * b, T * c)
 {
   const int i = hipBlockDim_x * hipBlockIdx_x + hipThreadIdx_x;
   c[i] = a[i] + b[i];
 }
 
 template
-__global__ void triad(hipLaunchParm lp, T * a, const T * b, const T * c)
+__global__ void
+triad(hipLaunchParm lp, T * a, const T * b, const T * c)
 {
   const T scalar = 3.0;
   const int i = hipBlockDim_x * hipBlockIdx_x + hipThreadIdx_x;
@@ -100,13 +162,27 @@ int main(int argc, char *argv[])
 
   std::cout
     << "GPU-STREAM" << std::endl
     << "Version: " << VERSION_STRING << std::endl
-    << "Implementation: CUDA" << std::endl;
+    << "Implementation: HIP" << std::endl;
 
   parseArguments(argc, argv);
 
   if (NTIMES < 2)
     throw std::runtime_error("Chosen number of times is invalid, must be >= 2");
 
+  // Configure the grid size and group size for kernel launches
+  int gridSize;
+  if (groups) {
+    gridSize = groups * groupSize;
+  } else {
+    gridSize = ARRAY_SIZE;
+  }
+
+  float operationsPerWorkitem = (float)ARRAY_SIZE / (float)gridSize;
+  std::cout << "GridSize: " << gridSize << " work-items" << std::endl;
+  std::cout << "GroupSize: " << groupSize << " work-items" << std::endl;
+  std::cout << "Operations/Work-item: " << operationsPerWorkitem << std::endl;
+  if (groups) std::cout << "Using looper kernels:" << std::endl;
+
   std::cout << "Precision: ";
   if (useFloat) std::cout << "float";
   else std::cout << "double";
@@ -144,9 +220,10 @@ int main(int argc, char *argv[])
   std::cout << std::setprecision(1) << std::fixed
             << "Array size: " << ARRAY_SIZE*DATATYPE_SIZE/1024.0/1024.0 << " MB"
             << " (=" << ARRAY_SIZE*DATATYPE_SIZE/1024.0/1024.0/1024.0 << " GB)"
+            << " " << ARRAY_PAD_BYTES << " bytes padding"
             << std::endl;
-  std::cout << "Total size: " << 3.0*ARRAY_SIZE*DATATYPE_SIZE/1024.0/1024.0 << " MB"
-            << " (=" << 3.0*ARRAY_SIZE*DATATYPE_SIZE/1024.0/1024.0/1024.0 << " GB)"
+  std::cout << "Total size: " << 3.0*(ARRAY_SIZE*DATATYPE_SIZE + ARRAY_PAD_BYTES) /1024.0/1024.0 << " MB"
+            << " (=" << 3.0*(ARRAY_SIZE*DATATYPE_SIZE + ARRAY_PAD_BYTES) /1024.0/1024.0/1024.0 << " GB)"
             << std::endl;
 
   // Reset precision
@@ -161,24 +238,31 @@ int main(int argc, char *argv[])
   hipSetDevice(deviceIndex);
   check_cuda_error();
 
-  // Print out device name
-  std::cout << "Using CUDA device " << getDeviceName(deviceIndex) << std::endl;
-  // Print out device CUDA driver version
-  std::cout << "Driver: " << getDriver() << std::endl;
-
-  // Check buffers fit on the device
   hipDeviceProp_t props;
   hipGetDeviceProperties(&props, deviceIndex);
+
+  // Print out device name
+  std::cout << "Using HIP device " << getDeviceName(deviceIndex) << " (compute_units=" << props.multiProcessorCount << ")" << std::endl;
+
+  // Print out device HIP driver version
+  std::cout << "Driver: " << getDriver() << std::endl;
+
+
+
+
+  // Check buffers fit on the device
   if (props.totalGlobalMem < 3*DATATYPE_SIZE*ARRAY_SIZE)
     throw std::runtime_error("Device does not have enough memory for all 3 buffers");
 
-  // Create host vectors
-  void * h_a = malloc(ARRAY_SIZE*DATATYPE_SIZE);
-  void * h_b = malloc(ARRAY_SIZE*DATATYPE_SIZE);
-  void * h_c = malloc(ARRAY_SIZE*DATATYPE_SIZE);
+  //int cus = props.multiProcessorCount;
 
-  // Initilise arrays
+  // Create host vectors
+  void * h_a = malloc(ARRAY_SIZE*DATATYPE_SIZE );
+  void * h_b = malloc(ARRAY_SIZE*DATATYPE_SIZE );
+  void * h_c = malloc(ARRAY_SIZE*DATATYPE_SIZE );
+
+  // Initialise arrays
   for (unsigned int i = 0; i < ARRAY_SIZE; i++)
   {
     if (useFloat)
@@ -196,12 +280,14 @@ int main(int argc, char *argv[])
   }
 
   // Create device buffers
-  void * d_a, * d_b, *d_c;
-  hipMalloc(&d_a, ARRAY_SIZE*DATATYPE_SIZE);
+  char * d_a, * d_b, *d_c;
+  hipMalloc(&d_a, ARRAY_SIZE*DATATYPE_SIZE + ARRAY_PAD_BYTES);
   check_cuda_error();
-  hipMalloc(&d_b, ARRAY_SIZE*DATATYPE_SIZE);
+  hipMalloc(&d_b, ARRAY_SIZE*DATATYPE_SIZE + ARRAY_PAD_BYTES);
+  d_b += ARRAY_PAD_BYTES;
   check_cuda_error();
-  hipMalloc(&d_c, ARRAY_SIZE*DATATYPE_SIZE);
+  hipMalloc(&d_c, ARRAY_SIZE*DATATYPE_SIZE + ARRAY_PAD_BYTES);
+  d_c += ARRAY_PAD_BYTES;
   check_cuda_error();
 
   // Copy host memory to device
@@ -212,10 +298,17 @@ int main(int argc, char *argv[])
   hipMemcpy(d_c, h_c, ARRAY_SIZE*DATATYPE_SIZE, hipMemcpyHostToDevice);
   check_cuda_error();
 
+
+  std::cout << "d_a=" << (void*)d_a << std::endl;
+  std::cout << "d_b=" << (void*)d_b << std::endl;
+  std::cout << "d_c=" << (void*)d_c << std::endl;
+
   // Make sure the copies are finished
   hipDeviceSynchronize();
   check_cuda_error();
 
+
+
   // List of times
   std::vector< std::vector<double> > timings;
@@ -227,10 +320,17 @@
   {
     std::vector<double> times;
     t1 = std::chrono::high_resolution_clock::now();
-    if (useFloat)
-      hipLaunchKernel(HIP_KERNEL_NAME(copy), dim3(ARRAY_SIZE/1024), dim3(1024), 0, 0, (float*)d_a, (float*)d_c);
-    else
-      hipLaunchKernel(HIP_KERNEL_NAME(copy), dim3(ARRAY_SIZE/1024), dim3(1024), 0, 0, (double*)d_a, (double*)d_c);
+    if (groups) {
+      if (useFloat)
+        hipLaunchKernel(HIP_KERNEL_NAME(copy_looper), dim3(gridSize), dim3(groupSize), 0, 0, (float*)d_a, (float*)d_c, ARRAY_SIZE);
+      else
+        hipLaunchKernel(HIP_KERNEL_NAME(copy_looper), dim3(gridSize), dim3(groupSize), 0, 0, (double*)d_a, (double*)d_c, ARRAY_SIZE);
+    } else {
+      if (useFloat)
+        hipLaunchKernel(HIP_KERNEL_NAME(copy), dim3(ARRAY_SIZE/groupSize), dim3(groupSize), 0, 0, (float*)d_a, (float*)d_c);
+      else
+        hipLaunchKernel(HIP_KERNEL_NAME(copy), dim3(ARRAY_SIZE/groupSize), dim3(groupSize), 0, 0, (double*)d_a, (double*)d_c);
+    }
     check_cuda_error();
hipDeviceSynchronize(); check_cuda_error(); @@ -239,10 +339,17 @@ int main(int argc, char *argv[]) t1 = std::chrono::high_resolution_clock::now(); - if (useFloat) - hipLaunchKernel(HIP_KERNEL_NAME(mul), dim3(ARRAY_SIZE/1024), dim3(1024), 0, 0, (float*)d_b, (float*)d_c); - else - hipLaunchKernel(HIP_KERNEL_NAME(mul), dim3(ARRAY_SIZE/1024), dim3(1024), 0, 0, (double*)d_b, (double*)d_c); + if (groups) { + if (useFloat) + hipLaunchKernel(HIP_KERNEL_NAME(mul_looper), dim3(gridSize), dim3(groupSize), 0, 0, (float*)d_b, (float*)d_c, ARRAY_SIZE); + else + hipLaunchKernel(HIP_KERNEL_NAME(mul_looper), dim3(gridSize), dim3(groupSize), 0, 0, (double*)d_b, (double*)d_c, ARRAY_SIZE); + } else { + if (useFloat) + hipLaunchKernel(HIP_KERNEL_NAME(mul), dim3(ARRAY_SIZE/groupSize), dim3(groupSize), 0, 0, (float*)d_b, (float*)d_c); + else + hipLaunchKernel(HIP_KERNEL_NAME(mul), dim3(ARRAY_SIZE/groupSize), dim3(groupSize), 0, 0, (double*)d_b, (double*)d_c); + } check_cuda_error(); hipDeviceSynchronize(); check_cuda_error(); @@ -251,10 +358,17 @@ int main(int argc, char *argv[]) t1 = std::chrono::high_resolution_clock::now(); - if (useFloat) - hipLaunchKernel(HIP_KERNEL_NAME(add), dim3(ARRAY_SIZE/1024), dim3(1024), 0, 0, (float*)d_a, (float*)d_b, (float*)d_c); - else - hipLaunchKernel(HIP_KERNEL_NAME(add), dim3(ARRAY_SIZE/1024), dim3(1024), 0, 0, (double*)d_a, (double*)d_b, (double*)d_c); + if (groups) { + if (useFloat) + hipLaunchKernel(HIP_KERNEL_NAME(add_looper), dim3(gridSize), dim3(groupSize), 0, 0, (float*)d_a, (float*)d_b, (float*)d_c, ARRAY_SIZE); + else + hipLaunchKernel(HIP_KERNEL_NAME(add_looper), dim3(gridSize), dim3(groupSize), 0, 0, (double*)d_a, (double*)d_b, (double*)d_c, ARRAY_SIZE); + } else { + if (useFloat) + hipLaunchKernel(HIP_KERNEL_NAME(add), dim3(ARRAY_SIZE/groupSize), dim3(groupSize), 0, 0, (float*)d_a, (float*)d_b, (float*)d_c); + else + hipLaunchKernel(HIP_KERNEL_NAME(add), dim3(ARRAY_SIZE/groupSize), dim3(groupSize), 0, 0, (double*)d_a, (double*)d_b, (double*)d_c); + } check_cuda_error(); hipDeviceSynchronize(); check_cuda_error(); @@ -263,10 +377,18 @@ int main(int argc, char *argv[]) t1 = std::chrono::high_resolution_clock::now(); - if (useFloat) - hipLaunchKernel(HIP_KERNEL_NAME(triad), dim3(ARRAY_SIZE/1024), dim3(1024), 0, 0, (float*)d_a, (float*)d_b, (float*)d_c); - else - hipLaunchKernel(HIP_KERNEL_NAME(triad), dim3(ARRAY_SIZE/1024), dim3(1024), 0, 0, (double*)d_a, (double*)d_b, (double*)d_c); + if (groups) { + if (useFloat) + hipLaunchKernel(HIP_KERNEL_NAME(triad_looper), dim3(gridSize), dim3(groupSize), 0, 0, (float*)d_a, (float*)d_b, (float*)d_c, ARRAY_SIZE); + else + hipLaunchKernel(HIP_KERNEL_NAME(triad_looper), dim3(gridSize), dim3(groupSize), 0, 0, (double*)d_a, (double*)d_b, (double*)d_c, ARRAY_SIZE); + } else { + if (useFloat) + hipLaunchKernel(HIP_KERNEL_NAME(triad), dim3(ARRAY_SIZE/groupSize), dim3(groupSize), 0, 0, (float*)d_a, (float*)d_b, (float*)d_c); + else + hipLaunchKernel(HIP_KERNEL_NAME(triad), dim3(ARRAY_SIZE/groupSize), dim3(groupSize), 0, 0, (double*)d_a, (double*)d_b, (double*)d_c); + } + check_cuda_error(); hipDeviceSynchronize(); check_cuda_error(); @@ -316,8 +438,15 @@ int main(int argc, char *argv[]) } } - for (int j = 0; j < 4; j++) + for (int j = 0; j < 4; j++) { avg[j] /= (double)(NTIMES-1); + } + + double geomean = 1.0; + for (int j = 0; j < 4; j++) { + geomean *= (sizes[j]/min[j]); + } + geomean = pow(geomean, 0.25); // Display results std::string labels[] = {"Copy", "Mul", "Add", "Triad"}; @@ -339,6 +468,10 @@ int main(int argc, char *argv[]) 
<< std::left << std::setw(12) << std::setprecision(5) << avg[j] << std::endl; } + std::cout + << std::left << std::setw(12) << "GEOMEAN" + << std::left << std::setw(12) << std::setprecision(3) << 1.0E-06 * geomean + << std::endl; // Free host vectors free(h_a); diff --git a/runhip.sh b/runhip.sh new file mode 100755 index 0000000..b84e970 --- /dev/null +++ b/runhip.sh @@ -0,0 +1,4 @@ +./gpu-stream-hip +./gpu-stream-hip --groups 64 --groupSize 256 +./gpu-stream-hip --float +./gpu-stream-hip --float --groups 64 --groupSize 256
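
Supplementary sketch (not part of the patch): the standalone C++ program below mirrors the two host-side calculations the patch introduces - the dispatch shape derived from --groups/--groupSize (gridSize = groups * groupSize, with each work-item looping over ARRAY_SIZE/gridSize elements) and the GEOMEAN of the four kernel bandwidths. The file name and the min[] timings are hypothetical placeholders, the --groups/--groupSize values are the ones used in runhip.sh, and the sizes[] byte counts assume the standard STREAM definitions (two arrays for Copy/Mul, three for Add/Triad) from the unchanged part of hip-stream.cpp; none of these are measured results.

// geomean_sketch.cpp (hypothetical helper, not in the patch)
// Build: g++ -std=c++11 -o geomean_sketch geomean_sketch.cpp
#include <cmath>
#include <iostream>

int main()
{
  // Dispatch shape: with --groups, the benchmark launches groups*groupSize
  // work-items and each work-item strides through the array; without it,
  // one work-item is launched per array element.
  const unsigned int ARRAY_SIZE = 52428800;         // default from common.cpp
  const unsigned int groups = 64, groupSize = 256;  // sample values from runhip.sh
  const unsigned int gridSize = groups ? groups * groupSize : ARRAY_SIZE;
  std::cout << "GridSize: " << gridSize << " work-items, "
            << (double)ARRAY_SIZE / gridSize << " operations/work-item\n";

  // GEOMEAN as computed after the timing loop in hip-stream.cpp:
  // bandwidth_j = sizes[j]/min[j], GEOMEAN = (prod_j bandwidth_j)^(1/4).
  const double DATATYPE_SIZE = sizeof(double);
  const double sizes[4] = {                         // bytes moved per kernel (assumed standard STREAM counts)
      2.0 * DATATYPE_SIZE * ARRAY_SIZE,             // Copy
      2.0 * DATATYPE_SIZE * ARRAY_SIZE,             // Mul
      3.0 * DATATYPE_SIZE * ARRAY_SIZE,             // Add
      3.0 * DATATYPE_SIZE * ARRAY_SIZE };           // Triad
  const double min[4] = { 0.0030, 0.0031, 0.0045, 0.0046 };  // hypothetical best times in seconds

  double geomean = 1.0;
  for (int j = 0; j < 4; j++)
    geomean *= sizes[j] / min[j];
  geomean = std::pow(geomean, 0.25);

  std::cout << "GEOMEAN: " << 1.0E-06 * geomean << " MB/s\n";
  return 0;
}

Because each bandwidth is sizes[j]/min[j], the GEOMEAN row printed by the benchmark should be directly comparable to the per-kernel MBytes/sec column in the results table.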