diff --git a/README.md b/README.md
index a2f6ab6..c776347 100644
--- a/README.md
+++ b/README.md
@@ -15,7 +15,9 @@ Run the OpenCL version with `./gpu-stream-ocl` and the CUDA version with `./gpu-
 For HIP version, follow the instructions on the following blog to properly install ROCK and ROCR drivers:
 http://gpuopen.com/getting-started-with-boltzmann-components-platforms-installation/
 
-Clone from the HIP repository in the following link:
+Install the HCC compiler:
+https://bitbucket.org/multicoreware/hcc/wiki/Home
+Install HIP:
 https://github.com/GPUOpen-ProfessionalCompute-Tools/HIP
 
 Build the HIP binaries with make gpu-stream-hip, run it with './gpu-stream-hip'
diff --git a/common.cpp b/common.cpp
index 781d70e..122376f 100644
--- a/common.cpp
+++ b/common.cpp
@@ -39,10 +39,13 @@
 // Default array size 50 * 2^20 (50*8 Mebibytes double precision)
 // Use binary powers of two so divides 1024
 unsigned int ARRAY_SIZE = 52428800;
+size_t ARRAY_PAD_BYTES = 0;
 unsigned int NTIMES = 10;
 
 bool useFloat = false;
 
+unsigned int groups = 0;
+unsigned int groupSize = 1024;
+
 unsigned int deviceIndex = 0;
@@ -53,6 +56,25 @@ int parseUInt(const char *str, unsigned int *output)
   return !strlen(next);
 }
 
+int parseSize(const char *str, size_t *output)
+{
+  char *next;
+  *output = strtoull(str, &next, 0);
+  int l = strlen(str);
+  if (l) {
+    char c = str[l-1]; // last char.
+    if ((c == 'k') || (c == 'K')) {
+      *output *= 1024;
+    }
+    if ((c == 'm') || (c == 'M')) {
+      *output *= (1024*1024);
+    }
+
+  }
+  return !strlen(next) || (strlen(next) == 1 && strchr("kKmM", next[0]) != NULL); // accept a bare number or a single trailing K/M suffix
+}
+
+
 void parseArguments(int argc, char *argv[])
 {
   for (int i = 1; i < argc; i++)
@@ -86,6 +108,31 @@ void parseArguments(int argc, char *argv[])
         exit(1);
       }
     }
+    else if (!strcmp(argv[i], "--groups"))
+    {
+      if (++i >= argc || !parseUInt(argv[i], &groups))
+      {
+        std::cout << "Invalid group number" << std::endl;
+        exit(1);
+      }
+    }
+    else if (!strcmp(argv[i], "--groupSize"))
+    {
+      if (++i >= argc || !parseUInt(argv[i], &groupSize))
+      {
+        std::cout << "Invalid group size" << std::endl;
+        exit(1);
+      }
+    }
+    else if (!strcmp(argv[i], "--pad"))
+    {
+      if (++i >= argc || !parseSize(argv[i], &ARRAY_PAD_BYTES))
+      {
+        std::cout << "Invalid size" << std::endl;
+        exit(1);
+      }
+
+    }
     else if (!strcmp(argv[i], "--float"))
     {
       useFloat = true;
@@ -101,6 +148,9 @@
       std::cout << "      --device     INDEX   Select device at INDEX" << std::endl;
      std::cout << "  -s  --arraysize  SIZE    Use SIZE elements in the array" << std::endl;
      std::cout << "  -n  --numtimes   NUM     Run the test NUM times (NUM >= 2)" << std::endl;
+      std::cout << "      --groups             Set number of groups to launch - each work-item processes multiple array items" << std::endl;
+      std::cout << "      --groupSize          Set size of each group (default 1024)" << std::endl;
+      std::cout << "      --pad                Add additional array padding. Can use trailing K (KB) or M (MB)" << std::endl;
       std::cout << "      --float              Use floats (rather than doubles)" << std::endl;
       std::cout << std::endl;
       exit(0);
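(Aside, not part of the patch: a minimal host-only sketch of how the new --groups, --groupSize and --pad options are meant to combine, mirroring the arithmetic that hip-stream.cpp below performs; the helper name launchGeometryExample and the sample values are illustrative assumptions.)

    #include <cstddef>
    #include <cstdio>

    // Mirrors the patch: gridSize counts work-items; with --groups each
    // work-item loops over roughly ARRAY_SIZE/gridSize elements, and --pad
    // grows every device buffer by padBytes.
    static void launchGeometryExample(unsigned int arraySize, unsigned int groups,
                                      unsigned int groupSize, size_t padBytes,
                                      size_t datatypeSize)
    {
      unsigned int gridSize = groups ? groups * groupSize : arraySize;
      float operationsPerWorkitem = (float)arraySize / (float)gridSize;
      size_t bytesPerBuffer = (size_t)arraySize * datatypeSize + padBytes;

      printf("GridSize: %u work-items\n", gridSize);
      printf("GroupSize: %u work-items\n", groupSize);
      printf("Operations/Work-item: %.1f\n", operationsPerWorkitem);
      printf("Bytes per buffer (incl. padding): %zu\n", bytesPerBuffer);
    }

    int main()
    {
      // e.g. ./gpu-stream-hip --float --groups 64 --groupSize 256 --pad 1M
      launchGeometryExample(52428800, 64, 256, 1024 * 1024, sizeof(float));
      return 0;
    }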
diff --git a/common.h b/common.h
index a4dd886..9cf61d7 100644
--- a/common.h
+++ b/common.h
@@ -48,8 +48,11 @@ extern void parseArguments(int argc, char *argv[]);
 extern void listDevices(void);
 
 extern unsigned int ARRAY_SIZE;
+extern size_t ARRAY_PAD_BYTES;
 extern unsigned int NTIMES;
 
+extern unsigned int groups;
+extern unsigned int groupSize;
+
 extern bool useFloat;
 
 extern unsigned int deviceIndex;
diff --git a/hip-stream.cpp b/hip-stream.cpp
index ddbcae4..6984b44 100644
--- a/hip-stream.cpp
+++ b/hip-stream.cpp
@@ -43,7 +43,7 @@
 #include
 #include
-#include
+//#include
 
 #include "common.h"
 
 std::string getDeviceName(int device);
@@ -63,15 +63,75 @@ void check_cuda_error(void)
   }
 }
 
+
+// Looper functions place more work inside each work-item.
+// The goal is to reduce the dispatch overhead for each group, and also to give more control over the order of memory operations.
+template <typename T, int CLUMP_SIZE>
+__global__ void
+copy_looper(hipLaunchParm lp, const T * a, T * c, int ARRAY_SIZE)
+{
+  int offset = (hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x)*CLUMP_SIZE;
+  int stride = hipBlockDim_x * hipGridDim_x * CLUMP_SIZE;
+
+  for (int i=offset; i<ARRAY_SIZE; i+=stride) {
+    for (int j=0; j<CLUMP_SIZE; j++) {
+      c[i+j] = a[i+j];
+    }
+  }
+}
+
 template <typename T>
-__global__ void copy(hipLaunchParm lp, const T * a, T * c)
+__global__ void
+mul_looper(hipLaunchParm lp, T * b, const T * c, int ARRAY_SIZE)
+{
+  int offset = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;
+  int stride = hipBlockDim_x * hipGridDim_x;
+  const T scalar = 3.0;
+
+  for (int i=offset; i<ARRAY_SIZE; i+=stride) {
+    b[i] = scalar * c[i];
+  }
+}
+
+template <typename T>
+__global__ void
+add_looper(hipLaunchParm lp, const T * a, const T * b, T * c, int ARRAY_SIZE)
+{
+  int offset = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;
+  int stride = hipBlockDim_x * hipGridDim_x;
+
+  for (int i=offset; i<ARRAY_SIZE; i+=stride) {
+    c[i] = a[i] + b[i];
+  }
+}
+
+template <typename T>
+__global__ void
+triad_looper(hipLaunchParm lp, T * a, const T * b, const T * c, int ARRAY_SIZE)
+{
+  int offset = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;
+  int stride = hipBlockDim_x * hipGridDim_x;
+  const T scalar = 3.0;
+
+  for (int i=offset; i<ARRAY_SIZE; i+=stride) {
+    a[i] = b[i] + scalar * c[i];
+  }
+}
+
+template <typename T>
+__global__ void
+copy(hipLaunchParm lp, const T * a, T * c)
 {
   const int i = hipBlockDim_x * hipBlockIdx_x + hipThreadIdx_x;
   c[i] = a[i];
 }
+
 template <typename T>
-__global__ void mul(hipLaunchParm lp, T * b, const T * c)
+__global__ void
+mul(hipLaunchParm lp, T * b, const T * c)
 {
   const T scalar = 3.0;
   const int i = hipBlockDim_x * hipBlockIdx_x + hipThreadIdx_x;
@@ -79,14 +139,16 @@ __global__ void mul(hipLaunchParm lp, T * b, const T * c)
 }
 
 template <typename T>
-__global__ void add(hipLaunchParm lp, const T * a, const T * b, T * c)
+__global__ void
+add(hipLaunchParm lp, const T * a, const T * b, T * c)
 {
   const int i = hipBlockDim_x * hipBlockIdx_x + hipThreadIdx_x;
   c[i] = a[i] + b[i];
 }
 
 template <typename T>
-__global__ void triad(hipLaunchParm lp, T * a, const T * b, const T * c)
+__global__ void
+triad(hipLaunchParm lp, T * a, const T * b, const T * c)
 {
   const T scalar = 3.0;
   const int i = hipBlockDim_x * hipBlockIdx_x + hipThreadIdx_x;
@@ -100,13 +162,27 @@ int main(int argc, char *argv[])
 
   std::cout << "GPU-STREAM" << std::endl
             << "Version: " << VERSION_STRING << std::endl
-            << "Implementation: CUDA" << std::endl;
+            << "Implementation: HIP" << std::endl;
 
   parseArguments(argc, argv);
 
   if (NTIMES < 2)
     throw std::runtime_error("Chosen number of times is invalid, must be >= 2");
 
+  // Config grid size and group size for kernel launching
+  int gridSize;
+  if (groups) {
+    gridSize = groups * groupSize;
+  } else {
+    gridSize = ARRAY_SIZE;
+  }
+
+  float operationsPerWorkitem = (float)ARRAY_SIZE / (float)gridSize;
+  std::cout << "GridSize: " << gridSize << " work-items" << std::endl;
+  std::cout << "GroupSize: " << groupSize << " work-items" << std::endl;
+  std::cout << "Operations/Work-item: " << operationsPerWorkitem << std::endl;
+  if (groups) std::cout << "Using looper kernels:" << std::endl;
+
   std::cout << "Precision: ";
   if (useFloat)
     std::cout << "float";
   else
     std::cout << "double";
@@ -144,9 +220,10 @@ int main(int argc, char *argv[])
   std::cout << std::setprecision(1) << std::fixed
             << "Array size: " << ARRAY_SIZE*DATATYPE_SIZE/1024.0/1024.0 << " MB"
             << " (=" << ARRAY_SIZE*DATATYPE_SIZE/1024.0/1024.0/1024.0 << " GB)"
+            << " " << ARRAY_PAD_BYTES << " bytes padding"
             << std::endl;
-  std::cout << "Total size: " << 3.0*ARRAY_SIZE*DATATYPE_SIZE/1024.0/1024.0 << " MB"
-            << " (=" << 3.0*ARRAY_SIZE*DATATYPE_SIZE/1024.0/1024.0/1024.0 << " GB)"
+  std::cout << "Total size: " << 3.0*(ARRAY_SIZE*DATATYPE_SIZE + ARRAY_PAD_BYTES) /1024.0/1024.0 << " MB"
+            << " (=" << 3.0*(ARRAY_SIZE*DATATYPE_SIZE + ARRAY_PAD_BYTES) /1024.0/1024.0/1024.0 << " GB)"
             << std::endl;
 
   // Reset precision
@@ -161,24 +238,31 @@ int main(int argc, char *argv[])
   hipSetDevice(deviceIndex);
   check_cuda_error();
 
-  // Print out device name
-  std::cout << "Using CUDA device " << getDeviceName(deviceIndex) << std::endl;
-  // Print out device CUDA driver version
-  std::cout << "Driver: " << getDriver() << std::endl;
-
-  // Check buffers fit on the device
   hipDeviceProp_t props;
   hipGetDeviceProperties(&props, deviceIndex);
+
+  // Print out device name
+  std::cout << "Using HIP device " << getDeviceName(deviceIndex) << " (compute_units=" << props.multiProcessorCount << ")" << std::endl;
+
+  // Print out device HIP driver version
+  std::cout << "Driver: " << getDriver() << std::endl;
+
+
+
+
+  // Check buffers fit on the device
   if (props.totalGlobalMem < 3*DATATYPE_SIZE*ARRAY_SIZE)
     throw std::runtime_error("Device does not have enough memory for all 3 buffers");
 
-  // Create host vectors
-  void * h_a = malloc(ARRAY_SIZE*DATATYPE_SIZE);
-  void * h_b = malloc(ARRAY_SIZE*DATATYPE_SIZE);
-  void * h_c = malloc(ARRAY_SIZE*DATATYPE_SIZE);
+  //int cus = props.multiProcessorCount;
 
-  // Initilise arrays
+  // Create host vectors
+  void * h_a = malloc(ARRAY_SIZE*DATATYPE_SIZE );
+  void * h_b = malloc(ARRAY_SIZE*DATATYPE_SIZE );
+  void * h_c = malloc(ARRAY_SIZE*DATATYPE_SIZE );
+
+  // Initialise arrays
   for (unsigned int i = 0; i < ARRAY_SIZE; i++)
   {
     if (useFloat)
@@ -196,12 +280,14 @@ int main(int argc, char *argv[])
   }
 
   // Create device buffers
-  void * d_a, * d_b, *d_c;
-  hipMalloc(&d_a, ARRAY_SIZE*DATATYPE_SIZE);
+  char * d_a, * d_b, *d_c;
+  hipMalloc(&d_a, ARRAY_SIZE*DATATYPE_SIZE + ARRAY_PAD_BYTES);
   check_cuda_error();
-  hipMalloc(&d_b, ARRAY_SIZE*DATATYPE_SIZE);
+  hipMalloc(&d_b, ARRAY_SIZE*DATATYPE_SIZE + ARRAY_PAD_BYTES);
+  d_b += ARRAY_PAD_BYTES;
   check_cuda_error();
-  hipMalloc(&d_c, ARRAY_SIZE*DATATYPE_SIZE);
+  hipMalloc(&d_c, ARRAY_SIZE*DATATYPE_SIZE + ARRAY_PAD_BYTES);
+  d_c += ARRAY_PAD_BYTES;
   check_cuda_error();
 
   // Copy host memory to device
@@ -212,10 +298,17 @@ int main(int argc, char *argv[])
   hipMemcpy(d_c, h_c, ARRAY_SIZE*DATATYPE_SIZE, hipMemcpyHostToDevice);
   check_cuda_error();
 
+
+  std::cout << "d_a=" << (void*)d_a << std::endl;
+  std::cout << "d_b=" << (void*)d_b << std::endl;
+  std::cout << "d_c=" << (void*)d_c << std::endl;
+
   // Make sure the copies are finished
   hipDeviceSynchronize();
   check_cuda_error();
 
+
+
   // List of times
   std::vector< std::vector<double> > timings;
@@ -227,10 +320,17 @@
   {
     std::vector<double> times;
     t1 = std::chrono::high_resolution_clock::now();
-    if (useFloat)
-      hipLaunchKernel(HIP_KERNEL_NAME(copy<float>), dim3(ARRAY_SIZE/1024), dim3(1024), 0, 0, (float*)d_a, (float*)d_c);
-    else
-      hipLaunchKernel(HIP_KERNEL_NAME(copy<double>), dim3(ARRAY_SIZE/1024), dim3(1024), 0, 0, (double*)d_a, (double*)d_c);
+    if (groups) {
+      if (useFloat)
+        hipLaunchKernel(HIP_KERNEL_NAME(copy_looper), dim3(gridSize), dim3(groupSize), 0, 0, (float*)d_a, (float*)d_c, ARRAY_SIZE);
+      else
+        hipLaunchKernel(HIP_KERNEL_NAME(copy_looper), dim3(gridSize), dim3(groupSize), 0, 0, (double*)d_a, (double*)d_c, ARRAY_SIZE);
+    } else {
+      if (useFloat)
+        hipLaunchKernel(HIP_KERNEL_NAME(copy<float>), dim3(ARRAY_SIZE/groupSize), dim3(groupSize), 0, 0, (float*)d_a, (float*)d_c);
+      else
+        hipLaunchKernel(HIP_KERNEL_NAME(copy<double>), dim3(ARRAY_SIZE/groupSize), dim3(groupSize), 0, 0, (double*)d_a, (double*)d_c);
+    }
     check_cuda_error();
     hipDeviceSynchronize();
     check_cuda_error();
@@ -239,10 +339,17 @@
 
     t1 = std::chrono::high_resolution_clock::now();
-    if (useFloat)
-      hipLaunchKernel(HIP_KERNEL_NAME(mul<float>), dim3(ARRAY_SIZE/1024), dim3(1024), 0, 0, (float*)d_b, (float*)d_c);
-    else
-      hipLaunchKernel(HIP_KERNEL_NAME(mul<double>), dim3(ARRAY_SIZE/1024), dim3(1024), 0, 0, (double*)d_b, (double*)d_c);
+    if (groups) {
+      if (useFloat)
+        hipLaunchKernel(HIP_KERNEL_NAME(mul_looper<float>), dim3(gridSize), dim3(groupSize), 0, 0, (float*)d_b, (float*)d_c, ARRAY_SIZE);
+      else
+        hipLaunchKernel(HIP_KERNEL_NAME(mul_looper<double>), dim3(gridSize), dim3(groupSize), 0, 0, (double*)d_b, (double*)d_c, ARRAY_SIZE);
+    } else {
+      if (useFloat)
+        hipLaunchKernel(HIP_KERNEL_NAME(mul<float>), dim3(ARRAY_SIZE/groupSize), dim3(groupSize), 0, 0, (float*)d_b, (float*)d_c);
+      else
+        hipLaunchKernel(HIP_KERNEL_NAME(mul<double>), dim3(ARRAY_SIZE/groupSize), dim3(groupSize), 0, 0, (double*)d_b, (double*)d_c);
+    }
     check_cuda_error();
     hipDeviceSynchronize();
     check_cuda_error();
@@ -251,10 +358,17 @@
 
     t1 = std::chrono::high_resolution_clock::now();
-    if (useFloat)
-      hipLaunchKernel(HIP_KERNEL_NAME(add<float>), dim3(ARRAY_SIZE/1024), dim3(1024), 0, 0, (float*)d_a, (float*)d_b, (float*)d_c);
-    else
-      hipLaunchKernel(HIP_KERNEL_NAME(add<double>), dim3(ARRAY_SIZE/1024), dim3(1024), 0, 0, (double*)d_a, (double*)d_b, (double*)d_c);
+    if (groups) {
+      if (useFloat)
+        hipLaunchKernel(HIP_KERNEL_NAME(add_looper<float>), dim3(gridSize), dim3(groupSize), 0, 0, (float*)d_a, (float*)d_b, (float*)d_c, ARRAY_SIZE);
+      else
+        hipLaunchKernel(HIP_KERNEL_NAME(add_looper<double>), dim3(gridSize), dim3(groupSize), 0, 0, (double*)d_a, (double*)d_b, (double*)d_c, ARRAY_SIZE);
+    } else {
+      if (useFloat)
+        hipLaunchKernel(HIP_KERNEL_NAME(add<float>), dim3(ARRAY_SIZE/groupSize), dim3(groupSize), 0, 0, (float*)d_a, (float*)d_b, (float*)d_c);
+      else
+        hipLaunchKernel(HIP_KERNEL_NAME(add<double>), dim3(ARRAY_SIZE/groupSize), dim3(groupSize), 0, 0, (double*)d_a, (double*)d_b, (double*)d_c);
+    }
     check_cuda_error();
     hipDeviceSynchronize();
     check_cuda_error();
@@ -263,10 +377,18 @@
 
     t1 = std::chrono::high_resolution_clock::now();
-    if (useFloat)
-      hipLaunchKernel(HIP_KERNEL_NAME(triad<float>), dim3(ARRAY_SIZE/1024), dim3(1024), 0, 0, (float*)d_a, (float*)d_b, (float*)d_c);
-    else
-      hipLaunchKernel(HIP_KERNEL_NAME(triad<double>), dim3(ARRAY_SIZE/1024), dim3(1024), 0, 0, (double*)d_a, (double*)d_b, (double*)d_c);
+    if (groups) {
+      if (useFloat)
+        hipLaunchKernel(HIP_KERNEL_NAME(triad_looper<float>), dim3(gridSize), dim3(groupSize), 0, 0, (float*)d_a, (float*)d_b, (float*)d_c, ARRAY_SIZE);
+      else
+        hipLaunchKernel(HIP_KERNEL_NAME(triad_looper<double>), dim3(gridSize), dim3(groupSize), 0, 0, (double*)d_a, (double*)d_b, (double*)d_c, ARRAY_SIZE);
+    } else {
+      if (useFloat)
+        hipLaunchKernel(HIP_KERNEL_NAME(triad<float>), dim3(ARRAY_SIZE/groupSize), dim3(groupSize), 0, 0, (float*)d_a, (float*)d_b, (float*)d_c);
+      else
+        hipLaunchKernel(HIP_KERNEL_NAME(triad<double>), dim3(ARRAY_SIZE/groupSize), dim3(groupSize), 0, 0, (double*)d_a, (double*)d_b, (double*)d_c);
+    }
+
     check_cuda_error();
     hipDeviceSynchronize();
     check_cuda_error();
@@ -316,8 +438,15 @@
     }
   }
 
-  for (int j = 0; j < 4; j++)
+  for (int j = 0; j < 4; j++) {
     avg[j] /= (double)(NTIMES-1);
+  }
+
+  double geomean = 1.0;
+  for (int j = 0; j < 4; j++) {
+    geomean *= (sizes[j]/min[j]);
+  }
+  geomean = pow(geomean, 0.25);
 
   // Display results
   std::string labels[] = {"Copy", "Mul", "Add", "Triad"};
@@ -339,6 +468,10 @@
       << std::left << std::setw(12) << std::setprecision(5) << avg[j]
       << std::endl;
   }
+  std::cout
+    << std::left << std::setw(12) << "GEOMEAN"
+    << std::left << std::setw(12) << std::setprecision(3) << 1.0E-06 * geomean
+    << std::endl;
 
   // Free host vectors
   free(h_a);
diff --git a/runhip.sh b/runhip.sh
new file mode 100755
index 0000000..b84e970
--- /dev/null
+++ b/runhip.sh
@@ -0,0 +1,4 @@
+./gpu-stream-hip
+./gpu-stream-hip --groups 64 --groupSize 256
+./gpu-stream-hip --float
+./gpu-stream-hip --float --groups 64 --groupSize 256
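(Aside, not part of the patch: a small host-only sketch of the grid-stride pattern the *_looper kernels rely on, treating gridSize as the total number of work-items as the patch's printout does; the array size, the visits bookkeeping and the assert harness are illustrative assumptions.)

    #include <cassert>
    #include <cstdio>
    #include <vector>

    int main()
    {
      // Illustrative values; mirror runhip.sh's --groups 64 --groupSize 256.
      const int ARRAY_SIZE = 1 << 20;
      const int groups = 64, groupSize = 256;
      const int gridSize = groups * groupSize;   // total work-items

      // Each simulated work-item starts at its global id and strides by the
      // total number of work-items, like mul/add/triad_looper above.
      std::vector<int> visits(ARRAY_SIZE, 0);
      for (int tid = 0; tid < gridSize; tid++)
        for (int i = tid; i < ARRAY_SIZE; i += gridSize)
          visits[i]++;                           // the kernel would touch b[i]/c[i] here

      // Every element is visited exactly once, whether or not gridSize divides ARRAY_SIZE.
      for (int i = 0; i < ARRAY_SIZE; i++)
        assert(visits[i] == 1);

      printf("Covered %d elements with %d work-items (%.1f per work-item)\n",
             ARRAY_SIZE, gridSize, (float)ARRAY_SIZE / gridSize);
      return 0;
    }

This coverage property is what lets --groups trade fewer, longer-lived work-items for lower dispatch overhead without changing the amount of memory traffic.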