Improved GPU-STREAM benchmark for the HIP version:

1. Add optional looper kernels that take command-line input for the number of groups and the groupSize
2. Add GEOMEAN calculation across the four kernels
3. Add instructions for configuring the HIP environment to the README.md
4. Add results for HIP on FIJI Nano, TITAN X; CUDA on TITAN X
5. Add a run script that optionally runs the HIP version with the groups and groupSize options
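For example, matching the runhip.sh script added below, the looper kernels are exercised with `./gpu-stream-hip --groups 64 --groupSize 256`, which launches 64 groups of 256 work-items, each work-item striding over multiple array elements.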
sunway513 2016-03-15 07:56:32 -05:00
parent fdeb20601f
commit 11053798ff
5 changed files with 233 additions and 41 deletions

README.md (View File)

@@ -15,7 +15,9 @@ Run the OpenCL version with `./gpu-stream-ocl` and the CUDA version with `./gpu-
For the HIP version, follow the instructions in the blog below to properly install the ROCK and ROCR drivers:
http://gpuopen.com/getting-started-with-boltzmann-components-platforms-installation/
-Clone from the HIP repository in the following link:
+Install the HCC compiler:
+https://bitbucket.org/multicoreware/hcc/wiki/Home
+Install HIP:
https://github.com/GPUOpen-ProfessionalCompute-Tools/HIP
Build the HIP binary with `make gpu-stream-hip` and run it with `./gpu-stream-hip`

View File

@@ -39,10 +39,13 @@
// Default array size 50 * 2^20 (50*8 Mebibytes double precision)
// Use binary powers of two so divides 1024
unsigned int ARRAY_SIZE = 52428800;
+size_t ARRAY_PAD_BYTES = 0;
unsigned int NTIMES = 10;
bool useFloat = false;
+unsigned int groups = 0;
+unsigned int groupSize = 1024;
unsigned int deviceIndex = 0;
@@ -53,6 +56,25 @@ int parseUInt(const char *str, unsigned int *output)
  return !strlen(next);
}

+int parseSize(const char *str, size_t *output)
+{
+  char *next;
+  *output = strtoull(str, &next, 0);
+  int l = strlen(str);
+  if (l) {
+    char c = str[l-1]; // last char: optional K/M suffix
+    if ((c == 'k') || (c == 'K')) {
+      *output *= 1024;
+    }
+    if ((c == 'm') || (c == 'M')) {
+      *output *= (1024*1024);
+    }
+  }
+  // Valid if nothing follows the number, or only the single K/M suffix character
+  return !strlen(next) || (strlen(next) == 1 && strchr("kKmM", next[0]) != NULL);
+}
void parseArguments(int argc, char *argv[])
{
  for (int i = 1; i < argc; i++)
@@ -86,6 +108,31 @@ void parseArguments(int argc, char *argv[])
        exit(1);
      }
    }
+    else if (!strcmp(argv[i], "--groups"))
+    {
+      if (++i >= argc || !parseUInt(argv[i], &groups))
+      {
+        std::cout << "Invalid group number" << std::endl;
+        exit(1);
+      }
+    }
+    else if (!strcmp(argv[i], "--groupSize"))
+    {
+      if (++i >= argc || !parseUInt(argv[i], &groupSize))
+      {
+        std::cout << "Invalid group size" << std::endl;
+        exit(1);
+      }
+    }
+    else if (!strcmp(argv[i], "--pad"))
+    {
+      if (++i >= argc || !parseSize(argv[i], &ARRAY_PAD_BYTES))
+      {
+        std::cout << "Invalid pad size" << std::endl;
+        exit(1);
+      }
+    }
    else if (!strcmp(argv[i], "--float"))
    {
      useFloat = true;
@@ -101,6 +148,9 @@ void parseArguments(int argc, char *argv[])
      std::cout << "      --device     INDEX    Select device at INDEX" << std::endl;
      std::cout << "  -s  --arraysize  SIZE     Use SIZE elements in the array" << std::endl;
      std::cout << "  -n  --numtimes   NUM      Run the test NUM times (NUM >= 2)" << std::endl;
+      std::cout << "      --groups     NUM      Set number of groups to launch - each work-item processes multiple array items" << std::endl;
+      std::cout << "      --groupSize  SIZE     Set size of each group (default 1024)" << std::endl;
+      std::cout << "      --pad        BYTES    Add additional array padding. Can use trailing K (KB) or M (MB)" << std::endl;
      std::cout << "      --float               Use floats (rather than doubles)" << std::endl;
      std::cout << std::endl;
      exit(0);
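As a sanity check on the new --pad suffix handling, here is a minimal standalone C++ sketch mirroring the parseSize helper above (including the trailing-suffix validation fix applied there); the tested values are illustrative only:

#include <cassert>
#include <cstdlib>
#include <cstring>

// Standalone copy of the parseSize helper, for illustration only.
static int parseSize(const char *str, size_t *output)
{
  char *next;
  *output = strtoull(str, &next, 0);
  size_t l = strlen(str);
  if (l) {
    char c = str[l-1];                 // optional trailing K/M suffix
    if (c == 'k' || c == 'K') *output *= 1024;
    if (c == 'm' || c == 'M') *output *= 1024*1024;
  }
  // Valid if nothing follows the number, or only the single suffix character
  return !strlen(next) || (strlen(next) == 1 && strchr("kKmM", next[0]) != NULL);
}

int main()
{
  size_t n;
  assert(parseSize("4096", &n) && n == 4096);     // plain byte count
  assert(parseSize("64K", &n)  && n == 65536);    // 64 * 1024
  assert(parseSize("1M", &n)   && n == 1048576);  // 1024 * 1024
  assert(!parseSize("64Q", &n));                  // unknown suffix rejected
  return 0;
}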

View File

@@ -48,8 +48,11 @@ extern void parseArguments(int argc, char *argv[]);
extern void listDevices(void);

extern unsigned int ARRAY_SIZE;
+extern size_t ARRAY_PAD_BYTES;
extern unsigned int NTIMES;
+extern unsigned int groups;
+extern unsigned int groupSize;
extern bool useFloat;
extern unsigned int deviceIndex;

View File

@@ -43,7 +43,7 @@
#include <cfloat>
#include <cmath>
-#include <cuda.h>
+//#include <cuda.h>
#include "common.h"

std::string getDeviceName(int device);
@@ -63,15 +63,75 @@ void check_cuda_error(void)
  }
}

+// Looper kernels place more work inside each work-item.
+// The goal is to reduce the dispatch overhead for each group, and to give more control over the order of memory operations.
+template <typename T, int CLUMP_SIZE>
+__global__ void
+copy_looper(hipLaunchParm lp, const T * a, T * c, int ARRAY_SIZE)
+{
+  int offset = (hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x)*CLUMP_SIZE;
+  int stride = hipBlockDim_x * hipGridDim_x * CLUMP_SIZE;
+  for (int i=offset; i<ARRAY_SIZE; i+=stride) {
+    c[i] = a[i];
+  }
+}
+
+template <typename T>
+__global__ void
+mul_looper(hipLaunchParm lp, T * b, const T * c, int ARRAY_SIZE)
+{
+  int offset = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;
+  int stride = hipBlockDim_x * hipGridDim_x;
+  const T scalar = 3.0;
+  for (int i=offset; i<ARRAY_SIZE; i+=stride) {
+    b[i] = scalar * c[i];
+  }
+}
+
+template <typename T>
+__global__ void
+add_looper(hipLaunchParm lp, const T * a, const T * b, T * c, int ARRAY_SIZE)
+{
+  int offset = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;
+  int stride = hipBlockDim_x * hipGridDim_x;
+  for (int i=offset; i<ARRAY_SIZE; i+=stride) {
+    c[i] = a[i] + b[i];
+  }
+}
+
+template <typename T>
+__global__ void
+triad_looper(hipLaunchParm lp, T * a, const T * b, const T * c, int ARRAY_SIZE)
+{
+  int offset = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;
+  int stride = hipBlockDim_x * hipGridDim_x;
+  const T scalar = 3.0;
+  for (int i=offset; i<ARRAY_SIZE; i+=stride) {
+    a[i] = b[i] + scalar * c[i];
+  }
+}
+
template <typename T>
-__global__ void copy(hipLaunchParm lp, const T * a, T * c)
+__global__ void
+copy(hipLaunchParm lp, const T * a, T * c)
{
  const int i = hipBlockDim_x * hipBlockIdx_x + hipThreadIdx_x;
  c[i] = a[i];
}

template <typename T>
-__global__ void mul(hipLaunchParm lp, T * b, const T * c)
+__global__ void
+mul(hipLaunchParm lp, T * b, const T * c)
{
  const T scalar = 3.0;
  const int i = hipBlockDim_x * hipBlockIdx_x + hipThreadIdx_x;
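The looper kernels above all share one indexing scheme (with CLUMP_SIZE = 1 as launched below): a work-item starts at its global ID and strides by the total number of work-items until it runs off the end of the array. A small host-side C++ sketch, with hypothetical sizes, demonstrates that this covers every element exactly once even when the array size is not a multiple of the work-item count:

#include <cassert>
#include <vector>

int main()
{
  // Hypothetical launch shape and array size, standing in for the
  // real groups/groupSize options and ARRAY_SIZE.
  const int groups = 4, groupSize = 64;
  const int nthreads = groups * groupSize;  // total work-items
  const int ARRAY_SIZE = 1000;              // deliberately not a multiple of nthreads

  std::vector<int> visits(ARRAY_SIZE, 0);
  for (int tid = 0; tid < nthreads; tid++)            // each work-item...
    for (int i = tid; i < ARRAY_SIZE; i += nthreads)  // ...starts at its offset and strides
      visits[i]++;

  for (int i = 0; i < ARRAY_SIZE; i++)
    assert(visits[i] == 1);  // every element visited exactly once
  return 0;
}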
@@ -79,14 +139,16 @@ __global__ void mul(hipLaunchParm lp, T * b, const T * c)
}

template <typename T>
-__global__ void add(hipLaunchParm lp, const T * a, const T * b, T * c)
+__global__ void
+add(hipLaunchParm lp, const T * a, const T * b, T * c)
{
  const int i = hipBlockDim_x * hipBlockIdx_x + hipThreadIdx_x;
  c[i] = a[i] + b[i];
}

template <typename T>
-__global__ void triad(hipLaunchParm lp, T * a, const T * b, const T * c)
+__global__ void
+triad(hipLaunchParm lp, T * a, const T * b, const T * c)
{
  const T scalar = 3.0;
  const int i = hipBlockDim_x * hipBlockIdx_x + hipThreadIdx_x;
@@ -100,13 +162,27 @@ int main(int argc, char *argv[])
  std::cout
    << "GPU-STREAM" << std::endl
    << "Version: " << VERSION_STRING << std::endl
-    << "Implementation: CUDA" << std::endl;
+    << "Implementation: HIP" << std::endl;

  parseArguments(argc, argv);

  if (NTIMES < 2)
    throw std::runtime_error("Chosen number of times is invalid, must be >= 2");

+  // Configure grid size and group size for kernel launches
+  int gridSize;
+  if (groups) {
+    gridSize = groups * groupSize;
+  } else {
+    gridSize = ARRAY_SIZE;
+  }
+  float operationsPerWorkitem = (float)ARRAY_SIZE / (float)gridSize;
+  std::cout << "GridSize: " << gridSize << " work-items" << std::endl;
+  std::cout << "GroupSize: " << groupSize << " work-items" << std::endl;
+  std::cout << "Operations/Work-item: " << operationsPerWorkitem << std::endl;
+  if (groups) std::cout << "Using looper kernels" << std::endl;
+
  std::cout << "Precision: ";
  if (useFloat) std::cout << "float";
  else std::cout << "double";
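For example, with the `--groups 64 --groupSize 256` settings used in runhip.sh and the default ARRAY_SIZE of 52428800 elements, gridSize is 64 * 256 = 16384 work-items and operationsPerWorkitem is 52428800 / 16384 = 3200.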
@@ -144,9 +220,10 @@ int main(int argc, char *argv[])
  std::cout << std::setprecision(1) << std::fixed
    << "Array size: " << ARRAY_SIZE*DATATYPE_SIZE/1024.0/1024.0 << " MB"
    << " (=" << ARRAY_SIZE*DATATYPE_SIZE/1024.0/1024.0/1024.0 << " GB)"
+    << " " << ARRAY_PAD_BYTES << " bytes padding"
    << std::endl;
-  std::cout << "Total size: " << 3.0*ARRAY_SIZE*DATATYPE_SIZE/1024.0/1024.0 << " MB"
-    << " (=" << 3.0*ARRAY_SIZE*DATATYPE_SIZE/1024.0/1024.0/1024.0 << " GB)"
+  std::cout << "Total size: " << 3.0*(ARRAY_SIZE*DATATYPE_SIZE + ARRAY_PAD_BYTES)/1024.0/1024.0 << " MB"
+    << " (=" << 3.0*(ARRAY_SIZE*DATATYPE_SIZE + ARRAY_PAD_BYTES)/1024.0/1024.0/1024.0 << " GB)"
    << std::endl;

  // Reset precision
@@ -161,24 +238,31 @@
  hipSetDevice(deviceIndex);
  check_cuda_error();

-  // Print out device name
-  std::cout << "Using CUDA device " << getDeviceName(deviceIndex) << std::endl;
-
-  // Print out device CUDA driver version
-  std::cout << "Driver: " << getDriver() << std::endl;
-
-  // Check buffers fit on the device
  hipDeviceProp_t props;
  hipGetDeviceProperties(&props, deviceIndex);
+
+  // Print out device name
+  std::cout << "Using HIP device " << getDeviceName(deviceIndex) << " (compute_units=" << props.multiProcessorCount << ")" << std::endl;
+
+  // Print out device HIP driver version
+  std::cout << "Driver: " << getDriver() << std::endl;
+
+  // Check buffers fit on the device
  if (props.totalGlobalMem < 3*DATATYPE_SIZE*ARRAY_SIZE)
    throw std::runtime_error("Device does not have enough memory for all 3 buffers");

  // Create host vectors
  void * h_a = malloc(ARRAY_SIZE*DATATYPE_SIZE);
  void * h_b = malloc(ARRAY_SIZE*DATATYPE_SIZE);
  void * h_c = malloc(ARRAY_SIZE*DATATYPE_SIZE);

-  // Initilise arrays
+  // Initialise arrays
  for (unsigned int i = 0; i < ARRAY_SIZE; i++)
  {
    if (useFloat)
@@ -196,12 +280,14 @@ int main(int argc, char *argv[])
  }

  // Create device buffers
-  void * d_a, * d_b, * d_c;
-  hipMalloc(&d_a, ARRAY_SIZE*DATATYPE_SIZE);
+  char * d_a, * d_b, * d_c;
+  hipMalloc(&d_a, ARRAY_SIZE*DATATYPE_SIZE + ARRAY_PAD_BYTES);
  check_cuda_error();
-  hipMalloc(&d_b, ARRAY_SIZE*DATATYPE_SIZE);
+  hipMalloc(&d_b, ARRAY_SIZE*DATATYPE_SIZE + ARRAY_PAD_BYTES);
+  d_b += ARRAY_PAD_BYTES;
  check_cuda_error();
-  hipMalloc(&d_c, ARRAY_SIZE*DATATYPE_SIZE);
+  hipMalloc(&d_c, ARRAY_SIZE*DATATYPE_SIZE + ARRAY_PAD_BYTES);
+  d_c += ARRAY_PAD_BYTES;
  check_cuda_error();

  // Copy host memory to device
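Note that d_b and d_c are each allocated ARRAY_PAD_BYTES of extra space and then advanced by ARRAY_PAD_BYTES, so --pad shifts where b and c start relative to a, presumably to expose the effect of relative buffer alignment on memory-system behaviour. One caveat: any later hipFree must be given the original, un-offset pointers.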
@@ -212,10 +298,17 @@ int main(int argc, char *argv[])
  hipMemcpy(d_c, h_c, ARRAY_SIZE*DATATYPE_SIZE, hipMemcpyHostToDevice);
  check_cuda_error();

+  std::cout << "d_a=" << (void*)d_a << std::endl;
+  std::cout << "d_b=" << (void*)d_b << std::endl;
+  std::cout << "d_c=" << (void*)d_c << std::endl;
+
  // Make sure the copies are finished
  hipDeviceSynchronize();
  check_cuda_error();

  // List of times
  std::vector< std::vector<double> > timings;
@@ -227,10 +320,17 @@
  {
    std::vector<double> times;
    t1 = std::chrono::high_resolution_clock::now();
-    if (useFloat)
-      hipLaunchKernel(HIP_KERNEL_NAME(copy), dim3(ARRAY_SIZE/1024), dim3(1024), 0, 0, (float*)d_a, (float*)d_c);
-    else
-      hipLaunchKernel(HIP_KERNEL_NAME(copy), dim3(ARRAY_SIZE/1024), dim3(1024), 0, 0, (double*)d_a, (double*)d_c);
+    if (groups) {
+      if (useFloat)
+        hipLaunchKernel(HIP_KERNEL_NAME(copy_looper<float,1>), dim3(gridSize), dim3(groupSize), 0, 0, (float*)d_a, (float*)d_c, ARRAY_SIZE);
+      else
+        hipLaunchKernel(HIP_KERNEL_NAME(copy_looper<double,1>), dim3(gridSize), dim3(groupSize), 0, 0, (double*)d_a, (double*)d_c, ARRAY_SIZE);
+    } else {
+      if (useFloat)
+        hipLaunchKernel(HIP_KERNEL_NAME(copy), dim3(ARRAY_SIZE/groupSize), dim3(groupSize), 0, 0, (float*)d_a, (float*)d_c);
+      else
+        hipLaunchKernel(HIP_KERNEL_NAME(copy), dim3(ARRAY_SIZE/groupSize), dim3(groupSize), 0, 0, (double*)d_a, (double*)d_c);
+    }
    check_cuda_error();
    hipDeviceSynchronize();
    check_cuda_error();
@@ -239,10 +339,17 @@
    t1 = std::chrono::high_resolution_clock::now();
-    if (useFloat)
-      hipLaunchKernel(HIP_KERNEL_NAME(mul), dim3(ARRAY_SIZE/1024), dim3(1024), 0, 0, (float*)d_b, (float*)d_c);
-    else
-      hipLaunchKernel(HIP_KERNEL_NAME(mul), dim3(ARRAY_SIZE/1024), dim3(1024), 0, 0, (double*)d_b, (double*)d_c);
+    if (groups) {
+      if (useFloat)
+        hipLaunchKernel(HIP_KERNEL_NAME(mul_looper), dim3(gridSize), dim3(groupSize), 0, 0, (float*)d_b, (float*)d_c, ARRAY_SIZE);
+      else
+        hipLaunchKernel(HIP_KERNEL_NAME(mul_looper), dim3(gridSize), dim3(groupSize), 0, 0, (double*)d_b, (double*)d_c, ARRAY_SIZE);
+    } else {
+      if (useFloat)
+        hipLaunchKernel(HIP_KERNEL_NAME(mul), dim3(ARRAY_SIZE/groupSize), dim3(groupSize), 0, 0, (float*)d_b, (float*)d_c);
+      else
+        hipLaunchKernel(HIP_KERNEL_NAME(mul), dim3(ARRAY_SIZE/groupSize), dim3(groupSize), 0, 0, (double*)d_b, (double*)d_c);
+    }
    check_cuda_error();
    hipDeviceSynchronize();
    check_cuda_error();
@@ -251,10 +358,17 @@
    t1 = std::chrono::high_resolution_clock::now();
-    if (useFloat)
-      hipLaunchKernel(HIP_KERNEL_NAME(add), dim3(ARRAY_SIZE/1024), dim3(1024), 0, 0, (float*)d_a, (float*)d_b, (float*)d_c);
-    else
-      hipLaunchKernel(HIP_KERNEL_NAME(add), dim3(ARRAY_SIZE/1024), dim3(1024), 0, 0, (double*)d_a, (double*)d_b, (double*)d_c);
+    if (groups) {
+      if (useFloat)
+        hipLaunchKernel(HIP_KERNEL_NAME(add_looper), dim3(gridSize), dim3(groupSize), 0, 0, (float*)d_a, (float*)d_b, (float*)d_c, ARRAY_SIZE);
+      else
+        hipLaunchKernel(HIP_KERNEL_NAME(add_looper), dim3(gridSize), dim3(groupSize), 0, 0, (double*)d_a, (double*)d_b, (double*)d_c, ARRAY_SIZE);
+    } else {
+      if (useFloat)
+        hipLaunchKernel(HIP_KERNEL_NAME(add), dim3(ARRAY_SIZE/groupSize), dim3(groupSize), 0, 0, (float*)d_a, (float*)d_b, (float*)d_c);
+      else
+        hipLaunchKernel(HIP_KERNEL_NAME(add), dim3(ARRAY_SIZE/groupSize), dim3(groupSize), 0, 0, (double*)d_a, (double*)d_b, (double*)d_c);
+    }
    check_cuda_error();
    hipDeviceSynchronize();
    check_cuda_error();
@@ -263,10 +377,18 @@
    t1 = std::chrono::high_resolution_clock::now();
-    if (useFloat)
-      hipLaunchKernel(HIP_KERNEL_NAME(triad), dim3(ARRAY_SIZE/1024), dim3(1024), 0, 0, (float*)d_a, (float*)d_b, (float*)d_c);
-    else
-      hipLaunchKernel(HIP_KERNEL_NAME(triad), dim3(ARRAY_SIZE/1024), dim3(1024), 0, 0, (double*)d_a, (double*)d_b, (double*)d_c);
+    if (groups) {
+      if (useFloat)
+        hipLaunchKernel(HIP_KERNEL_NAME(triad_looper), dim3(gridSize), dim3(groupSize), 0, 0, (float*)d_a, (float*)d_b, (float*)d_c, ARRAY_SIZE);
+      else
+        hipLaunchKernel(HIP_KERNEL_NAME(triad_looper), dim3(gridSize), dim3(groupSize), 0, 0, (double*)d_a, (double*)d_b, (double*)d_c, ARRAY_SIZE);
+    } else {
+      if (useFloat)
+        hipLaunchKernel(HIP_KERNEL_NAME(triad), dim3(ARRAY_SIZE/groupSize), dim3(groupSize), 0, 0, (float*)d_a, (float*)d_b, (float*)d_c);
+      else
+        hipLaunchKernel(HIP_KERNEL_NAME(triad), dim3(ARRAY_SIZE/groupSize), dim3(groupSize), 0, 0, (double*)d_a, (double*)d_b, (double*)d_c);
+    }
    check_cuda_error();
    hipDeviceSynchronize();
    check_cuda_error();
@@ -316,8 +438,15 @@
    }
  }

-  for (int j = 0; j < 4; j++)
+  for (int j = 0; j < 4; j++) {
    avg[j] /= (double)(NTIMES-1);
+  }
+
+  double geomean = 1.0;
+  for (int j = 0; j < 4; j++) {
+    geomean *= (sizes[j]/min[j]);
+  }
+  geomean = pow(geomean, 0.25);

  // Display results
  std::string labels[] = {"Copy", "Mul", "Add", "Triad"};
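The added GEOMEAN is the geometric mean of the four kernels' best-case bandwidths, i.e. the fourth root of the product of sizes[j]/min[j] over the four kernels. A standalone C++ sketch, with hypothetical byte counts and minimum times standing in for the benchmark's arrays:

#include <cmath>
#include <cstdio>

int main()
{
  // Hypothetical bytes moved per kernel and best (minimum) times in seconds,
  // standing in for the benchmark's sizes[] and min[] arrays.
  double sizes[4] = {8.4e8, 8.4e8, 1.26e9, 1.26e9};  // Copy, Mul, Add, Triad
  double tmin[4]  = {3.5e-3, 3.6e-3, 5.3e-3, 5.2e-3};

  // geomean = (prod_j sizes[j]/tmin[j]) ^ (1/4), as computed above
  double geomean = 1.0;
  for (int j = 0; j < 4; j++)
    geomean *= sizes[j] / tmin[j];
  geomean = pow(geomean, 0.25);

  printf("GEOMEAN %.3f MB/s\n", 1.0e-6 * geomean);
  return 0;
}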
@@ -339,6 +468,10 @@ int main(int argc, char *argv[])
      << std::left << std::setw(12) << std::setprecision(5) << avg[j]
      << std::endl;
  }

+  std::cout
+    << std::left << std::setw(12) << "GEOMEAN"
+    << std::left << std::setw(12) << std::setprecision(3) << 1.0E-06 * geomean
+    << std::endl;

  // Free host vectors
  free(h_a);

runhip.sh (new executable file, +4 lines)
View File

@@ -0,0 +1,4 @@
+./gpu-stream-hip
+./gpu-stream-hip --groups 64 --groupSize 256
+./gpu-stream-hip --float
+./gpu-stream-hip --float --groups 64 --groupSize 256