Improved GPU-STREAM benchmark for HIP version:
1. Add optional looper kernels to take command line input for the number of groups and groupSize 2. Add GEOMEAN value calculation of the kernels 3. Instructions on configure HIP environment in the README.md 4. Add results for HIP on FIJI Nano, TITAN X; CUDA on TITAN X 5. Run script to optionally run HIP version with groups and groupSize options
This commit is contained in:
parent
fdeb20601f
commit
11053798ff
@ -15,7 +15,9 @@ Run the OpenCL version with `./gpu-stream-ocl` and the CUDA version with `./gpu-
|
|||||||
|
|
||||||
For HIP version, follow the instructions on the following blog to properly install ROCK and ROCR drivers:
|
For HIP version, follow the instructions on the following blog to properly install ROCK and ROCR drivers:
|
||||||
http://gpuopen.com/getting-started-with-boltzmann-components-platforms-installation/
|
http://gpuopen.com/getting-started-with-boltzmann-components-platforms-installation/
|
||||||
Clone from the HIP repository in the following link:
|
Install the HCC compiler:
|
||||||
|
https://bitbucket.org/multicoreware/hcc/wiki/Home
|
||||||
|
Install HIP:
|
||||||
https://github.com/GPUOpen-ProfessionalCompute-Tools/HIP
|
https://github.com/GPUOpen-ProfessionalCompute-Tools/HIP
|
||||||
|
|
||||||
Build the HIP binaries with make gpu-stream-hip, run it with './gpu-stream-hip'
|
Build the HIP binaries with make gpu-stream-hip, run it with './gpu-stream-hip'
|
||||||
|
|||||||
50
common.cpp
50
common.cpp
@ -39,10 +39,13 @@
|
|||||||
// Default array size 50 * 2^20 (50*8 Mebibytes double precision)
|
// Default array size 50 * 2^20 (50*8 Mebibytes double precision)
|
||||||
// Use binary powers of two so divides 1024
|
// Use binary powers of two so divides 1024
|
||||||
unsigned int ARRAY_SIZE = 52428800;
|
unsigned int ARRAY_SIZE = 52428800;
|
||||||
|
size_t ARRAY_PAD_BYTES = 0;
|
||||||
|
|
||||||
unsigned int NTIMES = 10;
|
unsigned int NTIMES = 10;
|
||||||
|
|
||||||
bool useFloat = false;
|
bool useFloat = false;
|
||||||
|
unsigned int groups = 0;
|
||||||
|
unsigned int groupSize = 1024;
|
||||||
|
|
||||||
unsigned int deviceIndex = 0;
|
unsigned int deviceIndex = 0;
|
||||||
|
|
||||||
@ -53,6 +56,25 @@ int parseUInt(const char *str, unsigned int *output)
|
|||||||
return !strlen(next);
|
return !strlen(next);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
int parseSize(const char *str, size_t *output)
|
||||||
|
{
|
||||||
|
char *next;
|
||||||
|
*output = strtoull(str, &next, 0);
|
||||||
|
int l = strlen(str);
|
||||||
|
if (l) {
|
||||||
|
char c = str[l-1]; // last char.
|
||||||
|
if ((c == 'k') || (c == 'K')) {
|
||||||
|
*output *= 1024;
|
||||||
|
}
|
||||||
|
if ((c == 'm') || (c == 'M')) {
|
||||||
|
*output *= (1024*1024);
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
||||||
|
return !strlen(next);
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
void parseArguments(int argc, char *argv[])
|
void parseArguments(int argc, char *argv[])
|
||||||
{
|
{
|
||||||
for (int i = 1; i < argc; i++)
|
for (int i = 1; i < argc; i++)
|
||||||
@ -86,6 +108,31 @@ void parseArguments(int argc, char *argv[])
|
|||||||
exit(1);
|
exit(1);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
else if (!strcmp(argv[i], "--groups"))
|
||||||
|
{
|
||||||
|
if (++i >= argc || !parseUInt(argv[i], &groups))
|
||||||
|
{
|
||||||
|
std::cout << "Invalid group number" << std::endl;
|
||||||
|
exit(1);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
else if (!strcmp(argv[i], "--groupSize"))
|
||||||
|
{
|
||||||
|
if (++i >= argc || !parseUInt(argv[i], &groupSize))
|
||||||
|
{
|
||||||
|
std::cout << "Invalid group size" << std::endl;
|
||||||
|
exit(1);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
else if (!strcmp(argv[i], "--pad"))
|
||||||
|
{
|
||||||
|
if (++i >= argc || !parseSize(argv[i], &ARRAY_PAD_BYTES))
|
||||||
|
{
|
||||||
|
std::cout << "Invalid size" << std::endl;
|
||||||
|
exit(1);
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
||||||
else if (!strcmp(argv[i], "--float"))
|
else if (!strcmp(argv[i], "--float"))
|
||||||
{
|
{
|
||||||
useFloat = true;
|
useFloat = true;
|
||||||
@ -101,6 +148,9 @@ void parseArguments(int argc, char *argv[])
|
|||||||
std::cout << " --device INDEX Select device at INDEX" << std::endl;
|
std::cout << " --device INDEX Select device at INDEX" << std::endl;
|
||||||
std::cout << " -s --arraysize SIZE Use SIZE elements in the array" << std::endl;
|
std::cout << " -s --arraysize SIZE Use SIZE elements in the array" << std::endl;
|
||||||
std::cout << " -n --numtimes NUM Run the test NUM times (NUM >= 2)" << std::endl;
|
std::cout << " -n --numtimes NUM Run the test NUM times (NUM >= 2)" << std::endl;
|
||||||
|
std::cout << " --groups Set number of groups to launch - each work-item proceses multiple array items" << std::endl;
|
||||||
|
std::cout << " --groupSize Set size of each group (default 1024)" << std::endl;
|
||||||
|
std::cout << " --pad Add additional array padding. Can use trailing K (KB) or M (MB)" << std::endl;
|
||||||
std::cout << " --float Use floats (rather than doubles)" << std::endl;
|
std::cout << " --float Use floats (rather than doubles)" << std::endl;
|
||||||
std::cout << std::endl;
|
std::cout << std::endl;
|
||||||
exit(0);
|
exit(0);
|
||||||
|
|||||||
3
common.h
3
common.h
@ -48,8 +48,11 @@ extern void parseArguments(int argc, char *argv[]);
|
|||||||
extern void listDevices(void);
|
extern void listDevices(void);
|
||||||
|
|
||||||
extern unsigned int ARRAY_SIZE;
|
extern unsigned int ARRAY_SIZE;
|
||||||
|
extern size_t ARRAY_PAD_BYTES;
|
||||||
extern unsigned int NTIMES;
|
extern unsigned int NTIMES;
|
||||||
|
|
||||||
|
extern unsigned int groups;
|
||||||
|
extern unsigned int groupSize;
|
||||||
extern bool useFloat;
|
extern bool useFloat;
|
||||||
|
|
||||||
extern unsigned int deviceIndex;
|
extern unsigned int deviceIndex;
|
||||||
|
|||||||
213
hip-stream.cpp
213
hip-stream.cpp
@ -43,7 +43,7 @@
|
|||||||
#include <cfloat>
|
#include <cfloat>
|
||||||
#include <cmath>
|
#include <cmath>
|
||||||
|
|
||||||
#include <cuda.h>
|
//#include <cuda.h>
|
||||||
#include "common.h"
|
#include "common.h"
|
||||||
|
|
||||||
std::string getDeviceName(int device);
|
std::string getDeviceName(int device);
|
||||||
@ -63,15 +63,75 @@ void check_cuda_error(void)
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
// looper function place more work inside each work item.
|
||||||
|
// Goal is reduce the dispatch overhead for each group, and also give more controlover the order of memory operations
|
||||||
|
template <typename T, int CLUMP_SIZE>
|
||||||
|
__global__ void
|
||||||
|
copy_looper(hipLaunchParm lp, const T * a, T * c, int ARRAY_SIZE)
|
||||||
|
{
|
||||||
|
int offset = (hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x)*CLUMP_SIZE;
|
||||||
|
int stride = hipBlockDim_x * hipGridDim_x * CLUMP_SIZE;
|
||||||
|
|
||||||
|
for (int i=offset; i<ARRAY_SIZE; i+=stride) {
|
||||||
|
c[i] = a[i];
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
template <typename T>
|
template <typename T>
|
||||||
__global__ void copy(hipLaunchParm lp, const T * a, T * c)
|
__global__ void
|
||||||
|
mul_looper(hipLaunchParm lp, T * b, const T * c, int ARRAY_SIZE)
|
||||||
|
{
|
||||||
|
int offset = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;
|
||||||
|
int stride = hipBlockDim_x * hipGridDim_x;
|
||||||
|
const T scalar = 3.0;
|
||||||
|
|
||||||
|
for (int i=offset; i<ARRAY_SIZE; i+=stride) {
|
||||||
|
b[i] = scalar * c[i];
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
template <typename T>
|
||||||
|
__global__ void
|
||||||
|
add_looper(hipLaunchParm lp, const T * a, const T * b, T * c, int ARRAY_SIZE)
|
||||||
|
{
|
||||||
|
int offset = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;
|
||||||
|
int stride = hipBlockDim_x * hipGridDim_x;
|
||||||
|
|
||||||
|
for (int i=offset; i<ARRAY_SIZE; i+=stride) {
|
||||||
|
c[i] = a[i] + b[i];
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
template <typename T>
|
||||||
|
__global__ void
|
||||||
|
triad_looper(hipLaunchParm lp, T * a, const T * b, const T * c, int ARRAY_SIZE)
|
||||||
|
{
|
||||||
|
int offset = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;
|
||||||
|
int stride = hipBlockDim_x * hipGridDim_x;
|
||||||
|
const T scalar = 3.0;
|
||||||
|
|
||||||
|
for (int i=offset; i<ARRAY_SIZE; i+=stride) {
|
||||||
|
a[i] = b[i] + scalar * c[i];
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
template <typename T>
|
||||||
|
__global__ void
|
||||||
|
copy(hipLaunchParm lp, const T * a, T * c)
|
||||||
{
|
{
|
||||||
const int i = hipBlockDim_x * hipBlockIdx_x + hipThreadIdx_x;
|
const int i = hipBlockDim_x * hipBlockIdx_x + hipThreadIdx_x;
|
||||||
c[i] = a[i];
|
c[i] = a[i];
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
template <typename T>
|
template <typename T>
|
||||||
__global__ void mul(hipLaunchParm lp, T * b, const T * c)
|
__global__ void
|
||||||
|
mul(hipLaunchParm lp, T * b, const T * c)
|
||||||
{
|
{
|
||||||
const T scalar = 3.0;
|
const T scalar = 3.0;
|
||||||
const int i = hipBlockDim_x * hipBlockIdx_x + hipThreadIdx_x;
|
const int i = hipBlockDim_x * hipBlockIdx_x + hipThreadIdx_x;
|
||||||
@ -79,14 +139,16 @@ __global__ void mul(hipLaunchParm lp, T * b, const T * c)
|
|||||||
}
|
}
|
||||||
|
|
||||||
template <typename T>
|
template <typename T>
|
||||||
__global__ void add(hipLaunchParm lp, const T * a, const T * b, T * c)
|
__global__ void
|
||||||
|
add(hipLaunchParm lp, const T * a, const T * b, T * c)
|
||||||
{
|
{
|
||||||
const int i = hipBlockDim_x * hipBlockIdx_x + hipThreadIdx_x;
|
const int i = hipBlockDim_x * hipBlockIdx_x + hipThreadIdx_x;
|
||||||
c[i] = a[i] + b[i];
|
c[i] = a[i] + b[i];
|
||||||
}
|
}
|
||||||
|
|
||||||
template <typename T>
|
template <typename T>
|
||||||
__global__ void triad(hipLaunchParm lp, T * a, const T * b, const T * c)
|
__global__ void
|
||||||
|
triad(hipLaunchParm lp, T * a, const T * b, const T * c)
|
||||||
{
|
{
|
||||||
const T scalar = 3.0;
|
const T scalar = 3.0;
|
||||||
const int i = hipBlockDim_x * hipBlockIdx_x + hipThreadIdx_x;
|
const int i = hipBlockDim_x * hipBlockIdx_x + hipThreadIdx_x;
|
||||||
@ -100,13 +162,27 @@ int main(int argc, char *argv[])
|
|||||||
std::cout
|
std::cout
|
||||||
<< "GPU-STREAM" << std::endl
|
<< "GPU-STREAM" << std::endl
|
||||||
<< "Version: " << VERSION_STRING << std::endl
|
<< "Version: " << VERSION_STRING << std::endl
|
||||||
<< "Implementation: CUDA" << std::endl;
|
<< "Implementation: HIP" << std::endl;
|
||||||
|
|
||||||
parseArguments(argc, argv);
|
parseArguments(argc, argv);
|
||||||
|
|
||||||
if (NTIMES < 2)
|
if (NTIMES < 2)
|
||||||
throw std::runtime_error("Chosen number of times is invalid, must be >= 2");
|
throw std::runtime_error("Chosen number of times is invalid, must be >= 2");
|
||||||
|
|
||||||
|
// Config grid size and group size for kernel launching
|
||||||
|
int gridSize;
|
||||||
|
if (groups) {
|
||||||
|
gridSize = groups * groupSize;
|
||||||
|
} else {
|
||||||
|
gridSize = ARRAY_SIZE;
|
||||||
|
}
|
||||||
|
|
||||||
|
float operationsPerWorkitem = (float)ARRAY_SIZE / (float)gridSize;
|
||||||
|
std::cout << "GridSize: " << gridSize << " work-items" << std::endl;
|
||||||
|
std::cout << "GroupSize: " << groupSize << " work-items" << std::endl;
|
||||||
|
std::cout << "Operations/Work-item: " << operationsPerWorkitem << std::endl;
|
||||||
|
if (groups) std::cout << "Using looper kernels:" << std::endl;
|
||||||
|
|
||||||
std::cout << "Precision: ";
|
std::cout << "Precision: ";
|
||||||
if (useFloat) std::cout << "float";
|
if (useFloat) std::cout << "float";
|
||||||
else std::cout << "double";
|
else std::cout << "double";
|
||||||
@ -144,9 +220,10 @@ int main(int argc, char *argv[])
|
|||||||
std::cout << std::setprecision(1) << std::fixed
|
std::cout << std::setprecision(1) << std::fixed
|
||||||
<< "Array size: " << ARRAY_SIZE*DATATYPE_SIZE/1024.0/1024.0 << " MB"
|
<< "Array size: " << ARRAY_SIZE*DATATYPE_SIZE/1024.0/1024.0 << " MB"
|
||||||
<< " (=" << ARRAY_SIZE*DATATYPE_SIZE/1024.0/1024.0/1024.0 << " GB)"
|
<< " (=" << ARRAY_SIZE*DATATYPE_SIZE/1024.0/1024.0/1024.0 << " GB)"
|
||||||
|
<< " " << ARRAY_PAD_BYTES << " bytes padding"
|
||||||
<< std::endl;
|
<< std::endl;
|
||||||
std::cout << "Total size: " << 3.0*ARRAY_SIZE*DATATYPE_SIZE/1024.0/1024.0 << " MB"
|
std::cout << "Total size: " << 3.0*(ARRAY_SIZE*DATATYPE_SIZE + ARRAY_PAD_BYTES) /1024.0/1024.0 << " MB"
|
||||||
<< " (=" << 3.0*ARRAY_SIZE*DATATYPE_SIZE/1024.0/1024.0/1024.0 << " GB)"
|
<< " (=" << 3.0*(ARRAY_SIZE*DATATYPE_SIZE + ARRAY_PAD_BYTES) /1024.0/1024.0/1024.0 << " GB)"
|
||||||
<< std::endl;
|
<< std::endl;
|
||||||
|
|
||||||
// Reset precision
|
// Reset precision
|
||||||
@ -161,24 +238,31 @@ int main(int argc, char *argv[])
|
|||||||
hipSetDevice(deviceIndex);
|
hipSetDevice(deviceIndex);
|
||||||
check_cuda_error();
|
check_cuda_error();
|
||||||
|
|
||||||
// Print out device name
|
|
||||||
std::cout << "Using CUDA device " << getDeviceName(deviceIndex) << std::endl;
|
|
||||||
|
|
||||||
// Print out device CUDA driver version
|
|
||||||
std::cout << "Driver: " << getDriver() << std::endl;
|
|
||||||
|
|
||||||
// Check buffers fit on the device
|
|
||||||
hipDeviceProp_t props;
|
hipDeviceProp_t props;
|
||||||
hipGetDeviceProperties(&props, deviceIndex);
|
hipGetDeviceProperties(&props, deviceIndex);
|
||||||
|
|
||||||
|
// Print out device name
|
||||||
|
std::cout << "Using HIP device " << getDeviceName(deviceIndex) << " (compute_units=" << props.multiProcessorCount << ")" << std::endl;
|
||||||
|
|
||||||
|
// Print out device HIP driver version
|
||||||
|
std::cout << "Driver: " << getDriver() << std::endl;
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
// Check buffers fit on the device
|
||||||
if (props.totalGlobalMem < 3*DATATYPE_SIZE*ARRAY_SIZE)
|
if (props.totalGlobalMem < 3*DATATYPE_SIZE*ARRAY_SIZE)
|
||||||
throw std::runtime_error("Device does not have enough memory for all 3 buffers");
|
throw std::runtime_error("Device does not have enough memory for all 3 buffers");
|
||||||
|
|
||||||
// Create host vectors
|
//int cus = props.multiProcessorCount;
|
||||||
void * h_a = malloc(ARRAY_SIZE*DATATYPE_SIZE);
|
|
||||||
void * h_b = malloc(ARRAY_SIZE*DATATYPE_SIZE);
|
|
||||||
void * h_c = malloc(ARRAY_SIZE*DATATYPE_SIZE);
|
|
||||||
|
|
||||||
// Initilise arrays
|
// Create host vectors
|
||||||
|
void * h_a = malloc(ARRAY_SIZE*DATATYPE_SIZE );
|
||||||
|
void * h_b = malloc(ARRAY_SIZE*DATATYPE_SIZE );
|
||||||
|
void * h_c = malloc(ARRAY_SIZE*DATATYPE_SIZE );
|
||||||
|
|
||||||
|
// Initialise arrays
|
||||||
for (unsigned int i = 0; i < ARRAY_SIZE; i++)
|
for (unsigned int i = 0; i < ARRAY_SIZE; i++)
|
||||||
{
|
{
|
||||||
if (useFloat)
|
if (useFloat)
|
||||||
@ -196,12 +280,14 @@ int main(int argc, char *argv[])
|
|||||||
}
|
}
|
||||||
|
|
||||||
// Create device buffers
|
// Create device buffers
|
||||||
void * d_a, * d_b, *d_c;
|
char * d_a, * d_b, *d_c;
|
||||||
hipMalloc(&d_a, ARRAY_SIZE*DATATYPE_SIZE);
|
hipMalloc(&d_a, ARRAY_SIZE*DATATYPE_SIZE + ARRAY_PAD_BYTES);
|
||||||
check_cuda_error();
|
check_cuda_error();
|
||||||
hipMalloc(&d_b, ARRAY_SIZE*DATATYPE_SIZE);
|
hipMalloc(&d_b, ARRAY_SIZE*DATATYPE_SIZE + ARRAY_PAD_BYTES);
|
||||||
|
d_b += ARRAY_PAD_BYTES;
|
||||||
check_cuda_error();
|
check_cuda_error();
|
||||||
hipMalloc(&d_c, ARRAY_SIZE*DATATYPE_SIZE);
|
hipMalloc(&d_c, ARRAY_SIZE*DATATYPE_SIZE + ARRAY_PAD_BYTES);
|
||||||
|
d_c += ARRAY_PAD_BYTES;
|
||||||
check_cuda_error();
|
check_cuda_error();
|
||||||
|
|
||||||
// Copy host memory to device
|
// Copy host memory to device
|
||||||
@ -212,10 +298,17 @@ int main(int argc, char *argv[])
|
|||||||
hipMemcpy(d_c, h_c, ARRAY_SIZE*DATATYPE_SIZE, hipMemcpyHostToDevice);
|
hipMemcpy(d_c, h_c, ARRAY_SIZE*DATATYPE_SIZE, hipMemcpyHostToDevice);
|
||||||
check_cuda_error();
|
check_cuda_error();
|
||||||
|
|
||||||
|
|
||||||
|
std::cout << "d_a=" << (void*)d_a << std::endl;
|
||||||
|
std::cout << "d_b=" << (void*)d_b << std::endl;
|
||||||
|
std::cout << "d_c=" << (void*)d_c << std::endl;
|
||||||
|
|
||||||
// Make sure the copies are finished
|
// Make sure the copies are finished
|
||||||
hipDeviceSynchronize();
|
hipDeviceSynchronize();
|
||||||
check_cuda_error();
|
check_cuda_error();
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
// List of times
|
// List of times
|
||||||
std::vector< std::vector<double> > timings;
|
std::vector< std::vector<double> > timings;
|
||||||
|
|
||||||
@ -227,10 +320,17 @@ int main(int argc, char *argv[])
|
|||||||
{
|
{
|
||||||
std::vector<double> times;
|
std::vector<double> times;
|
||||||
t1 = std::chrono::high_resolution_clock::now();
|
t1 = std::chrono::high_resolution_clock::now();
|
||||||
if (useFloat)
|
if (groups) {
|
||||||
hipLaunchKernel(HIP_KERNEL_NAME(copy), dim3(ARRAY_SIZE/1024), dim3(1024), 0, 0, (float*)d_a, (float*)d_c);
|
if (useFloat)
|
||||||
else
|
hipLaunchKernel(HIP_KERNEL_NAME(copy_looper<float,1>), dim3(gridSize), dim3(groupSize), 0, 0, (float*)d_a, (float*)d_c, ARRAY_SIZE);
|
||||||
hipLaunchKernel(HIP_KERNEL_NAME(copy), dim3(ARRAY_SIZE/1024), dim3(1024), 0, 0, (double*)d_a, (double*)d_c);
|
else
|
||||||
|
hipLaunchKernel(HIP_KERNEL_NAME(copy_looper<double,1>), dim3(gridSize), dim3(groupSize), 0, 0, (double*)d_a, (double*)d_c, ARRAY_SIZE);
|
||||||
|
} else {
|
||||||
|
if (useFloat)
|
||||||
|
hipLaunchKernel(HIP_KERNEL_NAME(copy), dim3(ARRAY_SIZE/groupSize), dim3(groupSize), 0, 0, (float*)d_a, (float*)d_c);
|
||||||
|
else
|
||||||
|
hipLaunchKernel(HIP_KERNEL_NAME(copy), dim3(ARRAY_SIZE/groupSize), dim3(groupSize), 0, 0, (double*)d_a, (double*)d_c);
|
||||||
|
}
|
||||||
check_cuda_error();
|
check_cuda_error();
|
||||||
hipDeviceSynchronize();
|
hipDeviceSynchronize();
|
||||||
check_cuda_error();
|
check_cuda_error();
|
||||||
@ -239,10 +339,17 @@ int main(int argc, char *argv[])
|
|||||||
|
|
||||||
|
|
||||||
t1 = std::chrono::high_resolution_clock::now();
|
t1 = std::chrono::high_resolution_clock::now();
|
||||||
if (useFloat)
|
if (groups) {
|
||||||
hipLaunchKernel(HIP_KERNEL_NAME(mul), dim3(ARRAY_SIZE/1024), dim3(1024), 0, 0, (float*)d_b, (float*)d_c);
|
if (useFloat)
|
||||||
else
|
hipLaunchKernel(HIP_KERNEL_NAME(mul_looper), dim3(gridSize), dim3(groupSize), 0, 0, (float*)d_b, (float*)d_c, ARRAY_SIZE);
|
||||||
hipLaunchKernel(HIP_KERNEL_NAME(mul), dim3(ARRAY_SIZE/1024), dim3(1024), 0, 0, (double*)d_b, (double*)d_c);
|
else
|
||||||
|
hipLaunchKernel(HIP_KERNEL_NAME(mul_looper), dim3(gridSize), dim3(groupSize), 0, 0, (double*)d_b, (double*)d_c, ARRAY_SIZE);
|
||||||
|
} else {
|
||||||
|
if (useFloat)
|
||||||
|
hipLaunchKernel(HIP_KERNEL_NAME(mul), dim3(ARRAY_SIZE/groupSize), dim3(groupSize), 0, 0, (float*)d_b, (float*)d_c);
|
||||||
|
else
|
||||||
|
hipLaunchKernel(HIP_KERNEL_NAME(mul), dim3(ARRAY_SIZE/groupSize), dim3(groupSize), 0, 0, (double*)d_b, (double*)d_c);
|
||||||
|
}
|
||||||
check_cuda_error();
|
check_cuda_error();
|
||||||
hipDeviceSynchronize();
|
hipDeviceSynchronize();
|
||||||
check_cuda_error();
|
check_cuda_error();
|
||||||
@ -251,10 +358,17 @@ int main(int argc, char *argv[])
|
|||||||
|
|
||||||
|
|
||||||
t1 = std::chrono::high_resolution_clock::now();
|
t1 = std::chrono::high_resolution_clock::now();
|
||||||
if (useFloat)
|
if (groups) {
|
||||||
hipLaunchKernel(HIP_KERNEL_NAME(add), dim3(ARRAY_SIZE/1024), dim3(1024), 0, 0, (float*)d_a, (float*)d_b, (float*)d_c);
|
if (useFloat)
|
||||||
else
|
hipLaunchKernel(HIP_KERNEL_NAME(add_looper), dim3(gridSize), dim3(groupSize), 0, 0, (float*)d_a, (float*)d_b, (float*)d_c, ARRAY_SIZE);
|
||||||
hipLaunchKernel(HIP_KERNEL_NAME(add), dim3(ARRAY_SIZE/1024), dim3(1024), 0, 0, (double*)d_a, (double*)d_b, (double*)d_c);
|
else
|
||||||
|
hipLaunchKernel(HIP_KERNEL_NAME(add_looper), dim3(gridSize), dim3(groupSize), 0, 0, (double*)d_a, (double*)d_b, (double*)d_c, ARRAY_SIZE);
|
||||||
|
} else {
|
||||||
|
if (useFloat)
|
||||||
|
hipLaunchKernel(HIP_KERNEL_NAME(add), dim3(ARRAY_SIZE/groupSize), dim3(groupSize), 0, 0, (float*)d_a, (float*)d_b, (float*)d_c);
|
||||||
|
else
|
||||||
|
hipLaunchKernel(HIP_KERNEL_NAME(add), dim3(ARRAY_SIZE/groupSize), dim3(groupSize), 0, 0, (double*)d_a, (double*)d_b, (double*)d_c);
|
||||||
|
}
|
||||||
check_cuda_error();
|
check_cuda_error();
|
||||||
hipDeviceSynchronize();
|
hipDeviceSynchronize();
|
||||||
check_cuda_error();
|
check_cuda_error();
|
||||||
@ -263,10 +377,18 @@ int main(int argc, char *argv[])
|
|||||||
|
|
||||||
|
|
||||||
t1 = std::chrono::high_resolution_clock::now();
|
t1 = std::chrono::high_resolution_clock::now();
|
||||||
if (useFloat)
|
if (groups) {
|
||||||
hipLaunchKernel(HIP_KERNEL_NAME(triad), dim3(ARRAY_SIZE/1024), dim3(1024), 0, 0, (float*)d_a, (float*)d_b, (float*)d_c);
|
if (useFloat)
|
||||||
else
|
hipLaunchKernel(HIP_KERNEL_NAME(triad_looper), dim3(gridSize), dim3(groupSize), 0, 0, (float*)d_a, (float*)d_b, (float*)d_c, ARRAY_SIZE);
|
||||||
hipLaunchKernel(HIP_KERNEL_NAME(triad), dim3(ARRAY_SIZE/1024), dim3(1024), 0, 0, (double*)d_a, (double*)d_b, (double*)d_c);
|
else
|
||||||
|
hipLaunchKernel(HIP_KERNEL_NAME(triad_looper), dim3(gridSize), dim3(groupSize), 0, 0, (double*)d_a, (double*)d_b, (double*)d_c, ARRAY_SIZE);
|
||||||
|
} else {
|
||||||
|
if (useFloat)
|
||||||
|
hipLaunchKernel(HIP_KERNEL_NAME(triad), dim3(ARRAY_SIZE/groupSize), dim3(groupSize), 0, 0, (float*)d_a, (float*)d_b, (float*)d_c);
|
||||||
|
else
|
||||||
|
hipLaunchKernel(HIP_KERNEL_NAME(triad), dim3(ARRAY_SIZE/groupSize), dim3(groupSize), 0, 0, (double*)d_a, (double*)d_b, (double*)d_c);
|
||||||
|
}
|
||||||
|
|
||||||
check_cuda_error();
|
check_cuda_error();
|
||||||
hipDeviceSynchronize();
|
hipDeviceSynchronize();
|
||||||
check_cuda_error();
|
check_cuda_error();
|
||||||
@ -316,8 +438,15 @@ int main(int argc, char *argv[])
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
for (int j = 0; j < 4; j++)
|
for (int j = 0; j < 4; j++) {
|
||||||
avg[j] /= (double)(NTIMES-1);
|
avg[j] /= (double)(NTIMES-1);
|
||||||
|
}
|
||||||
|
|
||||||
|
double geomean = 1.0;
|
||||||
|
for (int j = 0; j < 4; j++) {
|
||||||
|
geomean *= (sizes[j]/min[j]);
|
||||||
|
}
|
||||||
|
geomean = pow(geomean, 0.25);
|
||||||
|
|
||||||
// Display results
|
// Display results
|
||||||
std::string labels[] = {"Copy", "Mul", "Add", "Triad"};
|
std::string labels[] = {"Copy", "Mul", "Add", "Triad"};
|
||||||
@ -339,6 +468,10 @@ int main(int argc, char *argv[])
|
|||||||
<< std::left << std::setw(12) << std::setprecision(5) << avg[j]
|
<< std::left << std::setw(12) << std::setprecision(5) << avg[j]
|
||||||
<< std::endl;
|
<< std::endl;
|
||||||
}
|
}
|
||||||
|
std::cout
|
||||||
|
<< std::left << std::setw(12) << "GEOMEAN"
|
||||||
|
<< std::left << std::setw(12) << std::setprecision(3) << 1.0E-06 * geomean
|
||||||
|
<< std::endl;
|
||||||
|
|
||||||
// Free host vectors
|
// Free host vectors
|
||||||
free(h_a);
|
free(h_a);
|
||||||
|
|||||||
Loading…
Reference in New Issue
Block a user