Add looper optimization for cuda-stream.cu, remove result files
This commit is contained in:
parent
11053798ff
commit
207701219a
110
cuda-stream.cu
110
cuda-stream.cu
@ -62,6 +62,59 @@ void check_cuda_error(void)
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// looper function place more work inside each work item.
|
||||||
|
// Goal is reduce the dispatch overhead for each group, and also give more controlover the order of memory operations
|
||||||
|
template <typename T, int CLUMP_SIZE>
|
||||||
|
__global__ void
|
||||||
|
copy_looper(const T * a, T * c, int ARRAY_SIZE)
|
||||||
|
{
|
||||||
|
int offset = (blockDim.x * blockIdx.x + threadIdx.x)*CLUMP_SIZE;
|
||||||
|
int stride = blockDim.x * gridDim.x * CLUMP_SIZE;
|
||||||
|
|
||||||
|
for (int i=offset; i<ARRAY_SIZE; i+=stride) {
|
||||||
|
c[i] = a[i];
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
template <typename T>
|
||||||
|
__global__ void
|
||||||
|
mul_looper(T * b, const T * c, int ARRAY_SIZE)
|
||||||
|
{
|
||||||
|
int offset = blockDim.x * blockIdx.x + threadIdx.x;
|
||||||
|
int stride = blockDim.x * gridDim.x;
|
||||||
|
const T scalar = 3.0;
|
||||||
|
|
||||||
|
for (int i=offset; i<ARRAY_SIZE; i+=stride) {
|
||||||
|
b[i] = scalar * c[i];
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
template <typename T>
|
||||||
|
__global__ void
|
||||||
|
add_looper(const T * a, const T * b, T * c, int ARRAY_SIZE)
|
||||||
|
{
|
||||||
|
int offset = blockDim.x * blockIdx.x + threadIdx.x;
|
||||||
|
int stride = blockDim.x * gridDim.x;
|
||||||
|
|
||||||
|
for (int i=offset; i<ARRAY_SIZE; i+=stride) {
|
||||||
|
c[i] = a[i] + b[i];
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
template <typename T>
|
||||||
|
__global__ void
|
||||||
|
triad_looper( T * a, const T * b, const T * c, int ARRAY_SIZE)
|
||||||
|
{
|
||||||
|
int offset = blockDim.x * blockIdx.x + threadIdx.x;
|
||||||
|
int stride = blockDim.x * gridDim.x;
|
||||||
|
const T scalar = 3.0;
|
||||||
|
|
||||||
|
for (int i=offset; i<ARRAY_SIZE; i+=stride) {
|
||||||
|
a[i] = b[i] + scalar * c[i];
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
template <typename T>
|
template <typename T>
|
||||||
__global__ void copy(const T * a, T * c)
|
__global__ void copy(const T * a, T * c)
|
||||||
{
|
{
|
||||||
@ -106,6 +159,20 @@ int main(int argc, char *argv[])
|
|||||||
if (NTIMES < 2)
|
if (NTIMES < 2)
|
||||||
throw std::runtime_error("Chosen number of times is invalid, must be >= 2");
|
throw std::runtime_error("Chosen number of times is invalid, must be >= 2");
|
||||||
|
|
||||||
|
// Config grid size and group size for kernel launching
|
||||||
|
int gridSize;
|
||||||
|
if (groups) {
|
||||||
|
gridSize = groups * groupSize;
|
||||||
|
} else {
|
||||||
|
gridSize = ARRAY_SIZE;
|
||||||
|
}
|
||||||
|
|
||||||
|
float operationsPerWorkitem = (float)ARRAY_SIZE / (float)gridSize;
|
||||||
|
std::cout << "GridSize: " << gridSize << " work-items" << std::endl;
|
||||||
|
std::cout << "GroupSize: " << groupSize << " work-items" << std::endl;
|
||||||
|
std::cout << "Operations/Work-item: " << operationsPerWorkitem << std::endl;
|
||||||
|
if (groups) std::cout << "Using looper kernels:" << std::endl;
|
||||||
|
|
||||||
std::cout << "Precision: ";
|
std::cout << "Precision: ";
|
||||||
if (useFloat) std::cout << "float";
|
if (useFloat) std::cout << "float";
|
||||||
else std::cout << "double";
|
else std::cout << "double";
|
||||||
@ -211,6 +278,10 @@ int main(int argc, char *argv[])
|
|||||||
cudaMemcpy(d_c, h_c, ARRAY_SIZE*DATATYPE_SIZE, cudaMemcpyHostToDevice);
|
cudaMemcpy(d_c, h_c, ARRAY_SIZE*DATATYPE_SIZE, cudaMemcpyHostToDevice);
|
||||||
check_cuda_error();
|
check_cuda_error();
|
||||||
|
|
||||||
|
std::cout << "d_a=" << (void*)d_a << std::endl;
|
||||||
|
std::cout << "d_b=" << (void*)d_b << std::endl;
|
||||||
|
std::cout << "d_c=" << (void*)d_c << std::endl;
|
||||||
|
|
||||||
// Make sure the copies are finished
|
// Make sure the copies are finished
|
||||||
cudaDeviceSynchronize();
|
cudaDeviceSynchronize();
|
||||||
check_cuda_error();
|
check_cuda_error();
|
||||||
@ -226,10 +297,18 @@ int main(int argc, char *argv[])
|
|||||||
{
|
{
|
||||||
std::vector<double> times;
|
std::vector<double> times;
|
||||||
t1 = std::chrono::high_resolution_clock::now();
|
t1 = std::chrono::high_resolution_clock::now();
|
||||||
|
if (groups) {
|
||||||
|
if (useFloat)
|
||||||
|
copy_looper<float,1><<<gridSize,groupSize>>>((float*)d_a, (float*)d_c, ARRAY_SIZE);
|
||||||
|
else
|
||||||
|
copy_looper<double,1><<<gridSize,groupSize>>>((double*)d_a, (double*)d_c, ARRAY_SIZE);
|
||||||
|
} else {
|
||||||
if (useFloat)
|
if (useFloat)
|
||||||
copy<<<ARRAY_SIZE/1024, 1024>>>((float*)d_a, (float*)d_c);
|
copy<<<ARRAY_SIZE/1024, 1024>>>((float*)d_a, (float*)d_c);
|
||||||
else
|
else
|
||||||
copy<<<ARRAY_SIZE/1024, 1024>>>((double*)d_a, (double*)d_c);
|
copy<<<ARRAY_SIZE/1024, 1024>>>((double*)d_a, (double*)d_c);
|
||||||
|
}
|
||||||
|
|
||||||
check_cuda_error();
|
check_cuda_error();
|
||||||
cudaDeviceSynchronize();
|
cudaDeviceSynchronize();
|
||||||
check_cuda_error();
|
check_cuda_error();
|
||||||
@ -238,10 +317,17 @@ int main(int argc, char *argv[])
|
|||||||
|
|
||||||
|
|
||||||
t1 = std::chrono::high_resolution_clock::now();
|
t1 = std::chrono::high_resolution_clock::now();
|
||||||
|
if (groups) {
|
||||||
|
if (useFloat)
|
||||||
|
mul_looper<float><<<gridSize,groupSize>>>((float*)d_b, (float*)d_c, ARRAY_SIZE);
|
||||||
|
else
|
||||||
|
mul_looper<double><<<gridSize,groupSize>>>((double*)d_b, (double*)d_c, ARRAY_SIZE);
|
||||||
|
} else {
|
||||||
if (useFloat)
|
if (useFloat)
|
||||||
mul<<<ARRAY_SIZE/1024, 1024>>>((float*)d_b, (float*)d_c);
|
mul<<<ARRAY_SIZE/1024, 1024>>>((float*)d_b, (float*)d_c);
|
||||||
else
|
else
|
||||||
mul<<<ARRAY_SIZE/1024, 1024>>>((double*)d_b, (double*)d_c);
|
mul<<<ARRAY_SIZE/1024, 1024>>>((double*)d_b, (double*)d_c);
|
||||||
|
}
|
||||||
check_cuda_error();
|
check_cuda_error();
|
||||||
cudaDeviceSynchronize();
|
cudaDeviceSynchronize();
|
||||||
check_cuda_error();
|
check_cuda_error();
|
||||||
@ -250,10 +336,17 @@ int main(int argc, char *argv[])
|
|||||||
|
|
||||||
|
|
||||||
t1 = std::chrono::high_resolution_clock::now();
|
t1 = std::chrono::high_resolution_clock::now();
|
||||||
|
if (groups) {
|
||||||
|
if (useFloat)
|
||||||
|
add_looper<float><<<gridSize,groupSize>>>((float*)d_a, (float*)d_b, (float*)d_c, ARRAY_SIZE);
|
||||||
|
else
|
||||||
|
add_looper<double><<<gridSize,groupSize>>>((double*)d_a, (double*)d_b, (double*)d_c, ARRAY_SIZE);
|
||||||
|
} else {
|
||||||
if (useFloat)
|
if (useFloat)
|
||||||
add<<<ARRAY_SIZE/1024, 1024>>>((float*)d_a, (float*)d_b, (float*)d_c);
|
add<<<ARRAY_SIZE/1024, 1024>>>((float*)d_a, (float*)d_b, (float*)d_c);
|
||||||
else
|
else
|
||||||
add<<<ARRAY_SIZE/1024, 1024>>>((double*)d_a, (double*)d_b, (double*)d_c);
|
add<<<ARRAY_SIZE/1024, 1024>>>((double*)d_a, (double*)d_b, (double*)d_c);
|
||||||
|
}
|
||||||
check_cuda_error();
|
check_cuda_error();
|
||||||
cudaDeviceSynchronize();
|
cudaDeviceSynchronize();
|
||||||
check_cuda_error();
|
check_cuda_error();
|
||||||
@ -262,10 +355,17 @@ int main(int argc, char *argv[])
|
|||||||
|
|
||||||
|
|
||||||
t1 = std::chrono::high_resolution_clock::now();
|
t1 = std::chrono::high_resolution_clock::now();
|
||||||
|
if (groups) {
|
||||||
|
if (useFloat)
|
||||||
|
triad_looper<float><<<gridSize,groupSize>>>((float*)d_a, (float*)d_b, (float*)d_c, ARRAY_SIZE);
|
||||||
|
else
|
||||||
|
triad_looper<double><<<gridSize,groupSize>>>((double*)d_a, (double*)d_b, (double*)d_c, ARRAY_SIZE);
|
||||||
|
} else {
|
||||||
if (useFloat)
|
if (useFloat)
|
||||||
triad<<<ARRAY_SIZE/1024, 1024>>>((float*)d_a, (float*)d_b, (float*)d_c);
|
triad<<<ARRAY_SIZE/1024, 1024>>>((float*)d_a, (float*)d_b, (float*)d_c);
|
||||||
else
|
else
|
||||||
triad<<<ARRAY_SIZE/1024, 1024>>>((double*)d_a, (double*)d_b, (double*)d_c);
|
triad<<<ARRAY_SIZE/1024, 1024>>>((double*)d_a, (double*)d_b, (double*)d_c);
|
||||||
|
}
|
||||||
check_cuda_error();
|
check_cuda_error();
|
||||||
cudaDeviceSynchronize();
|
cudaDeviceSynchronize();
|
||||||
check_cuda_error();
|
check_cuda_error();
|
||||||
@ -318,6 +418,12 @@ int main(int argc, char *argv[])
|
|||||||
for (int j = 0; j < 4; j++)
|
for (int j = 0; j < 4; j++)
|
||||||
avg[j] /= (double)(NTIMES-1);
|
avg[j] /= (double)(NTIMES-1);
|
||||||
|
|
||||||
|
double geomean = 1.0;
|
||||||
|
for (int j = 0; j < 4; j++) {
|
||||||
|
geomean *= (sizes[j]/min[j]);
|
||||||
|
}
|
||||||
|
geomean = pow(geomean, 0.25);
|
||||||
|
|
||||||
// Display results
|
// Display results
|
||||||
std::string labels[] = {"Copy", "Mul", "Add", "Triad"};
|
std::string labels[] = {"Copy", "Mul", "Add", "Triad"};
|
||||||
std::cout
|
std::cout
|
||||||
@ -338,6 +444,10 @@ int main(int argc, char *argv[])
|
|||||||
<< std::left << std::setw(12) << std::setprecision(5) << avg[j]
|
<< std::left << std::setw(12) << std::setprecision(5) << avg[j]
|
||||||
<< std::endl;
|
<< std::endl;
|
||||||
}
|
}
|
||||||
|
std::cout
|
||||||
|
<< std::left << std::setw(12) << "GEOMEAN"
|
||||||
|
<< std::left << std::setw(12) << std::setprecision(3) << 1.0E-06 * geomean
|
||||||
|
<< std::endl;
|
||||||
|
|
||||||
// Free host vectors
|
// Free host vectors
|
||||||
free(h_a);
|
free(h_a);
|
||||||
|
|||||||
@ -1,15 +0,0 @@
|
|||||||
GPU-STREAM
|
|
||||||
Version: 1.0
|
|
||||||
Implementation: CUDA
|
|
||||||
Precision: double
|
|
||||||
|
|
||||||
Running kernels 10 times
|
|
||||||
Array size: 400.0 MB (=0.4 GB)
|
|
||||||
Total size: 1200.0 MB (=1.2 GB)
|
|
||||||
Using CUDA device GeForce GTX TITAN X
|
|
||||||
Driver: 7050
|
|
||||||
Function MBytes/sec Min (sec) Max Average
|
|
||||||
Copy 263155.587 0.00319 0.00319 0.00319
|
|
||||||
Mul 262943.430 0.00319 0.00319 0.00319
|
|
||||||
Add 268710.444 0.00468 0.00469 0.00469
|
|
||||||
Triad 268957.305 0.00468 0.00469 0.00468
|
|
||||||
@ -1,15 +0,0 @@
|
|||||||
GPU-STREAM
|
|
||||||
Version: 1.0
|
|
||||||
Implementation: CUDA
|
|
||||||
Precision: double
|
|
||||||
|
|
||||||
Running kernels 10 times
|
|
||||||
Array size: 400.0 MB (=0.4 GB)
|
|
||||||
Total size: 1200.0 MB (=1.2 GB)
|
|
||||||
Using CUDA device Fiji
|
|
||||||
Driver: 4
|
|
||||||
Function MBytes/sec Min (sec) Max Average
|
|
||||||
Copy 375822.410 0.00223 0.00225 0.00224
|
|
||||||
Mul 375086.879 0.00224 0.00227 0.00224
|
|
||||||
Add 425650.718 0.00296 0.00298 0.00297
|
|
||||||
Triad 424710.113 0.00296 0.00298 0.00298
|
|
||||||
@ -1,22 +0,0 @@
|
|||||||
GPU-STREAM
|
|
||||||
Version: 1.0
|
|
||||||
Implementation: HIP
|
|
||||||
GridSize: 52428800 work-items
|
|
||||||
GroupSize: 1024 work-items
|
|
||||||
Operations/Work-item: 1
|
|
||||||
Precision: double
|
|
||||||
|
|
||||||
Running kernels 10 times
|
|
||||||
Array size: 400.0 MB (=0.4 GB) 0 bytes padding
|
|
||||||
Total size: 1200.0 MB (=1.2 GB)
|
|
||||||
Using HIP device GeForce GTX TITAN X (compute_units=24)
|
|
||||||
Driver: 4
|
|
||||||
d_a=0x1306d80000
|
|
||||||
d_b=0x131fd80000
|
|
||||||
d_c=0x1338d80000
|
|
||||||
Function MBytes/sec Min (sec) Max Average
|
|
||||||
Copy 263042.207 0.00319 0.00320 0.00319
|
|
||||||
Mul 262972.033 0.00319 0.00320 0.00319
|
|
||||||
Add 268732.653 0.00468 0.00469 0.00469
|
|
||||||
Triad 268706.197 0.00468 0.00469 0.00469
|
|
||||||
GEOMEAN 265847.929
|
|
||||||
4
runcuda.sh
Executable file
4
runcuda.sh
Executable file
@ -0,0 +1,4 @@
|
|||||||
|
./gpu-stream-cuda
|
||||||
|
./gpu-stream-cuda --groups 64 --groupSize 256
|
||||||
|
./gpu-stream-cuda --float
|
||||||
|
./gpu-stream-cuda --float --groups 64 --groupSize 256
|
||||||
Loading…
Reference in New Issue
Block a user