From 5ae613519da7eff7150d7383b7fe4dea39a58d51 Mon Sep 17 00:00:00 2001 From: Tom Deakin Date: Mon, 24 Oct 2016 13:19:31 +0100 Subject: [PATCH 01/11] Change the value of scalar, and specify in a #define --- Stream.h | 7 +++++++ main.cpp | 15 +++++++-------- 2 files changed, 14 insertions(+), 8 deletions(-) diff --git a/Stream.h b/Stream.h index 671289e..631e305 100644 --- a/Stream.h +++ b/Stream.h @@ -10,6 +10,12 @@ #include #include +// Array values +#define startA (0.1) +#define startB (0.2) +#define startC (0.0) +#define startScalar (0.4) + template class Stream { @@ -35,3 +41,4 @@ class Stream void listDevices(void); std::string getDeviceName(const int); std::string getDeviceDriver(const int); + diff --git a/main.cpp b/main.cpp index 87ff9b2..6a15aa7 100644 --- a/main.cpp +++ b/main.cpp @@ -44,7 +44,6 @@ unsigned int num_times = 100; unsigned int deviceIndex = 0; bool use_float = false; - template void check_solution(const unsigned int ntimes, std::vector& a, std::vector& b, std::vector& c); @@ -85,9 +84,9 @@ void run() std::cout << "Precision: double" << std::endl; // Create host vectors - std::vector a(ARRAY_SIZE, 0.1); - std::vector b(ARRAY_SIZE, 0.2); - std::vector c(ARRAY_SIZE, 0.0); + std::vector a(ARRAY_SIZE, startA); + std::vector b(ARRAY_SIZE, startB); + std::vector c(ARRAY_SIZE, startC); std::streamsize ss = std::cout.precision(); std::cout << std::setprecision(1) << std::fixed << "Array size: " << ARRAY_SIZE*sizeof(T)*1.0E-6 << " MB" @@ -222,11 +221,11 @@ template void check_solution(const unsigned int ntimes, std::vector& a, std::vector& b, std::vector& c) { // Generate correct solution - T goldA = 0.1; - T goldB = 0.2; - T goldC = 0.0; + T goldA = startA; + T goldB = startB; + T goldC = startC; - const T scalar = 0.3; + const T scalar = startScalar; for (unsigned int i = 0; i < ntimes; i++) { From 5b1e67f666c640372cc19e0c7b30bb20f29fcba1 Mon Sep 17 00:00:00 2001 From: Tom Deakin Date: Mon, 24 Oct 2016 13:19:54 +0100 Subject: [PATCH 02/11] [CUDA] Use new value of scalar --- CUDAStream.cu | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/CUDAStream.cu b/CUDAStream.cu index 802bb05..ff2ec41 100644 --- a/CUDAStream.cu +++ b/CUDAStream.cu @@ -118,7 +118,7 @@ void CUDAStream::copy() template __global__ void mul_kernel(T * b, const T * c) { - const T scalar = 0.3; + const T scalar = startScalar; const int i = blockDim.x * blockIdx.x + threadIdx.x; b[i] = scalar * c[i]; } @@ -151,7 +151,7 @@ void CUDAStream::add() template __global__ void triad_kernel(T * a, const T * b, const T * c) { - const T scalar = 0.3; + const T scalar = startScalar; const int i = blockDim.x * blockIdx.x + threadIdx.x; a[i] = b[i] + scalar * c[i]; } From 7a81b63fbf7e5c282106b4df07749954b080a6bd Mon Sep 17 00:00:00 2001 From: Tom Deakin Date: Mon, 24 Oct 2016 13:21:47 +0100 Subject: [PATCH 03/11] [OMP3] Use global defined scalar value --- OMP3Stream.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/OMP3Stream.cpp b/OMP3Stream.cpp index fe8323a..f578c7c 100644 --- a/OMP3Stream.cpp +++ b/OMP3Stream.cpp @@ -62,7 +62,7 @@ void OMP3Stream::copy() template void OMP3Stream::mul() { - const T scalar = 0.3; + const T scalar = startScalar; #pragma omp parallel for for (int i = 0; i < array_size; i++) { @@ -83,7 +83,7 @@ void OMP3Stream::add() template void OMP3Stream::triad() { - const T scalar = 0.3; + const T scalar = startScalar; #pragma omp parallel for for (int i = 0; i < array_size; i++) { From b120acaf87eb59d014f8975cd6b6ee1ab7f912e3 Mon Sep 17 00:00:00 2001 From: Tom Deakin Date: Mon, 24 Oct 2016 13:23:20 +0100 Subject: [PATCH 04/11] [OMP45] Use global defined scalar value --- OMP45Stream.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/OMP45Stream.cpp b/OMP45Stream.cpp index f849c39..8f684e2 100644 --- a/OMP45Stream.cpp +++ b/OMP45Stream.cpp @@ -70,7 +70,7 @@ void OMP45Stream::copy() template void OMP45Stream::mul() { - const T scalar = 0.3; + const T scalar = startScalar; unsigned int array_size = this->array_size; T *b = this->b; @@ -99,7 +99,7 @@ void OMP45Stream::add() template void OMP45Stream::triad() { - const T scalar = 0.3; + const T scalar = startScalar; unsigned int array_size = this->array_size; T *a = this->a; From ac6158fa31fb41372750ad05d031df5edc6f0315 Mon Sep 17 00:00:00 2001 From: Tom Deakin Date: Mon, 24 Oct 2016 13:24:11 +0100 Subject: [PATCH 05/11] [OpenACC] Use global defined scalar value --- ACCStream.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/ACCStream.cpp b/ACCStream.cpp index d3fbd6a..bd49663 100644 --- a/ACCStream.cpp +++ b/ACCStream.cpp @@ -71,7 +71,7 @@ void ACCStream::copy() template void ACCStream::mul() { - const T scalar = 0.3; + const T scalar = startScalar; unsigned int array_size = this->array_size; T * restrict b = this->b; @@ -100,7 +100,7 @@ void ACCStream::add() template void ACCStream::triad() { - const T scalar = 0.3; + const T scalar = startScalar; unsigned int array_size = this->array_size; T * restrict a = this->a; From d1bebf12d9a94da0db3208530389333e8476e777 Mon Sep 17 00:00:00 2001 From: Tom Deakin Date: Mon, 24 Oct 2016 13:25:54 +0100 Subject: [PATCH 06/11] [Kokkos] Use global defined scalar value --- KOKKOSStream.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/KOKKOSStream.cpp b/KOKKOSStream.cpp index d73f7d5..94ac7ee 100644 --- a/KOKKOSStream.cpp +++ b/KOKKOSStream.cpp @@ -84,7 +84,7 @@ void KOKKOSStream::mul() View b(*d_b); View c(*d_c); - const T scalar = 0.3; + const T scalar = startScalar; parallel_for(array_size, KOKKOS_LAMBDA (const int index) { b[index] = scalar*c[index]; @@ -113,7 +113,7 @@ void KOKKOSStream::triad() View b(*d_b); View c(*d_c); - const T scalar = 0.3; + const T scalar = startScalar; parallel_for(array_size, KOKKOS_LAMBDA (const int index) { a[index] = b[index] + scalar*c[index]; From b54d94b82d8de9c9d4271bb6ee3e5c1bdb8563d5 Mon Sep 17 00:00:00 2001 From: Tom Deakin Date: Mon, 24 Oct 2016 13:30:55 +0100 Subject: [PATCH 07/11] [RAJA] Use global defined scalar value --- RAJAStream.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/RAJAStream.cpp b/RAJAStream.cpp index e418f09..0ee2390 100644 --- a/RAJAStream.cpp +++ b/RAJAStream.cpp @@ -77,7 +77,7 @@ void RAJAStream::mul() { T* b = d_b; T* c = d_c; - const T scalar = 0.3; + const T scalar = startScalar; forall(index_set, [=] RAJA_DEVICE (int index) { b[index] = scalar*c[index]; @@ -102,7 +102,7 @@ void RAJAStream::triad() T* a = d_a; T* b = d_b; T* c = d_c; - const T scalar = 0.3; + const T scalar = startScalar; forall(index_set, [=] RAJA_DEVICE (int index) { a[index] = b[index] + scalar*c[index]; From 47128d47c037fd9d575e8112ba3c51146530f040 Mon Sep 17 00:00:00 2001 From: Tom Deakin Date: Mon, 24 Oct 2016 13:37:43 +0100 Subject: [PATCH 08/11] [SYCL] Use global defined scalar value --- SYCLStream.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/SYCLStream.cpp b/SYCLStream.cpp index d4a2fd0..215f161 100644 --- a/SYCLStream.cpp +++ b/SYCLStream.cpp @@ -78,7 +78,7 @@ void SYCLStream::copy() template void SYCLStream::mul() { - const T scalar = 0.3; + const T scalar = startScalar; queue->submit([&](handler &cgh) { auto kb = d_b->template get_access(cgh); @@ -110,7 +110,7 @@ void SYCLStream::add() template void SYCLStream::triad() { - const T scalar = 0.3; + const T scalar = startScalar; queue->submit([&](handler &cgh) { auto ka = d_a->template get_access(cgh); From ce5152fefd0f6a75ad4251771622dd59bd3a8e57 Mon Sep 17 00:00:00 2001 From: Tom Deakin Date: Mon, 24 Oct 2016 13:45:07 +0100 Subject: [PATCH 09/11] [HIP] Use global defined scalar value --- HIPStream.cu | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/HIPStream.cu b/HIPStream.cu index ec34955..34ecfb6 100644 --- a/HIPStream.cu +++ b/HIPStream.cu @@ -118,7 +118,7 @@ void HIPStream::copy() template __global__ void mul_kernel(hipLaunchParm lp, T * b, const T * c) { - const T scalar = 0.3; + const T scalar = startScalar; const int i = hipBlockDim_x * hipBlockIdx_x + hipThreadIdx_x; b[i] = scalar * c[i]; } @@ -151,7 +151,7 @@ void HIPStream::add() template __global__ void triad_kernel(hipLaunchParm lp, T * a, const T * b, const T * c) { - const T scalar = 0.3; + const T scalar = startScalar; const int i = hipBlockDim_x * hipBlockIdx_x + hipThreadIdx_x; a[i] = b[i] + scalar * c[i]; } From 0bed614734effe834d1d3654ebc63345c9fc2458 Mon Sep 17 00:00:00 2001 From: Tom Deakin Date: Mon, 24 Oct 2016 13:51:47 +0100 Subject: [PATCH 10/11] [OpenCL] Use global defined scalar value --- OCLStream.cpp | 12 +++++++++--- OCLStream.h | 1 + 2 files changed, 10 insertions(+), 3 deletions(-) diff --git a/OCLStream.cpp b/OCLStream.cpp index 0ed4b8e..2a1e5ee 100644 --- a/OCLStream.cpp +++ b/OCLStream.cpp @@ -14,7 +14,7 @@ void getDeviceList(void); std::string kernels{R"CLC( - constant TYPE scalar = 0.3; + constant TYPE scalar = startScalar; kernel void copy( global const TYPE * restrict a, @@ -73,14 +73,17 @@ OCLStream::OCLStream(const unsigned int ARRAY_SIZE, const int device_index) // Create program cl::Program program(context, kernels); + std::ostringstream args; + args << "-DstartScalar=" << startScalar << " "; if (sizeof(T) == sizeof(double)) { + args << "-DTYPE=double"; // Check device can do double if (!device.getInfo()) throw std::runtime_error("Device does not support double precision, please use --float"); try { - program.build("-DTYPE=double"); + program.build(args.str().c_str()); } catch (cl::Error& err) { @@ -92,7 +95,10 @@ OCLStream::OCLStream(const unsigned int ARRAY_SIZE, const int device_index) } } else if (sizeof(T) == sizeof(float)) - program.build("-DTYPE=float"); + { + args << "-DTYPE=float"; + program.build(args.str().c_str()); + } // Create kernels copy_kernel = new cl::KernelFunctor(program, "copy"); diff --git a/OCLStream.h b/OCLStream.h index cb48da5..54abaa3 100644 --- a/OCLStream.h +++ b/OCLStream.h @@ -8,6 +8,7 @@ #pragma once #include +#include #include #define CL_HPP_ENABLE_EXCEPTIONS From 963f3abfa085a8d4bce6757013c24919843ecdae Mon Sep 17 00:00:00 2001 From: Tom Deakin Date: Mon, 24 Oct 2016 13:52:03 +0100 Subject: [PATCH 11/11] Version bump --- CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index f80d762..5d01274 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -20,7 +20,7 @@ include(CheckIncludeFileCXX) include(CheckCXXCompilerFlag) set(gpu-stream_VERSION_MAJOR 2) -set(gpu-stream_VERSION_MINOR 0) +set(gpu-stream_VERSION_MINOR 1) configure_file(common.h.in common.h) include_directories(${CMAKE_BINARY_DIR})