Merge branch 'master' into kernel-dot
Conflicts: main.cpp
This commit is contained in:
commit
f32cf3bad3
@ -71,7 +71,7 @@ void ACCStream<T>::copy()
|
|||||||
template <class T>
|
template <class T>
|
||||||
void ACCStream<T>::mul()
|
void ACCStream<T>::mul()
|
||||||
{
|
{
|
||||||
const T scalar = 0.3;
|
const T scalar = startScalar;
|
||||||
|
|
||||||
unsigned int array_size = this->array_size;
|
unsigned int array_size = this->array_size;
|
||||||
T * restrict b = this->b;
|
T * restrict b = this->b;
|
||||||
@ -100,7 +100,7 @@ void ACCStream<T>::add()
|
|||||||
template <class T>
|
template <class T>
|
||||||
void ACCStream<T>::triad()
|
void ACCStream<T>::triad()
|
||||||
{
|
{
|
||||||
const T scalar = 0.3;
|
const T scalar = startScalar;
|
||||||
|
|
||||||
unsigned int array_size = this->array_size;
|
unsigned int array_size = this->array_size;
|
||||||
T * restrict a = this->a;
|
T * restrict a = this->a;
|
||||||
|
|||||||
@ -20,7 +20,7 @@ include(CheckIncludeFileCXX)
|
|||||||
include(CheckCXXCompilerFlag)
|
include(CheckCXXCompilerFlag)
|
||||||
|
|
||||||
set(gpu-stream_VERSION_MAJOR 2)
|
set(gpu-stream_VERSION_MAJOR 2)
|
||||||
set(gpu-stream_VERSION_MINOR 0)
|
set(gpu-stream_VERSION_MINOR 1)
|
||||||
|
|
||||||
configure_file(common.h.in common.h)
|
configure_file(common.h.in common.h)
|
||||||
include_directories(${CMAKE_BINARY_DIR})
|
include_directories(${CMAKE_BINARY_DIR})
|
||||||
|
|||||||
@ -123,7 +123,7 @@ void CUDAStream<T>::copy()
|
|||||||
template <typename T>
|
template <typename T>
|
||||||
__global__ void mul_kernel(T * b, const T * c)
|
__global__ void mul_kernel(T * b, const T * c)
|
||||||
{
|
{
|
||||||
const T scalar = 0.3;
|
const T scalar = startScalar;
|
||||||
const int i = blockDim.x * blockIdx.x + threadIdx.x;
|
const int i = blockDim.x * blockIdx.x + threadIdx.x;
|
||||||
b[i] = scalar * c[i];
|
b[i] = scalar * c[i];
|
||||||
}
|
}
|
||||||
@ -156,7 +156,7 @@ void CUDAStream<T>::add()
|
|||||||
template <typename T>
|
template <typename T>
|
||||||
__global__ void triad_kernel(T * a, const T * b, const T * c)
|
__global__ void triad_kernel(T * a, const T * b, const T * c)
|
||||||
{
|
{
|
||||||
const T scalar = 0.3;
|
const T scalar = startScalar;
|
||||||
const int i = blockDim.x * blockIdx.x + threadIdx.x;
|
const int i = blockDim.x * blockIdx.x + threadIdx.x;
|
||||||
a[i] = b[i] + scalar * c[i];
|
a[i] = b[i] + scalar * c[i];
|
||||||
}
|
}
|
||||||
|
|||||||
@ -118,7 +118,7 @@ void HIPStream<T>::copy()
|
|||||||
template <typename T>
|
template <typename T>
|
||||||
__global__ void mul_kernel(hipLaunchParm lp, T * b, const T * c)
|
__global__ void mul_kernel(hipLaunchParm lp, T * b, const T * c)
|
||||||
{
|
{
|
||||||
const T scalar = 0.3;
|
const T scalar = startScalar;
|
||||||
const int i = hipBlockDim_x * hipBlockIdx_x + hipThreadIdx_x;
|
const int i = hipBlockDim_x * hipBlockIdx_x + hipThreadIdx_x;
|
||||||
b[i] = scalar * c[i];
|
b[i] = scalar * c[i];
|
||||||
}
|
}
|
||||||
@ -151,7 +151,7 @@ void HIPStream<T>::add()
|
|||||||
template <typename T>
|
template <typename T>
|
||||||
__global__ void triad_kernel(hipLaunchParm lp, T * a, const T * b, const T * c)
|
__global__ void triad_kernel(hipLaunchParm lp, T * a, const T * b, const T * c)
|
||||||
{
|
{
|
||||||
const T scalar = 0.3;
|
const T scalar = startScalar;
|
||||||
const int i = hipBlockDim_x * hipBlockIdx_x + hipThreadIdx_x;
|
const int i = hipBlockDim_x * hipBlockIdx_x + hipThreadIdx_x;
|
||||||
a[i] = b[i] + scalar * c[i];
|
a[i] = b[i] + scalar * c[i];
|
||||||
}
|
}
|
||||||
|
|||||||
@ -84,7 +84,7 @@ void KOKKOSStream<T>::mul()
|
|||||||
View<double*, DEVICE> b(*d_b);
|
View<double*, DEVICE> b(*d_b);
|
||||||
View<double*, DEVICE> c(*d_c);
|
View<double*, DEVICE> c(*d_c);
|
||||||
|
|
||||||
const T scalar = 0.3;
|
const T scalar = startScalar;
|
||||||
parallel_for(array_size, KOKKOS_LAMBDA (const int index)
|
parallel_for(array_size, KOKKOS_LAMBDA (const int index)
|
||||||
{
|
{
|
||||||
b[index] = scalar*c[index];
|
b[index] = scalar*c[index];
|
||||||
@ -113,7 +113,7 @@ void KOKKOSStream<T>::triad()
|
|||||||
View<double*, DEVICE> b(*d_b);
|
View<double*, DEVICE> b(*d_b);
|
||||||
View<double*, DEVICE> c(*d_c);
|
View<double*, DEVICE> c(*d_c);
|
||||||
|
|
||||||
const T scalar = 0.3;
|
const T scalar = startScalar;
|
||||||
parallel_for(array_size, KOKKOS_LAMBDA (const int index)
|
parallel_for(array_size, KOKKOS_LAMBDA (const int index)
|
||||||
{
|
{
|
||||||
a[index] = b[index] + scalar*c[index];
|
a[index] = b[index] + scalar*c[index];
|
||||||
|
|||||||
@ -14,7 +14,7 @@ void getDeviceList(void);
|
|||||||
|
|
||||||
std::string kernels{R"CLC(
|
std::string kernels{R"CLC(
|
||||||
|
|
||||||
constant TYPE scalar = 0.3;
|
constant TYPE scalar = startScalar;
|
||||||
|
|
||||||
kernel void copy(
|
kernel void copy(
|
||||||
global const TYPE * restrict a,
|
global const TYPE * restrict a,
|
||||||
@ -96,14 +96,17 @@ OCLStream<T>::OCLStream(const unsigned int ARRAY_SIZE, const int device_index)
|
|||||||
|
|
||||||
// Create program
|
// Create program
|
||||||
cl::Program program(context, kernels);
|
cl::Program program(context, kernels);
|
||||||
|
std::ostringstream args;
|
||||||
|
args << "-DstartScalar=" << startScalar << " ";
|
||||||
if (sizeof(T) == sizeof(double))
|
if (sizeof(T) == sizeof(double))
|
||||||
{
|
{
|
||||||
|
args << "-DTYPE=double";
|
||||||
// Check device can do double
|
// Check device can do double
|
||||||
if (!device.getInfo<CL_DEVICE_DOUBLE_FP_CONFIG>())
|
if (!device.getInfo<CL_DEVICE_DOUBLE_FP_CONFIG>())
|
||||||
throw std::runtime_error("Device does not support double precision, please use --float");
|
throw std::runtime_error("Device does not support double precision, please use --float");
|
||||||
try
|
try
|
||||||
{
|
{
|
||||||
program.build("-DTYPE=double");
|
program.build(args.str().c_str());
|
||||||
}
|
}
|
||||||
catch (cl::Error& err)
|
catch (cl::Error& err)
|
||||||
{
|
{
|
||||||
@ -115,7 +118,10 @@ OCLStream<T>::OCLStream(const unsigned int ARRAY_SIZE, const int device_index)
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
else if (sizeof(T) == sizeof(float))
|
else if (sizeof(T) == sizeof(float))
|
||||||
program.build("-DTYPE=float");
|
{
|
||||||
|
args << "-DTYPE=float";
|
||||||
|
program.build(args.str().c_str());
|
||||||
|
}
|
||||||
|
|
||||||
// Create kernels
|
// Create kernels
|
||||||
copy_kernel = new cl::KernelFunctor<cl::Buffer, cl::Buffer>(program, "copy");
|
copy_kernel = new cl::KernelFunctor<cl::Buffer, cl::Buffer>(program, "copy");
|
||||||
|
|||||||
@ -8,6 +8,7 @@
|
|||||||
#pragma once
|
#pragma once
|
||||||
|
|
||||||
#include <iostream>
|
#include <iostream>
|
||||||
|
#include <sstream>
|
||||||
#include <stdexcept>
|
#include <stdexcept>
|
||||||
|
|
||||||
#define CL_HPP_ENABLE_EXCEPTIONS
|
#define CL_HPP_ENABLE_EXCEPTIONS
|
||||||
|
|||||||
@ -62,7 +62,7 @@ void OMP3Stream<T>::copy()
|
|||||||
template <class T>
|
template <class T>
|
||||||
void OMP3Stream<T>::mul()
|
void OMP3Stream<T>::mul()
|
||||||
{
|
{
|
||||||
const T scalar = 0.3;
|
const T scalar = startScalar;
|
||||||
#pragma omp parallel for
|
#pragma omp parallel for
|
||||||
for (int i = 0; i < array_size; i++)
|
for (int i = 0; i < array_size; i++)
|
||||||
{
|
{
|
||||||
@ -83,7 +83,7 @@ void OMP3Stream<T>::add()
|
|||||||
template <class T>
|
template <class T>
|
||||||
void OMP3Stream<T>::triad()
|
void OMP3Stream<T>::triad()
|
||||||
{
|
{
|
||||||
const T scalar = 0.3;
|
const T scalar = startScalar;
|
||||||
#pragma omp parallel for
|
#pragma omp parallel for
|
||||||
for (int i = 0; i < array_size; i++)
|
for (int i = 0; i < array_size; i++)
|
||||||
{
|
{
|
||||||
|
|||||||
@ -70,7 +70,7 @@ void OMP45Stream<T>::copy()
|
|||||||
template <class T>
|
template <class T>
|
||||||
void OMP45Stream<T>::mul()
|
void OMP45Stream<T>::mul()
|
||||||
{
|
{
|
||||||
const T scalar = 0.3;
|
const T scalar = startScalar;
|
||||||
|
|
||||||
unsigned int array_size = this->array_size;
|
unsigned int array_size = this->array_size;
|
||||||
T *b = this->b;
|
T *b = this->b;
|
||||||
@ -99,7 +99,7 @@ void OMP45Stream<T>::add()
|
|||||||
template <class T>
|
template <class T>
|
||||||
void OMP45Stream<T>::triad()
|
void OMP45Stream<T>::triad()
|
||||||
{
|
{
|
||||||
const T scalar = 0.3;
|
const T scalar = startScalar;
|
||||||
|
|
||||||
unsigned int array_size = this->array_size;
|
unsigned int array_size = this->array_size;
|
||||||
T *a = this->a;
|
T *a = this->a;
|
||||||
|
|||||||
@ -77,7 +77,7 @@ void RAJAStream<T>::mul()
|
|||||||
{
|
{
|
||||||
T* b = d_b;
|
T* b = d_b;
|
||||||
T* c = d_c;
|
T* c = d_c;
|
||||||
const T scalar = 0.3;
|
const T scalar = startScalar;
|
||||||
forall<policy>(index_set, [=] RAJA_DEVICE (int index)
|
forall<policy>(index_set, [=] RAJA_DEVICE (int index)
|
||||||
{
|
{
|
||||||
b[index] = scalar*c[index];
|
b[index] = scalar*c[index];
|
||||||
@ -102,7 +102,7 @@ void RAJAStream<T>::triad()
|
|||||||
T* a = d_a;
|
T* a = d_a;
|
||||||
T* b = d_b;
|
T* b = d_b;
|
||||||
T* c = d_c;
|
T* c = d_c;
|
||||||
const T scalar = 0.3;
|
const T scalar = startScalar;
|
||||||
forall<policy>(index_set, [=] RAJA_DEVICE (int index)
|
forall<policy>(index_set, [=] RAJA_DEVICE (int index)
|
||||||
{
|
{
|
||||||
a[index] = b[index] + scalar*c[index];
|
a[index] = b[index] + scalar*c[index];
|
||||||
|
|||||||
@ -78,7 +78,7 @@ void SYCLStream<T>::copy()
|
|||||||
template <class T>
|
template <class T>
|
||||||
void SYCLStream<T>::mul()
|
void SYCLStream<T>::mul()
|
||||||
{
|
{
|
||||||
const T scalar = 0.3;
|
const T scalar = startScalar;
|
||||||
queue->submit([&](handler &cgh)
|
queue->submit([&](handler &cgh)
|
||||||
{
|
{
|
||||||
auto kb = d_b->template get_access<access::mode::write>(cgh);
|
auto kb = d_b->template get_access<access::mode::write>(cgh);
|
||||||
@ -110,7 +110,7 @@ void SYCLStream<T>::add()
|
|||||||
template <class T>
|
template <class T>
|
||||||
void SYCLStream<T>::triad()
|
void SYCLStream<T>::triad()
|
||||||
{
|
{
|
||||||
const T scalar = 0.3;
|
const T scalar = startScalar;
|
||||||
queue->submit([&](handler &cgh)
|
queue->submit([&](handler &cgh)
|
||||||
{
|
{
|
||||||
auto ka = d_a->template get_access<access::mode::write>(cgh);
|
auto ka = d_a->template get_access<access::mode::write>(cgh);
|
||||||
|
|||||||
7
Stream.h
7
Stream.h
@ -10,6 +10,12 @@
|
|||||||
#include <vector>
|
#include <vector>
|
||||||
#include <string>
|
#include <string>
|
||||||
|
|
||||||
|
// Array values
|
||||||
|
#define startA (0.1)
|
||||||
|
#define startB (0.2)
|
||||||
|
#define startC (0.0)
|
||||||
|
#define startScalar (0.4)
|
||||||
|
|
||||||
template <class T>
|
template <class T>
|
||||||
class Stream
|
class Stream
|
||||||
{
|
{
|
||||||
@ -36,3 +42,4 @@ class Stream
|
|||||||
void listDevices(void);
|
void listDevices(void);
|
||||||
std::string getDeviceName(const int);
|
std::string getDeviceName(const int);
|
||||||
std::string getDeviceDriver(const int);
|
std::string getDeviceDriver(const int);
|
||||||
|
|
||||||
|
|||||||
15
main.cpp
15
main.cpp
@ -44,7 +44,6 @@ unsigned int num_times = 100;
|
|||||||
unsigned int deviceIndex = 0;
|
unsigned int deviceIndex = 0;
|
||||||
bool use_float = false;
|
bool use_float = false;
|
||||||
|
|
||||||
|
|
||||||
template <typename T>
|
template <typename T>
|
||||||
void check_solution(const unsigned int ntimes, std::vector<T>& a, std::vector<T>& b, std::vector<T>& c, T& sum);
|
void check_solution(const unsigned int ntimes, std::vector<T>& a, std::vector<T>& b, std::vector<T>& c, T& sum);
|
||||||
|
|
||||||
@ -85,9 +84,9 @@ void run()
|
|||||||
std::cout << "Precision: double" << std::endl;
|
std::cout << "Precision: double" << std::endl;
|
||||||
|
|
||||||
// Create host vectors
|
// Create host vectors
|
||||||
std::vector<T> a(ARRAY_SIZE, 0.1);
|
std::vector<T> a(ARRAY_SIZE, startA);
|
||||||
std::vector<T> b(ARRAY_SIZE, 0.2);
|
std::vector<T> b(ARRAY_SIZE, startB);
|
||||||
std::vector<T> c(ARRAY_SIZE, 0.0);
|
std::vector<T> c(ARRAY_SIZE, startC);
|
||||||
std::streamsize ss = std::cout.precision();
|
std::streamsize ss = std::cout.precision();
|
||||||
std::cout << std::setprecision(1) << std::fixed
|
std::cout << std::setprecision(1) << std::fixed
|
||||||
<< "Array size: " << ARRAY_SIZE*sizeof(T)*1.0E-6 << " MB"
|
<< "Array size: " << ARRAY_SIZE*sizeof(T)*1.0E-6 << " MB"
|
||||||
@ -232,12 +231,12 @@ template <typename T>
|
|||||||
void check_solution(const unsigned int ntimes, std::vector<T>& a, std::vector<T>& b, std::vector<T>& c, T& sum)
|
void check_solution(const unsigned int ntimes, std::vector<T>& a, std::vector<T>& b, std::vector<T>& c, T& sum)
|
||||||
{
|
{
|
||||||
// Generate correct solution
|
// Generate correct solution
|
||||||
T goldA = 0.1;
|
T goldA = startA;
|
||||||
T goldB = 0.2;
|
T goldB = startB;
|
||||||
T goldC = 0.0;
|
T goldC = startC;
|
||||||
T goldSum = 0.0;
|
T goldSum = 0.0;
|
||||||
|
|
||||||
const T scalar = 0.3;
|
const T scalar = startScalar;
|
||||||
|
|
||||||
for (unsigned int i = 0; i < ntimes; i++)
|
for (unsigned int i = 0; i < ntimes; i++)
|
||||||
{
|
{
|
||||||
|
|||||||
Loading…
Reference in New Issue
Block a user