Merging in changes from trunk

This commit is contained in:
Matthew Martineau 2016-05-05 17:23:47 +01:00
commit 0a738efa54
8 changed files with 541 additions and 3 deletions

136
ACCStream.cpp Normal file
View File

@ -0,0 +1,136 @@
#include "ACCStream.h"
template <class T>
ACCStream<T>::ACCStream(const unsigned int ARRAY_SIZE, T *a, T *b, T *c, int device)
{
acc_set_device_num(device, acc_device_nvidia);
array_size = ARRAY_SIZE;
// Set up data region on device
this->a = a;
this->b = b;
this->c = c;
#pragma acc enter data create(a[0:array_size], b[0:array_size], c[0:array_size])
{}
}
template <class T>
ACCStream<T>::~ACCStream()
{
// End data region on device
unsigned int array_size = this->array_size;
T *a = this->a;
T *b = this->b;
T *c = this->c;
#pragma acc exit data delete(a[0:array_size], b[0:array_size], c[0:array_size])
{}
}
template <class T>
void ACCStream<T>::write_arrays(const std::vector<T>& h_a, const std::vector<T>& h_b, const std::vector<T>& h_c)
{
T *a = this->a;
T *b = this->b;
T *c = this->c;
#pragma acc update device(a[0:array_size], b[0:array_size], c[0:array_size])
{}
}
template <class T>
void ACCStream<T>::read_arrays(std::vector<T>& h_a, std::vector<T>& h_b, std::vector<T>& h_c)
{
T *a = this->a;
T *b = this->b;
T *c = this->c;
#pragma acc update host(a[0:array_size], b[0:array_size], c[0:array_size])
{}
}
template <class T>
void ACCStream<T>::copy()
{
unsigned int array_size = this->array_size;
T *a = this->a;
T *c = this->c;
#pragma acc kernels present(a[0:array_size], c[0:array_size]) wait
for (int i = 0; i < array_size; i++)
{
c[i] = a[i];
}
}
template <class T>
void ACCStream<T>::mul()
{
const T scalar = 3.0;
unsigned int array_size = this->array_size;
T *b = this->b;
T *c = this->c;
#pragma acc kernels present(b[0:array_size], c[0:array_size]) wait
for (int i = 0; i < array_size; i++)
{
b[i] = scalar * c[i];
}
}
template <class T>
void ACCStream<T>::add()
{
unsigned int array_size = this->array_size;
T *a = this->a;
T *b = this->b;
T *c = this->c;
#pragma acc kernels present(a[0:array_size], b[0:array_size], c[0:array_size]) wait
for (int i = 0; i < array_size; i++)
{
c[i] = a[i] + b[i];
}
}
template <class T>
void ACCStream<T>::triad()
{
const T scalar = 3.0;
unsigned int array_size = this->array_size;
T *a = this->a;
T *b = this->b;
T *c = this->c;
#pragma acc kernels present(a[0:array_size], b[0:array_size], c[0:array_size]) wait
for (int i = 0; i < array_size; i++)
{
a[i] = b[i] + scalar * c[i];
}
}
void listDevices(void)
{
// Get number of devices
int count = acc_get_num_devices(acc_device_nvidia);
// Print device list
if (count == 0)
{
std::cerr << "No devices found." << std::endl;
}
else
{
std::cout << "There are " << count << " devices." << std::endl;
}
}
std::string getDeviceName(const int)
{
return std::string("Device name unavailable");
}
std::string getDeviceDriver(const int)
{
return std::string("Device driver unavailable");
}
template class ACCStream<float>;
template class ACCStream<double>;

39
ACCStream.h Normal file
View File

@ -0,0 +1,39 @@
#pragma once
#include <iostream>
#include <stdexcept>
#include "Stream.h"
#include <openacc.h>
#define IMPLEMENTATION_STRING "OpenACC"
template <class T>
class ACCStream : public Stream<T>
{
protected:
// Size of arrays
unsigned int array_size;
// Device side pointers
T *a;
T *b;
T *c;
public:
ACCStream(const unsigned int, T*, T*, T*, int);
~ACCStream();
virtual void copy() override;
virtual void add() override;
virtual void mul() override;
virtual void triad() override;
virtual void write_arrays(const std::vector<T>& a, const std::vector<T>& b, const std::vector<T>& c) override;
virtual void read_arrays(std::vector<T>& a, std::vector<T>& b, std::vector<T>& c) override;
};

View File

@ -9,12 +9,11 @@ project(gpu-stream)
set(gpu-stream_VERSION_MAJOR 2)
set(gpu-stream_VERSION_MINOR 0)
list(APPEND CMAKE_CXX_FLAGS --std=c++11)
configure_file(common.h.in common.h)
find_package(CUDA 7.0 QUIET)
if (${CUDA_FOUND})
list(APPEND CUDA_NVCC_FLAGS --std=c++11)
cuda_add_executable(gpu-stream-cuda main.cpp CUDAStream.cu)
target_compile_definitions(gpu-stream-cuda PUBLIC CUDA)
else (${CUDA_FOUND})
@ -30,3 +29,20 @@ else (${OpenCL_FOUND})
message("Skipping OpenCL...")
endif (${OpenCL_FOUND})
# TODO: Find OpenACC implementations somehow
if (true)
add_executable(gpu-stream-acc main.cpp ACCStream.cpp)
target_compile_definitions(gpu-stream-acc PUBLIC ACC)
target_compile_options(gpu-stream-acc PUBLIC "-hstd=c++11")
else ()
message("Skipping OpenACC...")
endif ()
# TODO: Find SYCL implementations somehow
if (true)
add_executable(gpu-stream-sycl main.cpp SYCLStream.cpp)
target_compile_definitions(gpu-stream-sycl PUBLIC SYCL)
set_property(TARGET gpu-stream-sycl PROPERTY CXX_STANDARD 14)
else ()
message("Skipping SYCL...")
endif ()

106
OMP3Stream.cpp Normal file
View File

@ -0,0 +1,106 @@
#include "OMP3Stream.h"
template <class T>
OMP3Stream<T>::OMP3Stream(const unsigned int ARRAY_SIZE, T *a, T *b, T *c)
{
array_size = ARRAY_SIZE;
this->a = (T*)malloc(sizeof(T)*array_size);
this->b = (T*)malloc(sizeof(T)*array_size);
this->c = (T*)malloc(sizeof(T)*array_size);
}
template <class T>
OMP3Stream<T>::~OMP3Stream()
{
free(a);
free(b);
free(c);
}
template <class T>
void OMP3Stream<T>::write_arrays(const std::vector<T>& h_a, const std::vector<T>& h_b, const std::vector<T>& h_c)
{
#pragma omp parallel for
for (int i = 0; i < array_size; i++)
{
a[i] = h_a[i];
b[i] = h_b[i];
c[i] = h_c[i];
}
}
template <class T>
void OMP3Stream<T>::read_arrays(std::vector<T>& h_a, std::vector<T>& h_b, std::vector<T>& h_c)
{
#pragma omp parallel for
for (int i = 0; i < array_size; i++)
{
h_a[i] = a[i];
h_b[i] = b[i];
h_c[i] = c[i];
}
}
template <class T>
void OMP3Stream<T>::copy()
{
#pragma omp parallel for
for (int i = 0; i < array_size; i++)
{
c[i] = a[i];
}
}
template <class T>
void OMP3Stream<T>::mul()
{
const T scalar = 3.0;
#pragma omp parallel for
for (int i = 0; i < array_size; i++)
{
b[i] = scalar * c[i];
}
}
template <class T>
void OMP3Stream<T>::add()
{
#pragma omp parallel for
for (int i = 0; i < array_size; i++)
{
c[i] = a[i] + b[i];
}
}
template <class T>
void OMP3Stream<T>::triad()
{
const T scalar = 3.0;
#pragma omp parallel for
for (int i = 0; i < array_size; i++)
{
a[i] = b[i] + scalar * c[i];
}
}
void listDevices(void)
{
std::cout << "0: CPU" << std::endl;
}
std::string getDeviceName(const int)
{
return std::string("Device name unavailable");
}
std::string getDeviceDriver(const int)
{
return std::string("Device driver unavailable");
}
template class OMP3Stream<float>;
template class OMP3Stream<double>;

35
OMP3Stream.h Normal file
View File

@ -0,0 +1,35 @@
#pragma once
#include <iostream>
#include <stdexcept>
#include "Stream.h"
#define IMPLEMENTATION_STRING "Reference OpenMP"
template <class T>
class OMP3Stream : public Stream<T>
{
protected:
// Size of arrays
unsigned int array_size;
// Device side pointers
T *a;
T *b;
T *c;
public:
OMP3Stream(const unsigned int, T*, T*, T*);
~OMP3Stream();
virtual void copy() override;
virtual void add() override;
virtual void mul() override;
virtual void triad() override;
virtual void write_arrays(const std::vector<T>& a, const std::vector<T>& b, const std::vector<T>& c) override;
virtual void read_arrays(std::vector<T>& a, std::vector<T>& b, std::vector<T>& c) override;
};

144
SYCLStream.cpp Normal file
View File

@ -0,0 +1,144 @@
// Copyright (c) 2015-16 Tom Deakin, Simon McIntosh-Smith,
// University of Bristol HPC
//
// For full license terms please see the LICENSE file distributed with this
// source code
#include "SYCLStream.h"
#include <iostream>
using namespace cl::sycl;
template <class T>
SYCLStream<T>::SYCLStream(const unsigned int ARRAY_SIZE, const int device_index)
{
array_size = ARRAY_SIZE;
// Create buffers
d_a = buffer<T>(array_size);
d_b = buffer<T>(array_size);
d_c = buffer<T>(array_size);
}
template <class T>
SYCLStream<T>::~SYCLStream()
{
}
template <class T>
void SYCLStream<T>::copy()
{
queue.submit([&](handler &cgh)
{
auto ka = d_a.template get_access<access::read>(cgh);
auto kc = d_c.template get_access<access::write>(cgh);
cgh.parallel_for(range<1>{array_size}, [=](id<1> index)
{
kc[index] = ka[index];
});
});
queue.wait();
}
template <class T>
void SYCLStream<T>::mul()
{
const T scalar = 3.0;
queue.submit([&](handler &cgh)
{
auto kb = d_b.template get_access<access::write>(cgh);
auto kc = d_c.template get_access<access::read>(cgh);
cgh.parallel_for(range<1>{array_size}, [=](id<1> index)
{
kb[index] = scalar * kc[index];
});
});
queue.wait();
}
template <class T>
void SYCLStream<T>::add()
{
queue.submit([&](handler &cgh)
{
auto ka = d_a.template get_access<access::read>(cgh);
auto kb = d_b.template get_access<access::read>(cgh);
auto kc = d_c.template get_access<access::write>(cgh);
cgh.parallel_for(range<1>{array_size}, [=](id<1> index)
{
kc[index] = ka[index] + kb[index];
});
});
queue.wait();
}
template <class T>
void SYCLStream<T>::triad()
{
const T scalar = 3.0;
queue.submit([&](handler &cgh)
{
auto ka = d_a.template get_access<access::write>(cgh);
auto kb = d_b.template get_access<access::read>(cgh);
auto kc = d_c.template get_access<access::read>(cgh);
cgh.parallel_for(range<1>{array_size}, [=](id<1> index){
ka[index] = kb[index] + scalar * kc[index];
});
});
queue.wait();
}
template <class T>
void SYCLStream<T>::write_arrays(const std::vector<T>& a, const std::vector<T>& b, const std::vector<T>& c)
{
auto _a = d_a.template get_access<access::write>();
auto _b = d_b.template get_access<access::write>();
auto _c = d_c.template get_access<access::write>();
for (int i = 0; i < array_size; i++)
{
_a[i] = a[i];
_b[i] = b[i];
_c[i] = c[i];
}
}
template <class T>
void SYCLStream<T>::read_arrays(std::vector<T>& a, std::vector<T>& b, std::vector<T>& c)
{
auto _a = d_a.template get_access<access::read>();
auto _b = d_b.template get_access<access::read>();
auto _c = d_c.template get_access<access::read>();
for (int i = 0; i < array_size; i++)
{
a[i] = _a[i];
b[i] = _b[i];
c[i] = _c[i];
}
}
void listDevices(void)
{
// TODO: Get actual list of devices
std::cout << std::endl;
std::cout << "Devices:" << std::endl;
std::cout << "0: " << "triSYCL" << std::endl;
std::cout << std::endl;
}
std::string getDeviceName(const int device)
{
// TODO: Implement properly
return "triSYCL";
}
std::string getDeviceDriver(const int device)
{
// TODO: Implement properly
return "triSCYL";
}
template class SYCLStream<float>;
template class SYCLStream<double>;

45
SYCLStream.h Normal file
View File

@ -0,0 +1,45 @@
// Copyright (c) 2015-16 Tom Deakin, Simon McIntosh-Smith,
// University of Bristol HPC
//
// For full license terms please see the LICENSE file distributed with this
// source code
#pragma once
#include "Stream.h"
#include "CL/sycl.hpp"
#define IMPLEMENTATION_STRING "SYCL"
template <class T>
class SYCLStream : public Stream<T>
{
protected:
// Size of arrays
unsigned int array_size;
// SYCL objects
cl::sycl::queue queue;
cl::sycl::buffer<T> d_a;
cl::sycl::buffer<T> d_b;
cl::sycl::buffer<T> d_c;
public:
SYCLStream(const unsigned int, const int);
~SYCLStream();
virtual void copy() override;
virtual void add() override;
virtual void mul() override;
virtual void triad() override;
virtual void write_arrays(const std::vector<T>& a, const std::vector<T>& b, const std::vector<T>& c) override;
virtual void read_arrays(std::vector<T>& a, std::vector<T>& b, std::vector<T>& c) override;
};
// Populate the devices list
void getDeviceList(void);

View File

@ -26,6 +26,12 @@
#include "RAJAStream.hpp"
#elif defined(KOKKOS)
#include "KOKKOSStream.hpp"
#elif defined(ACC)
#include "ACCStream.h"
#elif defined(SYCL)
#include "SYCLStream.h"
#elif defined(OMP3)
#include "OMP3Stream.h"
#endif
unsigned int ARRAY_SIZE = 52428800;
@ -91,6 +97,18 @@ void run()
// Use the Kokkos implementation
stream = new KOKKOSStream<T>(ARRAY_SIZE, deviceIndex);
#elif defined(ACC)
// Use the OpenACC implementation
stream = new ACCStream<T>(ARRAY_SIZE, a.data(), b.data(), c.data(), deviceIndex);
#elif defined(SYCL)
// Use the SYCL implementation
stream = new SYCLStream<T>(ARRAY_SIZE, deviceIndex);
#elif defined(OMP3)
// Use the "reference" OpenMP 3 implementation
stream = new OMP3Stream<T>(ARRAY_SIZE, a.data(), b.data(), c.data());
#endif
stream->write_arrays(a, b, c);
@ -289,4 +307,3 @@ void parseArguments(int argc, char *argv[])
}
}
}