diff --git a/HC.make b/HC.make
new file mode 100644
index 0000000..a79acf4
--- /dev/null
+++ b/HC.make
@@ -0,0 +1,24 @@
+
+# Compiler used to build the HC implementation
+HCC = hcc
+
+CXXFLAGS+=-O3 $(shell hcc-config --cxxflags)
+LDFLAGS+=$(shell hcc-config --ldflags)
+
+# Optional override of VIRTUALTILESIZE (the tile size used in HCStream.cpp)
+ifdef TBSIZE
+CXXFLAGS+=-DVIRTUALTILESIZE=$(TBSIZE)
+endif
+
+# Optional override of NTILES (the number of tiles used in HCStream.cpp)
+ifdef NTILES
+CXXFLAGS+=-DNTILES=$(NTILES)
+endif
+
+
+hc-stream: main.cpp HCStream.cpp
+	$(HCC) $(CXXFLAGS) -DHC $^ $(LDFLAGS) $(EXTRA_FLAGS) -o $@
+
+.PHONY: clean
+clean:
+	rm -f hc-stream
diff --git a/HCStream.cpp b/HCStream.cpp
new file mode 100644
index 0000000..b1b4a9b
--- /dev/null
+++ b/HCStream.cpp
@@ -0,0 +1,289 @@
+// Copyright (c) 2017 Peter Steinbach, MPI CBG Scientific Computing Facility
+//
+// For full license terms please see the LICENSE file distributed with this
+// source code
+
+#include "HCStream.h"
+
+#include <codecvt>
+#include <vector>
+#include <locale>
+#include <numeric>
+
+// Tile size used by the dot() reduction; the array size must be a multiple of it.
+//specific sizes were obtained through experimentation using a Fiji R9 Nano with rocm 1.6-115
+#ifndef VIRTUALTILESIZE
+#define VIRTUALTILESIZE 256
+#endif
+
+// Number of tiles launched by the dot() reduction.
+//specific sizes were obtained through experimentation using a Fiji R9 Nano with rocm 1.6-115
+#ifndef NTILES
+#define NTILES 2048
+#endif
+
+
+// Returns the accelerator's description converted to a byte string.
+std::string getDeviceName(const hc::accelerator& _acc)
+{
+  std::wstring_convert<std::codecvt_utf8<wchar_t>, wchar_t> converter;
+  std::string value = converter.to_bytes(_acc.get_description());
+  return value;
+}
+
+// Prints the index and name of every accelerator returned by hc::accelerator::get_all().
+void listDevices(void)
+{
+  // Get number of devices
+  std::vector<hc::accelerator> accs = hc::accelerator::get_all();
+
+  // Print device names
+  if (accs.empty())
+  {
+    std::cerr << "No devices found." << std::endl;
+  }
+  else
+  {
+    std::cout << std::endl;
+    std::cout << "Devices:" << std::endl;
+    // size() is unsigned; an unsigned index avoids a mixed-sign comparison
+    for (std::size_t i = 0; i < accs.size(); i++)
+    {
+      std::cout << i << ": " << getDeviceName(accs[i]) << std::endl;
+    }
+    std::cout << std::endl;
+  }
+}
+
+
+// Allocates the three device arrays with ARRAY_SIZE elements and makes the
+// accelerator at device_index the default. Throws std::runtime_error if ARRAY_SIZE is not a multiple of VIRTUALTILESIZE.
+template <class T>
+HCStream<T>::HCStream(const unsigned int ARRAY_SIZE, const int device_index):
+  array_size(ARRAY_SIZE),
+  d_a(ARRAY_SIZE),
+  d_b(ARRAY_SIZE),
+  d_c(ARRAY_SIZE)
+{
+
+  // The array size must be divisible by VIRTUALTILESIZE for kernel launches
+  if (ARRAY_SIZE % VIRTUALTILESIZE != 0)
+  {
+    std::stringstream ss;
+    ss << "Array size must be a multiple of " << VIRTUALTILESIZE;
+    throw std::runtime_error(ss.str());
+  }
+
+  // Set device
+  std::vector<hc::accelerator> accs = hc::accelerator::get_all();
+  auto current = accs.at(device_index);
+
+  hc::accelerator::set_default(current.get_device_path());
+
+  std::cout << "Using HC device " << getDeviceName(current) << std::endl;
+
+}
+
+
+template <class T>
+HCStream<T>::~HCStream()
+{
+}
+
+// Fills d_a, d_b and d_c with _a, _b and _c, then waits for the three kernels to finish.
+template <class T>
+void HCStream<T>::init_arrays(T _a, T _b, T _c)
+{
+  hc::array_view<T,1> view_a(this->d_a);
+  hc::array_view<T,1> view_b(this->d_b);
+  hc::array_view<T,1> view_c(this->d_c);
+
+  hc::completion_future future_a= hc::parallel_for_each(hc::extent<1>(array_size)
+                                                        , [=](hc::index<1> i) [[hc]] {
+                                                          view_a[i] = _a;
+                                                        });
+
+  hc::completion_future future_b= hc::parallel_for_each(hc::extent<1>(array_size)
+                                                        , [=](hc::index<1> i) [[hc]] {
+                                                          view_b[i] = _b;
+                                                        });
+  hc::completion_future future_c= hc::parallel_for_each(hc::extent<1>(array_size)
+                                                        , [=](hc::index<1> i) [[hc]] {
+                                                          view_c[i] = _c;
+                                                        });
+  try{
+    future_a.wait();
+    future_b.wait();
+    future_c.wait();
+  }
+  catch(const std::exception& e){
+    std::cerr << __FILE__ << ":" << __LINE__ << "\t HCStream<T>::init_arrays " << e.what() << std::endl;
+    throw;
+  }
+
+}
+
+// Copies the device arrays d_a, d_b and d_c into the host vectors a, b and c.
+template <class T>
+void HCStream<T>::read_arrays(std::vector<T>& a, std::vector<T>& b, std::vector<T>& c)
+{
+  hc::copy(d_a,a.begin());
+  hc::copy(d_b,b.begin());
+  hc::copy(d_c,c.begin());
+}
+
+
+// c[i] = a[i]
+template <class T>
+void HCStream<T>::copy()
+{
+
+  hc::array_view<T,1> view_a = this->d_a;
+  hc::array_view<T,1> view_c = this->d_c;
+
+  try{
+    hc::completion_future future_kernel = hc::parallel_for_each(hc::extent<1>(array_size)
+                                                                , [=](hc::index<1> index) [[hc]] {
+                                                                  view_c[index] = view_a[index];
+                                                                });
+    future_kernel.wait();
+  }
+  catch(const std::exception& e){
+    std::cerr << __FILE__ << ":" << __LINE__ << "\t HCStream<T>::copy " << e.what() << std::endl;
+    throw;
+  }
+}
+
+// b[i] = scalar * c[i]
+template <class T>
+void HCStream<T>::mul()
+{
+
+  const T scalar = startScalar;
+  hc::array_view<T,1> view_b = this->d_b;
+  hc::array_view<T,1> view_c = this->d_c;
+
+  try{
+    hc::completion_future future_kernel = hc::parallel_for_each(hc::extent<1>(array_size)
+                                                                , [=](hc::index<1> i) [[hc]] {
+                                                                  view_b[i] = scalar*view_c[i];
+                                                                });
+    future_kernel.wait();
+  }
+  catch(const std::exception& e){
+    std::cerr << __FILE__ << ":" << __LINE__ << "\t HCStream<T>::mul " << e.what() << std::endl;
+    throw;
+  }
+}
+
+// c[i] = a[i] + b[i]
+template <class T>
+void HCStream<T>::add()
+{
+
+
+  hc::array_view<T,1> view_a(this->d_a);
+  hc::array_view<T,1> view_b(this->d_b);
+  hc::array_view<T,1> view_c(this->d_c);
+
+  try{
+    hc::completion_future future_kernel = hc::parallel_for_each(hc::extent<1>(array_size)
+                                                                , [=](hc::index<1> i) [[hc]] {
+                                                                  view_c[i] = view_a[i]+view_b[i];
+                                                                });
+    future_kernel.wait();
+  }
+  catch(const std::exception& e){
+    std::cerr << __FILE__ << ":" << __LINE__ << "\t HCStream<T>::add " << e.what() << std::endl;
+    throw;
+  }
+}
+
+// a[i] = b[i] + scalar * c[i]
+template <class T>
+void HCStream<T>::triad()
+{
+
+  const T scalar = startScalar;
+  hc::array_view<T,1> view_a(this->d_a);
+  hc::array_view<T,1> view_b(this->d_b);
+  hc::array_view<T,1> view_c(this->d_c);
+
+  try{
+    hc::completion_future future_kernel = hc::parallel_for_each(hc::extent<1>(array_size)
+                                                                , [=](hc::index<1> i) [[hc]] {
+                                                                  view_a[i] = view_b[i] + scalar*view_c[i];
+                                                                });
+    future_kernel.wait();
+  }
+  catch(const std::exception& e){
+    std::cerr << __FILE__ << ":" << __LINE__ << "\t HCStream<T>::triad " << e.what() << std::endl;
+    throw;
+  }
+}
+
+// Returns the sum of d_a[i] * d_b[i]: each tile writes one partial sum, and the NTILES partial sums are added on the host.
+template <class T>
+T HCStream<T>::dot()
+{
+  //implementation adapted from
+  //https://ampbook.codeplex.com/SourceControl/latest
+  // ->Samples/CaseStudies/Reduction
+  // ->CascadingReduction.h
+
+  const auto& view_a = this->d_a;
+  const auto& view_b = this->d_b;
+
+  const auto tiled_ex = hc::extent<1>(NTILES * VIRTUALTILESIZE).tile(VIRTUALTILESIZE);
+  const auto domain_sz = tiled_ex.size();
+
+  hc::array<T, 1> partial(NTILES);
+
+  hc::parallel_for_each(tiled_ex,
+                        [=,
+                         &view_a,
+                         &view_b,
+                         &partial](const hc::tiled_index<1>& tidx) [[hc]] {
+
+    auto gidx = tidx.global[0];
+    T r = T{0}; // Assumes reduction op is addition.
+    while (gidx < view_a.get_extent().size()) {
+      r += view_a[gidx] * view_b[gidx];
+      gidx += domain_sz;
+    }
+
+    tile_static T tileData[VIRTUALTILESIZE];
+    tileData[tidx.local[0]] = r;
+
+    tidx.barrier.wait_with_tile_static_memory_fence();
+
+    for (auto h = VIRTUALTILESIZE / 2; h; h /= 2) {
+      if (tidx.local[0] < h) {
+        tileData[tidx.local[0]] += tileData[tidx.local[0] + h];
+      }
+      tidx.barrier.wait_with_tile_static_memory_fence();
+    }
+
+    if (tidx.global == tidx.tile_origin) partial[tidx.tile] = tileData[0];
+  });
+
+  try {
+    partial.get_accelerator_view().wait();
+  }
+  catch (const std::exception& e) {
+    std::cerr << __FILE__ << ":" << __LINE__ << "\t HCStream<T>::dot " << e.what() << std::endl;
+    throw;
+  }
+
+  std::vector<T> h_partial(NTILES,0);
+  hc::copy(partial,h_partial.begin());
+
+  T result = std::accumulate(h_partial.begin(), h_partial.end(), 0.);
+
+  return result;
+
+
+}
+
+template class HCStream<float>;
+template class HCStream<double>;
diff --git a/HCStream.h b/HCStream.h
new file mode 100644
index 0000000..4bc2b18
--- /dev/null
+++ b/HCStream.h
@@ -0,0 +1,49 @@
+
+// Copyright (c) 2015-16 Tom Deakin, Simon McIntosh-Smith,
+// University of Bristol HPC
+//
+// For full license terms please see the LICENSE file distributed with this
+// source code
+
+#pragma once
+
+#include <iostream>
+#include <stdexcept>
+#include <sstream>
+
+#include "Stream.h"
+#include "hc.hpp"
+
+#define IMPLEMENTATION_STRING "HC"
+
+// Stream implementation whose arrays are held in hc::array device storage.
+template <class T>
+class HCStream : public Stream<T>
+{
+protected:
+  // Size of arrays
+  unsigned int array_size;
+  // Device side pointers to arrays
+  hc::array<T,1> d_a;
+  hc::array<T,1> d_b;
+  hc::array<T,1> d_c;
+
+
+public:
+
+  // Arguments: array size (must be a multiple of VIRTUALTILESIZE) and device index
+  HCStream(const unsigned int, const int);
+  ~HCStream();
+
+  virtual void copy() override;
+  virtual void add() override;
+  virtual void mul() override;
+  virtual void triad() override;
+  virtual T dot() override;
+  // NOTE(review): declared here, but no definition is visible in HCStream.cpp - confirm it is needed
+  T dot_impl();
+
+  virtual void init_arrays(T initA, T initB, T initC) override;
+  virtual void read_arrays(std::vector<T>& a, std::vector<T>& b, std::vector<T>& c) override;
+
+};
diff --git a/main.cpp b/main.cpp
index 33cef1e..8a2108c 100644
--- a/main.cpp
+++ b/main.cpp
@@ -23,6 +23,8 @@
 #include "CUDAStream.h"
 #elif defined(HIP)
 #include "HIPStream.h"
+#elif defined(HC)
+#include "HCStream.h"
 #elif defined(OCL)
 #include "OCLStream.h"
 #elif defined(USE_RAJA)
@@ -105,6 +107,10 @@ void run()
   // Use the HIP implementation
   stream = new HIPStream<T>(ARRAY_SIZE, deviceIndex);
 
+#elif defined(HC)
+  // Use the HC implementation
+  stream = new HCStream<T>(ARRAY_SIZE, deviceIndex);
+
 #elif defined(OCL)
   // Use the OpenCL implementation
   stream = new OCLStream<T>(ARRAY_SIZE, deviceIndex);