// Copyright (c) 2017 Peter Steinbach, MPI CBG Scientific Computing Facility
//
// For full license terms please see the LICENSE file distributed with this
// source code

#include "HCStream.h"

#include <codecvt>
#include <locale>
#include <numeric>
#include <sstream>

// Specific sizes were obtained through experimentation using a Fiji R9 Nano with rocm 1.6-115
#ifndef VIRTUALTILESIZE
#define VIRTUALTILESIZE 256
#endif

// Specific sizes were obtained through experimentation using a Fiji R9 Nano with rocm 1.6-115
#ifndef NTILES
#define NTILES 2048
#endif

std::string getDeviceName(const hc::accelerator& _acc)
{
  std::wstring_convert<std::codecvt_utf8<wchar_t>, wchar_t> converter;
  std::string value = converter.to_bytes(_acc.get_description());
  return value;
}

void listDevices(void)
{
  // Get number of devices
  std::vector<hc::accelerator> accs = hc::accelerator::get_all();

  // Print device names
  if (accs.empty())
  {
    std::cerr << "No devices found." << std::endl;
  }
  else
  {
    std::cout << std::endl;
    std::cout << "Devices:" << std::endl;
    for (int i = 0; i < accs.size(); i++)
    {
      std::cout << i << ": " << getDeviceName(accs[i]) << std::endl;
    }
    std::cout << std::endl;
  }
}

template <class T>
HCStream<T>::HCStream(const int ARRAY_SIZE, const int device_index):
  array_size(ARRAY_SIZE),
  d_a(ARRAY_SIZE),
  d_b(ARRAY_SIZE),
  d_c(ARRAY_SIZE)
{
  // The array size must be divisible by VIRTUALTILESIZE for kernel launches
  if (ARRAY_SIZE % VIRTUALTILESIZE != 0)
  {
    std::stringstream ss;
    ss << "Array size must be a multiple of " << VIRTUALTILESIZE;
    throw std::runtime_error(ss.str());
  }

  // Set device
  std::vector<hc::accelerator> accs = hc::accelerator::get_all();
  auto current = accs.at(device_index);

  hc::accelerator::set_default(current.get_device_path());

  std::cout << "Using HC device " << getDeviceName(current) << std::endl;
}

template <class T>
HCStream<T>::~HCStream()
{
}

template <class T>
void HCStream<T>::init_arrays(T _a, T _b, T _c)
{
  hc::array_view<T,1> view_a(this->d_a);
  hc::array_view<T,1> view_b(this->d_b);
  hc::array_view<T,1> view_c(this->d_c);

  hc::completion_future future_a = hc::parallel_for_each(hc::extent<1>(array_size)
                                   , [=](hc::index<1> i) [[hc]] {
                                     view_a[i] = _a;
                                   });
  hc::completion_future future_b = hc::parallel_for_each(hc::extent<1>(array_size)
                                   , [=](hc::index<1> i) [[hc]] {
                                     view_b[i] = _b;
                                   });
  hc::completion_future future_c = hc::parallel_for_each(hc::extent<1>(array_size)
                                   , [=](hc::index<1> i) [[hc]] {
                                     view_c[i] = _c;
                                   });

  try{
    future_a.wait();
    future_b.wait();
    future_c.wait();
  }
  catch(std::exception& e){
    std::cerr << __FILE__ << ":" << __LINE__ << "\t HCStream<T>::init_arrays " << e.what() << std::endl;
    throw;
  }
}

template <class T>
void HCStream<T>::read_arrays(std::vector<T>& a, std::vector<T>& b, std::vector<T>& c)
{
  hc::copy(d_a, a.begin());
  hc::copy(d_b, b.begin());
  hc::copy(d_c, c.begin());
}

template <class T>
void HCStream<T>::copy()
{
  hc::array_view<T,1> view_a = this->d_a;
  hc::array_view<T,1> view_c = this->d_c;

  try{
    hc::completion_future future_kernel = hc::parallel_for_each(hc::extent<1>(array_size)
                                          , [=](hc::index<1> index) [[hc]] {
                                            view_c[index] = view_a[index];
                                          });
    future_kernel.wait();
  }
  catch(std::exception& e){
    std::cerr << __FILE__ << ":" << __LINE__ << "\t HCStream<T>::copy " << e.what() << std::endl;
    throw;
  }
}

template <class T>
void HCStream<T>::mul()
{
  const T scalar = startScalar;
  hc::array_view<T,1> view_b = this->d_b;
  hc::array_view<T,1> view_c = this->d_c;

  try{
    hc::completion_future future_kernel = hc::parallel_for_each(hc::extent<1>(array_size)
                                          , [=](hc::index<1> i) [[hc]] {
                                            view_b[i] = scalar*view_c[i];
                                          });
    future_kernel.wait();
  }
  catch(std::exception& e){
    std::cerr << __FILE__ << ":" << __LINE__ << "\t HCStream<T>::mul " << e.what() << std::endl;
    throw;
  }
}
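// The remaining element-wise kernels, add() and triad(), follow the same
// pattern as copy() and mul() above: wrap the device arrays in
// hc::array_view, launch a flat hc::parallel_for_each over array_size
// work-items, and block on the returned hc::completion_future so the kernel
// has finished before the benchmark call returns.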
template <class T>
void HCStream<T>::add()
{
  hc::array_view<T,1> view_a(this->d_a);
  hc::array_view<T,1> view_b(this->d_b);
  hc::array_view<T,1> view_c(this->d_c);

  try{
    hc::completion_future future_kernel = hc::parallel_for_each(hc::extent<1>(array_size)
                                          , [=](hc::index<1> i) [[hc]] {
                                            view_c[i] = view_a[i] + view_b[i];
                                          });
    future_kernel.wait();
  }
  catch(std::exception& e){
    std::cerr << __FILE__ << ":" << __LINE__ << "\t HCStream<T>::add " << e.what() << std::endl;
    throw;
  }
}

template <class T>
void HCStream<T>::triad()
{
  const T scalar = startScalar;
  hc::array_view<T,1> view_a(this->d_a);
  hc::array_view<T,1> view_b(this->d_b);
  hc::array_view<T,1> view_c(this->d_c);

  try{
    hc::completion_future future_kernel = hc::parallel_for_each(hc::extent<1>(array_size)
                                          , [=](hc::index<1> i) [[hc]] {
                                            view_a[i] = view_b[i] + scalar*view_c[i];
                                          });
    future_kernel.wait();
  }
  catch(std::exception& e){
    std::cerr << __FILE__ << ":" << __LINE__ << "\t HCStream<T>::triad " << e.what() << std::endl;
    throw;
  }
}

template <class T>
T HCStream<T>::dot()
{
  // Implementation adapted from
  // https://ampbook.codeplex.com/SourceControl/latest
  //   -> Samples/CaseStudies/Reduction
  //   -> CascadingReduction.h

  const auto& view_a = this->d_a;
  const auto& view_b = this->d_b;

  auto ex = view_a.get_extent();
  const auto tiled_ex = hc::extent<1>(NTILES * VIRTUALTILESIZE).tile(VIRTUALTILESIZE);
  const auto domain_sz = tiled_ex.size();

  hc::array<T,1> partial(NTILES);

  hc::parallel_for_each(tiled_ex,
                        [=, &view_a, &view_b, &partial](const hc::tiled_index<1>& tidx) [[hc]] {

    auto gidx = tidx.global[0];
    T r = T{0}; // Assumes reduction op is addition.

    // Cascade: each work-item strides through the input in steps of the
    // launch-domain size, accumulating a private partial sum.
    while (gidx < view_a.get_extent().size())
    {
      r += view_a[gidx] * view_b[gidx];
      gidx += domain_sz;
    }

    tile_static T tileData[VIRTUALTILESIZE];
    tileData[tidx.local[0]] = r;

    tidx.barrier.wait_with_tile_static_memory_fence();

    // Tree reduction of the private sums within each tile.
    for (auto h = VIRTUALTILESIZE / 2; h; h /= 2)
    {
      if (tidx.local[0] < h)
      {
        tileData[tidx.local[0]] += tileData[tidx.local[0] + h];
      }
      tidx.barrier.wait_with_tile_static_memory_fence();
    }

    // The first work-item of each tile writes out the tile's result.
    if (tidx.global == tidx.tile_origin)
      partial[tidx.tile] = tileData[0];
  });

  try{
    partial.get_accelerator_view().wait();
  }
  catch(std::exception& e){
    std::cerr << __FILE__ << ":" << __LINE__ << "\t HCStream<T>::dot " << e.what() << std::endl;
    throw;
  }

  std::vector<T> h_partial(NTILES, 0);
  hc::copy(partial, h_partial.begin());

  // Final reduction of the per-tile partial sums on the host.
  T result = std::accumulate(h_partial.begin(), h_partial.end(), 0.);

  return result;
}

template class HCStream<float>;
template class HCStream<double>;
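// Usage sketch (hypothetical driver, shown for illustration only; it assumes
// nothing beyond the Stream<T> interface that HCStream<T> implements):
//
//   const int N = 1 << 25;            // must be a multiple of VIRTUALTILESIZE
//   HCStream<double> stream(N, 0);    // allocate the three buffers on device 0
//   stream.init_arrays(0.1, 0.2, 0.0);
//   stream.copy();                    // c = a
//   stream.mul();                     // b = scalar * c
//   stream.add();                     // c = a + b
//   stream.triad();                   // a = b + scalar * c
//   double sum = stream.dot();        // sum over a[i] * b[i]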