From 0e45f8658839e0cd8bd1c3951e0a9004dd3a76d7 Mon Sep 17 00:00:00 2001 From: Peter Steinbach Date: Fri, 24 Mar 2017 15:19:48 +0100 Subject: [PATCH] added cascaded reduction based on C++AMP book --- HCStream.cpp | 73 ++++++++++++++++++++++++++++++++++++++++++---------- 1 file changed, 59 insertions(+), 14 deletions(-) diff --git a/HCStream.cpp b/HCStream.cpp index 10000b2..4513814 100644 --- a/HCStream.cpp +++ b/HCStream.cpp @@ -205,34 +205,79 @@ template T HCStream::dot() { + //implementation adapted from + //https://ampbook.codeplex.com/SourceControl/latest + // ->Samples/CaseStudies/Reduction + // ->CascadingReduction.h + hc::array_view view_a(this->d_a); hc::array_view view_b(this->d_b); - hc::array d_sum(array_view); - hc::array_view view_s(d_sum) ; auto ex = view_a.get_extent(); - hc::tiled_extent<1> tiled_ex = ex.tile(64); + hc::tiled_extent<1> tiled_ex = ex.tile(TBSIZE); + + const size_t n_tiles = 64; + const size_t n_elements = array_size; + // hc::array d_product(array_size); + // hc::array_view view_p(d_product) ; + + hc::array partial(n_tiles*TBSIZE); + hc::array_view partialv(partial) ; + + hc::completion_future dot_kernel = hc::parallel_for_each(tiled_ex, + [=](hc::tiled_index<1> tidx) [[hc]] { + + std::size_t tid = tidx.local[0];//index in the tile + + tile_static T tileData[TBSIZE]; + + std::size_t i = (tidx.tile[0] * 2 * TBSIZE) + tid; + std::size_t stride = TBSIZE * 2 * n_tiles; + + // Load and add many elements, rather than just two + T sum = 0; + do + { + T near = view_a[i]*view_b[i]; + T far = view_a[i+TBSIZE]*view_b[i+TBSIZE]; + sum += (far + near); + i += stride; + } + while (i < n_elements); + tileData[tid] = sum; + + tidx.barrier.wait(); + + // Reduce values for data on this tile + for (stride = (TBSIZE / 2); stride > 0; stride >>= 1) + { + // Remember that this is a branch within a loop and all threads will have to execute + // this but only threads with a tid < stride will do useful work. + if (tid < stride) + tileData[tid] += tileData[tid + stride]; + + tidx.barrier.wait_with_tile_static_memory_fence(); + } + + // Write the result for this tile back to global memory + if (tid == 0) + partialv[tidx.tile[0]] = tileData[tid]; + }); try{ - hc::completion_future future_kernel = hc::parallel_for_each(tiled_ex, - [=](hc::index<1> i) [[hc]] { - view_s[i] = view_p[i]*view_a[i]; - }); - future_kernel.wait(); + dot_kernel.wait(); } catch(std::exception& e){ std::cerr << __FILE__ << ":" << __LINE__ << "\t" << e.what() << std::endl; throw; } - T sum = 0; - std::vector h_product(array_size,sum); - hc::copy(view_p,h_product.begin()); + std::vector h_partial(n_tiles); + hc::copy(partial, h_partial.begin()); + T result = std::accumulate(h_partial.begin(), h_partial.end(), 0.); - sum = std::accumulate(h_product.begin(), h_product.end(),sum); - - return sum; + return result; }