From 7621f867017e1c4e4a503af56744fb186698f360 Mon Sep 17 00:00:00 2001 From: Peter Steinbach Date: Tue, 3 Jan 2017 11:43:12 +0100 Subject: [PATCH 01/23] added pure HC gpu stream implmentation --- CMakeLists.txt | 29 +++++++ HCStream.cpp | 213 +++++++++++++++++++++++++++++++++++++++++++++++++ HCStream.h | 44 ++++++++++ main.cpp | 6 ++ 4 files changed, 292 insertions(+) create mode 100644 HCStream.cpp create mode 100644 HCStream.h diff --git a/CMakeLists.txt b/CMakeLists.txt index efee733..5551c54 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -10,6 +10,15 @@ if(NOT DEFINED HIP_PATH) endif() set(CMAKE_MODULE_PATH "${HIP_PATH}/cmake" ${CMAKE_MODULE_PATH}) +if(NOT DEFINED HCC_PATH) + if(NOT DEFINED ENV{HCC_PATH}) + set(HCC_PATH "/opt/rocm/" CACHE PATH "Path to which HCC has been installed") + else() + set(HCC_PATH $ENV{HCC_PATH} CACHE PATH "Path to which HCC has been installed") + endif() +endif() + + set(CMAKE_CXX_STANDARD 11) set(CMAKE_CXX_STANDARD_REQUIRED ON) @@ -49,6 +58,26 @@ else() message("Skipping HIP...") endif() +#------------------------------------------------------------------------------- +# HCC +#------------------------------------------------------------------------------- +find_program(HCC_BINARY hcc HINTS ${HCC_PATH}/bin PATHS ${HCC_PATH}/bin) +if(EXISTS ${HCC_BINARY}) + #can the following be tied to the target only? + set(CMAKE_CXX_COMPILER ${HCC_BINARY}) + # list(APPEND CMAKE_CXX_FLAGS -hc -I /usr/include/c++/v1 -std=c++11 -stdlib=libc++ -I/opt/rocm/hcc-lc/include) + # list(APPEND CMAKE_EXE_LINKER_FLAGS -hc -std=c++11 -L/opt/rocm/hcc-lc/lib -Wl,--rpath=/opt/rocm/hcc-lc/lib -lc++ -lc++abi -ldl -lpthread -Wl,--whole-archive -lmcwamp -Wl,--no-whole-archive -mcmodel=small ) + add_executable(gpu-stream-hc main.cpp HCStream.cpp) + set_target_properties(gpu-stream-hc PROPERTIES + COMPILE_FLAGS "-hc -I /usr/include/c++/v1 -std=c++11 -stdlib=libc++ -I/opt/rocm/hcc-lc/include" + LINK_FLAGS "-hc -std=c++11 -L/opt/rocm/hcc-lc/lib -Wl,--rpath=/opt/rocm/hcc-lc/lib -lc++ -lc++abi -ldl -lpthread -Wl,--whole-archive -lmcwamp -Wl,--no-whole-archive -mcmodel=small" + ) + target_compile_definitions(gpu-stream-hc PUBLIC HC) +else() + message("Skipping HC...") +endif() + + #------------------------------------------------------------------------------- # CUDA #------------------------------------------------------------------------------- diff --git a/HCStream.cpp b/HCStream.cpp new file mode 100644 index 0000000..0c55af3 --- /dev/null +++ b/HCStream.cpp @@ -0,0 +1,213 @@ +// Copyright (c) 2015-16 Peter Steinbach, MPI CBG Scientific Computing Facility +// +// For full license terms please see the LICENSE file distributed with this +// source code + + +#include +#include +#include + + +#include "HCStream.h" +//#include "hc.hpp" + +#define TBSIZE 1024 + +std::string getDeviceName(const hc::accelerator& _acc) +{ + std::wstring_convert, wchar_t> converter; + std::string value = converter.to_bytes(_acc.get_description()); + return value; +} + +void listDevices(void) +{ + // Get number of devices + std::vector accs = hc::accelerator::get_all(); + + // Print device names + if (accs.empty()) + { + std::cerr << "No devices found." << std::endl; + } + else + { + std::cout << std::endl; + std::cout << "Devices:" << std::endl; + for (int i = 0; i < accs.size(); i++) + { + std::cout << i << ": " << getDeviceName(accs[i]) << std::endl; + } + std::cout << std::endl; + } +} + +// void check_error(void) +// { +// hipError_t err = hipGetLastError(); +// if (err != hipSuccess) +// { +// std::cerr << "Error: " << hipGetErrorString(err) << std::endl; +// exit(err); +// } +// } + +template +HCStream::HCStream(const unsigned int ARRAY_SIZE, const int device_index): + array_size(ARRAY_SIZE), + d_a(ARRAY_SIZE), + d_b(ARRAY_SIZE), + d_c(ARRAY_SIZE) +{ + + // The array size must be divisible by TBSIZE for kernel launches + if (ARRAY_SIZE % TBSIZE != 0) + { + std::stringstream ss; + ss << "Array size must be a multiple of " << TBSIZE; + throw std::runtime_error(ss.str()); + } + + // // Set device + std::vector accs = hc::accelerator::get_all(); + auto current = accs[device_index]; + + std::cout << "Using HC device " << getDeviceName(current) << std::endl; + + // // The array size must be divisible by TBSIZE for kernel launches + // if (ARRAY_SIZE % TBSIZE != 0) + // { + // std::stringstream ss; + // ss << "Array size must be a multiple of " << TBSIZE; + // throw std::runtime_error(ss.str()); + // } + + // // Set device + // int count; + // hipGetDeviceCount(&count); + // check_error(); + // if (device_index >= count) + // throw std::runtime_error("Invalid device index"); + // hipSetDevice(device_index); + // check_error(); + + // // Print out device information + // std::cout << "Using HIP device " << getDeviceName(device_index) << std::endl; + // std::cout << "Driver: " << getDeviceDriver(device_index) << std::endl; + + // array_size = ARRAY_SIZE; + + // // Check buffers fit on the device + // hipDeviceProp_t props; + // hipGetDeviceProperties(&props, 0); + // if (props.totalGlobalMem < 3*ARRAY_SIZE*sizeof(T)) + // throw std::runtime_error("Device does not have enough memory for all 3 buffers"); + + // // Create device buffers + // hipMalloc(&d_a, ARRAY_SIZE*sizeof(T)); + // check_error(); + // hipMalloc(&d_b, ARRAY_SIZE*sizeof(T)); + // check_error(); + // hipMalloc(&d_c, ARRAY_SIZE*sizeof(T)); + // check_error(); + + +} + + +template +HCStream::~HCStream() +{ +} + +template +void HCStream::write_arrays(const std::vector& a, const std::vector& b, const std::vector& c) +{ + hc::copy(a.cbegin(),a.cend(),d_a); + hc::copy(b.cbegin(),b.cend(),d_b); + hc::copy(c.cbegin(),c.cend(),d_c); +} + +template +void HCStream::read_arrays(std::vector& a, std::vector& b, std::vector& c) +{ + // Copy device memory to host + hc::copy(d_a,a.begin()); + hc::copy(d_b,b.begin()); + hc::copy(d_c,c.begin()); +} + + +template +void HCStream::copy() +{ + try{ + // launch a GPU kernel to compute the saxpy in parallel + hc::completion_future future_kernel = hc::parallel_for_each(hc::extent<1>(array_size) + , [&](hc::index<1> i) [[hc]] { + d_c[i] = d_a[i]; + }); + future_kernel.wait(); + } + catch(std::exception& e){ + std::cout << e.what() << std::endl; + throw; + } +} + +template +void HCStream::mul() +{ + const T scalar = 0.3; + try{ + // launch a GPU kernel to compute the saxpy in parallel + hc::completion_future future_kernel = hc::parallel_for_each(hc::extent<1>(array_size) + , [&](hc::index<1> i) [[hc]] { + d_b[i] = scalar*d_c[i]; + }); + future_kernel.wait(); + } + catch(std::exception& e){ + std::cout << e.what() << std::endl; + throw; + } +} + +template +void HCStream::add() +{ + try{ + // launch a GPU kernel to compute the saxpy in parallel + hc::completion_future future_kernel = hc::parallel_for_each(hc::extent<1>(array_size) + , [&](hc::index<1> i) [[hc]] { + d_c[i] = d_a[i]+d_b[i]; + }); + future_kernel.wait(); + } + catch(std::exception& e){ + std::cout << e.what() << std::endl; + throw; + } +} + +template +void HCStream::triad() +{ + const T scalar = 0.3; + try{ + // launch a GPU kernel to compute the saxpy in parallel + hc::completion_future future_kernel = hc::parallel_for_each(hc::extent<1>(array_size) + , [&](hc::index<1> i) [[hc]] { + d_a[i] = d_b[i] + scalar*d_c[i]; + }); + future_kernel.wait(); + } + catch(std::exception& e){ + std::cout << e.what() << std::endl; + throw; + } +} + +template class HCStream; +template class HCStream; diff --git a/HCStream.h b/HCStream.h new file mode 100644 index 0000000..a625be0 --- /dev/null +++ b/HCStream.h @@ -0,0 +1,44 @@ + +// Copyright (c) 2015-16 Tom Deakin, Simon McIntosh-Smith, +// University of Bristol HPC +// +// For full license terms please see the LICENSE file distributed with this +// source code + +#pragma once + +#include +#include +#include + +#include "Stream.h" +#include "hc.hpp" + +#define IMPLEMENTATION_STRING "HC" + +template +class HCStream : public Stream +{ +protected: + // Size of arrays + unsigned int array_size; + // Device side pointers to arrays + hc::array d_a; + hc::array d_b; + hc::array d_c; + + +public: + + HCStream(const unsigned int, const int); + ~HCStream(); + + virtual void copy() override; + virtual void add() override; + virtual void mul() override; + virtual void triad() override; + + virtual void write_arrays(const std::vector& a, const std::vector& b, const std::vector& c) override; + virtual void read_arrays(std::vector& a, std::vector& b, std::vector& c) override; + +}; diff --git a/main.cpp b/main.cpp index 6a15aa7..50ab648 100644 --- a/main.cpp +++ b/main.cpp @@ -22,6 +22,8 @@ #include "CUDAStream.h" #elif defined(HIP) #include "HIPStream.h" +#elif defined(HC) +#include "HCStream.h" #elif defined(OCL) #include "OCLStream.h" #elif defined(USE_RAJA) @@ -105,6 +107,10 @@ void run() // Use the HIP implementation stream = new HIPStream(ARRAY_SIZE, deviceIndex); +#elif defined(HC) + // Use the HC implementation + stream = new HCStream(ARRAY_SIZE, deviceIndex); + #elif defined(OCL) // Use the OpenCL implementation stream = new OCLStream(ARRAY_SIZE, deviceIndex); From 47d2bf275f05cf5711ab506ac181b73c58eece79 Mon Sep 17 00:00:00 2001 From: Peter Steinbach Date: Thu, 12 Jan 2017 15:16:24 +0100 Subject: [PATCH 02/23] fixed setting the accelerator, CMakeLists still needs some tweeks to not build OpenCL --- CMakeLists.txt | 52 +++++++++++++++++++++++++++++++++----------------- HCStream.cpp | 32 ++++++++++++++++++++++++------- 2 files changed, 59 insertions(+), 25 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 5551c54..c2bc5d7 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -58,24 +58,6 @@ else() message("Skipping HIP...") endif() -#------------------------------------------------------------------------------- -# HCC -#------------------------------------------------------------------------------- -find_program(HCC_BINARY hcc HINTS ${HCC_PATH}/bin PATHS ${HCC_PATH}/bin) -if(EXISTS ${HCC_BINARY}) - #can the following be tied to the target only? - set(CMAKE_CXX_COMPILER ${HCC_BINARY}) - # list(APPEND CMAKE_CXX_FLAGS -hc -I /usr/include/c++/v1 -std=c++11 -stdlib=libc++ -I/opt/rocm/hcc-lc/include) - # list(APPEND CMAKE_EXE_LINKER_FLAGS -hc -std=c++11 -L/opt/rocm/hcc-lc/lib -Wl,--rpath=/opt/rocm/hcc-lc/lib -lc++ -lc++abi -ldl -lpthread -Wl,--whole-archive -lmcwamp -Wl,--no-whole-archive -mcmodel=small ) - add_executable(gpu-stream-hc main.cpp HCStream.cpp) - set_target_properties(gpu-stream-hc PROPERTIES - COMPILE_FLAGS "-hc -I /usr/include/c++/v1 -std=c++11 -stdlib=libc++ -I/opt/rocm/hcc-lc/include" - LINK_FLAGS "-hc -std=c++11 -L/opt/rocm/hcc-lc/lib -Wl,--rpath=/opt/rocm/hcc-lc/lib -lc++ -lc++abi -ldl -lpthread -Wl,--whole-archive -lmcwamp -Wl,--no-whole-archive -mcmodel=small" - ) - target_compile_definitions(gpu-stream-hc PUBLIC HC) -else() - message("Skipping HC...") -endif() #------------------------------------------------------------------------------- @@ -110,6 +92,9 @@ else () message("Skipping OpenCL...") endif () + + + #------------------------------------------------------------------------------- # OpenACC #------------------------------------------------------------------------------- @@ -187,6 +172,37 @@ else() message("Skipping Kokkos... (use -DKOKKOS_PATH=/path/to/kokkos to opt in)") endif() +#------------------------------------------------------------------------------- +# HCC +#------------------------------------------------------------------------------- +find_program(HCC_BINARY hcc HINTS ${HCC_PATH}/bin PATHS ${HCC_PATH}/bin) +if(EXISTS ${HCC_BINARY}) + #can the following be tied to the target only? + set(OLD_CMAKE_CXX_COMPILER ${CMAKE_CXX_COMPILER}) + set(CMAKE_CXX_COMPILER ${HCC_BINARY}) + + # list(APPEND CMAKE_CXX_FLAGS -hc -I /usr/include/c++/v1 -std=c++11 -stdlib=libc++ -I/opt/rocm/hcc-lc/include) + # list(APPEND CMAKE_EXE_LINKER_FLAGS -hc -std=c++11 -L/opt/rocm/hcc-lc/lib -Wl,--rpath=/opt/rocm/hcc-lc/lib -lc++ -lc++abi -ldl -lpthread -Wl,--whole-archive -lmcwamp -Wl,--no-whole-archive -mcmodel=small ) + add_executable(gpu-stream-hc main.cpp HCStream.cpp) + set_target_properties(gpu-stream-hc PROPERTIES + COMPILE_FLAGS "-hc -I/usr/include/c++/v1 -std=c++11 -stdlib=libc++ -I/opt/rocm/hcc-lc/include" + LINK_FLAGS "-hc -std=c++11 -L/opt/rocm/hcc-lc/lib -Wl,--rpath=/opt/rocm/hcc-lc/lib -lc++ -lc++abi -ldl -lpthread -Wl,--whole-archive -lmcwamp -Wl,--no-whole-archive -mcmodel=small" + ) + message("OpenCL magic: ${OpenCL_LIBRARY} ${OpenCL_INCLUDE_DIR}") + set_property(TARGET gpu-stream-ocl APPEND PROPERTY COMPILE_FLAGS "-I/opt/rocm/opencl/include/ -stdlib=libc++ -I/opt/rocm/hcc-lc/include") + set_property(TARGET gpu-stream-ocl APPEND PROPERTY LINK_FLAGS "-L/opt/rocm/opencl/lib/x86_64 -std=c++11 -L/opt/rocm/hcc-lc/lib -Wl,--rpath=/opt/rocm/hcc-lc/lib -lc++ -lc++abi ") + # set_target_properties(gpu-stream-ocl PROPERTIES + # # COMPILE_FLAGS "-I/opt/rocm/opencl/include/opencl1.2" + # LINK_FLAGS "-I /opt/rocm/opencl/include/opencl1.2 -L /opt/rocm/opencl/lib/x86_64 -lOpenCL -std=c++11" + # ) + target_compile_definitions(gpu-stream-hc PUBLIC HC) + if(TARGET gpu-stream-ocl) + unset(gpu-stream-ocl) + endif() +else() + message("Skipping HC...") +endif() + #------------------------------------------------------------------------------- # SYCL #------------------------------------------------------------------------------- diff --git a/HCStream.cpp b/HCStream.cpp index 0c55af3..8cea1dc 100644 --- a/HCStream.cpp +++ b/HCStream.cpp @@ -72,7 +72,9 @@ HCStream::HCStream(const unsigned int ARRAY_SIZE, const int device_index): // // Set device std::vector accs = hc::accelerator::get_all(); auto current = accs[device_index]; - + + hc::accelerator::set_default(current.get_device_path()); + std::cout << "Using HC device " << getDeviceName(current) << std::endl; // // The array size must be divisible by TBSIZE for kernel launches @@ -142,11 +144,15 @@ void HCStream::read_arrays(std::vector& a, std::vector& b, std::vector< template void HCStream::copy() { + + hc::array& device_a = this->d_a; + hc::array& device_c = this->d_c; + try{ - // launch a GPU kernel to compute the saxpy in parallel + // launch a GPU kernel to compute the saxpy in parallel hc::completion_future future_kernel = hc::parallel_for_each(hc::extent<1>(array_size) - , [&](hc::index<1> i) [[hc]] { - d_c[i] = d_a[i]; + , [&](hc::index<1> index) [[hc]] { + device_c[index] = device_a[index]; }); future_kernel.wait(); } @@ -160,11 +166,14 @@ template void HCStream::mul() { const T scalar = 0.3; + hc::array& device_b = this->d_b; + hc::array& device_c = this->d_c; + try{ // launch a GPU kernel to compute the saxpy in parallel hc::completion_future future_kernel = hc::parallel_for_each(hc::extent<1>(array_size) , [&](hc::index<1> i) [[hc]] { - d_b[i] = scalar*d_c[i]; + device_b[i] = scalar*device_c[i]; }); future_kernel.wait(); } @@ -177,11 +186,16 @@ void HCStream::mul() template void HCStream::add() { + + hc::array& device_a = this->d_a; + hc::array& device_b = this->d_b; + hc::array& device_c = this->d_c; + try{ // launch a GPU kernel to compute the saxpy in parallel hc::completion_future future_kernel = hc::parallel_for_each(hc::extent<1>(array_size) , [&](hc::index<1> i) [[hc]] { - d_c[i] = d_a[i]+d_b[i]; + device_c[i] = device_a[i]+device_b[i]; }); future_kernel.wait(); } @@ -195,11 +209,15 @@ template void HCStream::triad() { const T scalar = 0.3; + hc::array& device_a = this->d_a; + hc::array& device_b = this->d_b; + hc::array& device_c = this->d_c; + try{ // launch a GPU kernel to compute the saxpy in parallel hc::completion_future future_kernel = hc::parallel_for_each(hc::extent<1>(array_size) , [&](hc::index<1> i) [[hc]] { - d_a[i] = d_b[i] + scalar*d_c[i]; + device_a[i] = device_b[i] + scalar*device_c[i]; }); future_kernel.wait(); } From e520965bc57144ee3d5f5ef40d1b387553c73966 Mon Sep 17 00:00:00 2001 From: Peter Steinbach Date: Fri, 13 Jan 2017 10:53:51 +0100 Subject: [PATCH 03/23] all detected targets compile, opencl segfaults --- CMakeLists.txt | 25 ++++++++++--------------- 1 file changed, 10 insertions(+), 15 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index c2bc5d7..1c385e2 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -180,25 +180,20 @@ if(EXISTS ${HCC_BINARY}) #can the following be tied to the target only? set(OLD_CMAKE_CXX_COMPILER ${CMAKE_CXX_COMPILER}) set(CMAKE_CXX_COMPILER ${HCC_BINARY}) - - # list(APPEND CMAKE_CXX_FLAGS -hc -I /usr/include/c++/v1 -std=c++11 -stdlib=libc++ -I/opt/rocm/hcc-lc/include) - # list(APPEND CMAKE_EXE_LINKER_FLAGS -hc -std=c++11 -L/opt/rocm/hcc-lc/lib -Wl,--rpath=/opt/rocm/hcc-lc/lib -lc++ -lc++abi -ldl -lpthread -Wl,--whole-archive -lmcwamp -Wl,--no-whole-archive -mcmodel=small ) + add_executable(gpu-stream-hc main.cpp HCStream.cpp) set_target_properties(gpu-stream-hc PROPERTIES - COMPILE_FLAGS "-hc -I/usr/include/c++/v1 -std=c++11 -stdlib=libc++ -I/opt/rocm/hcc-lc/include" - LINK_FLAGS "-hc -std=c++11 -L/opt/rocm/hcc-lc/lib -Wl,--rpath=/opt/rocm/hcc-lc/lib -lc++ -lc++abi -ldl -lpthread -Wl,--whole-archive -lmcwamp -Wl,--no-whole-archive -mcmodel=small" + COMPILE_FLAGS "-hc -std=c++11 -stdlib=libc++ -I/opt/rocm/hcc-lc/include" + LINK_FLAGS "-hc -L/opt/rocm/hcc-lc/lib -Wl,--rpath=/opt/rocm/hcc-lc/lib -lc++ -lc++abi -ldl -Wl,--whole-archive -lmcwamp -Wl,--no-whole-archive" ) - message("OpenCL magic: ${OpenCL_LIBRARY} ${OpenCL_INCLUDE_DIR}") - set_property(TARGET gpu-stream-ocl APPEND PROPERTY COMPILE_FLAGS "-I/opt/rocm/opencl/include/ -stdlib=libc++ -I/opt/rocm/hcc-lc/include") - set_property(TARGET gpu-stream-ocl APPEND PROPERTY LINK_FLAGS "-L/opt/rocm/opencl/lib/x86_64 -std=c++11 -L/opt/rocm/hcc-lc/lib -Wl,--rpath=/opt/rocm/hcc-lc/lib -lc++ -lc++abi ") - # set_target_properties(gpu-stream-ocl PROPERTIES - # # COMPILE_FLAGS "-I/opt/rocm/opencl/include/opencl1.2" - # LINK_FLAGS "-I /opt/rocm/opencl/include/opencl1.2 -L /opt/rocm/opencl/lib/x86_64 -lOpenCL -std=c++11" - # ) target_compile_definitions(gpu-stream-hc PUBLIC HC) - if(TARGET gpu-stream-ocl) - unset(gpu-stream-ocl) - endif() + + set_property(TARGET gpu-stream-ocl APPEND PROPERTY COMPILE_FLAGS "-I/opt/rocm/opencl/include/ -stdlib=libstdc++") + set_property(TARGET gpu-stream-ocl APPEND PROPERTY LINK_FLAGS "-L/opt/rocm/opencl/lib/x86_64 -lstdc++ ") + + set_property(TARGET gpu-stream-omp3 APPEND PROPERTY COMPILE_FLAGS "-I/opt/rocm/opencl/include/ -stdlib=libstdc++") + set_property(TARGET gpu-stream-omp3 APPEND PROPERTY LINK_FLAGS "-L/opt/rocm/opencl/lib/x86_64 -lstdc++ ") + else() message("Skipping HC...") endif() From 0fc67226845819e8ef47166c790e81be2ae163f0 Mon Sep 17 00:00:00 2001 From: Peter Steinbach Date: Mon, 27 Feb 2017 16:35:03 +0100 Subject: [PATCH 04/23] added Makefile and code for HC --- HC.make | 16 ++++++++ HCStream.cpp | 104 +++++++++++++++++++++++---------------------------- HCStream.h | 3 +- 3 files changed, 64 insertions(+), 59 deletions(-) create mode 100644 HC.make diff --git a/HC.make b/HC.make new file mode 100644 index 0000000..3ec099e --- /dev/null +++ b/HC.make @@ -0,0 +1,16 @@ + +# TODO: HC with HCC + +HCC = hcc + + + +CXXFLAGS+=-hc -stdlib=libc++ -I/opt/rocm/hcc-lc/include +LDFLAGS+=-hc -L/opt/rocm/hcc-lc/lib -Wl,--rpath=/opt/rocm/hcc-lc/lib -lc++ -lc++abi -ldl -Wl,--whole-archive -lmcwamp -Wl,--no-whole-archive + +hc-stream: main.cpp HCStream.cpp + $(HCC) $(CXXFLAGS) -DHC $^ $(LDFLAGS) $(EXTRA_FLAGS) -o $@ + +.PHONY: clean +clean: + rm -f hc-stream diff --git a/HCStream.cpp b/HCStream.cpp index 8cea1dc..2aa617c 100644 --- a/HCStream.cpp +++ b/HCStream.cpp @@ -7,10 +7,9 @@ #include #include #include - +#include #include "HCStream.h" -//#include "hc.hpp" #define TBSIZE 1024 @@ -25,7 +24,7 @@ void listDevices(void) { // Get number of devices std::vector accs = hc::accelerator::get_all(); - + // Print device names if (accs.empty()) { @@ -43,15 +42,6 @@ void listDevices(void) } } -// void check_error(void) -// { -// hipError_t err = hipGetLastError(); -// if (err != hipSuccess) -// { -// std::cerr << "Error: " << hipGetErrorString(err) << std::endl; -// exit(err); -// } -// } template HCStream::HCStream(const unsigned int ARRAY_SIZE, const int device_index): @@ -76,45 +66,7 @@ HCStream::HCStream(const unsigned int ARRAY_SIZE, const int device_index): hc::accelerator::set_default(current.get_device_path()); std::cout << "Using HC device " << getDeviceName(current) << std::endl; - - // // The array size must be divisible by TBSIZE for kernel launches - // if (ARRAY_SIZE % TBSIZE != 0) - // { - // std::stringstream ss; - // ss << "Array size must be a multiple of " << TBSIZE; - // throw std::runtime_error(ss.str()); - // } - // // Set device - // int count; - // hipGetDeviceCount(&count); - // check_error(); - // if (device_index >= count) - // throw std::runtime_error("Invalid device index"); - // hipSetDevice(device_index); - // check_error(); - - // // Print out device information - // std::cout << "Using HIP device " << getDeviceName(device_index) << std::endl; - // std::cout << "Driver: " << getDeviceDriver(device_index) << std::endl; - - // array_size = ARRAY_SIZE; - - // // Check buffers fit on the device - // hipDeviceProp_t props; - // hipGetDeviceProperties(&props, 0); - // if (props.totalGlobalMem < 3*ARRAY_SIZE*sizeof(T)) - // throw std::runtime_error("Device does not have enough memory for all 3 buffers"); - - // // Create device buffers - // hipMalloc(&d_a, ARRAY_SIZE*sizeof(T)); - // check_error(); - // hipMalloc(&d_b, ARRAY_SIZE*sizeof(T)); - // check_error(); - // hipMalloc(&d_c, ARRAY_SIZE*sizeof(T)); - // check_error(); - - } @@ -124,11 +76,17 @@ HCStream::~HCStream() } template -void HCStream::write_arrays(const std::vector& a, const std::vector& b, const std::vector& c) +void HCStream::init_arrays(T _a, T _b, T _c) { - hc::copy(a.cbegin(),a.cend(),d_a); - hc::copy(b.cbegin(),b.cend(),d_b); - hc::copy(c.cbegin(),c.cend(),d_c); + std::vector temp(array_size,_a); + hc::copy(temp.begin(), temp.end(),this->d_a); + + std::fill(temp.begin(), temp.end(),_b); + hc::copy(temp.begin(), temp.end(),this->d_b); + + std::fill(temp.begin(), temp.end(),_c); + hc::copy(temp.begin(), temp.end(),this->d_c); + } template @@ -157,7 +115,7 @@ void HCStream::copy() future_kernel.wait(); } catch(std::exception& e){ - std::cout << e.what() << std::endl; + std::cout << __FILE__ << ":" << __LINE__ << "\t" << e.what() << std::endl; throw; } } @@ -178,7 +136,7 @@ void HCStream::mul() future_kernel.wait(); } catch(std::exception& e){ - std::cout << e.what() << std::endl; + std::cout << __FILE__ << ":" << __LINE__ << "\t" << e.what() << std::endl; throw; } } @@ -200,7 +158,7 @@ void HCStream::add() future_kernel.wait(); } catch(std::exception& e){ - std::cout << e.what() << std::endl; + std::cout << __FILE__ << ":" << __LINE__ << "\t" << e.what() << std::endl; throw; } } @@ -222,10 +180,40 @@ void HCStream::triad() future_kernel.wait(); } catch(std::exception& e){ - std::cout << e.what() << std::endl; + std::cout << __FILE__ << ":" << __LINE__ << "\t" << e.what() << std::endl; throw; } } +template +T HCStream::dot() +{ + hc::array& device_a = this->d_a; + hc::array product = this->d_b; + + T sum = static_cast(0); + + try{ + // launch a GPU kernel to compute the saxpy in parallel + hc::completion_future future_kernel = hc::parallel_for_each(hc::extent<1>(array_size) + , [&](hc::index<1> i) [[hc]] { + product[i] *= device_a[i]; + }); + future_kernel.wait(); + } + catch(std::exception& e){ + std::cout << __FILE__ << ":" << __LINE__ << "\t" << e.what() << std::endl; + throw; + } + + std::vector h_product(array_size,sum); + hc::copy(product,h_product.begin()); + + sum = std::accumulate(h_product.begin(), h_product.end(),sum); + + return sum; +} + + template class HCStream; template class HCStream; diff --git a/HCStream.h b/HCStream.h index a625be0..950b370 100644 --- a/HCStream.h +++ b/HCStream.h @@ -37,8 +37,9 @@ public: virtual void add() override; virtual void mul() override; virtual void triad() override; + virtual T dot() override; - virtual void write_arrays(const std::vector& a, const std::vector& b, const std::vector& c) override; + virtual void init_arrays(T initA, T initB, T initC) override; virtual void read_arrays(std::vector& a, std::vector& b, std::vector& c) override; }; From f7af8ebc91ac6602240d5c9696fab408c97bf049 Mon Sep 17 00:00:00 2001 From: Peter Steinbach Date: Mon, 27 Feb 2017 17:01:38 +0100 Subject: [PATCH 05/23] added printf style error messages to nail down memory access problems --- HCStream.cpp | 23 ++++++++++++++++------- 1 file changed, 16 insertions(+), 7 deletions(-) diff --git a/HCStream.cpp b/HCStream.cpp index 2aa617c..a91f6a3 100644 --- a/HCStream.cpp +++ b/HCStream.cpp @@ -78,6 +78,7 @@ HCStream::~HCStream() template void HCStream::init_arrays(T _a, T _b, T _c) { + std::cout << __FILE__ << ":" << __LINE__ << "\t" << "initializing arrays\n"; std::vector temp(array_size,_a); hc::copy(temp.begin(), temp.end(),this->d_a); @@ -93,6 +94,7 @@ template void HCStream::read_arrays(std::vector& a, std::vector& b, std::vector& c) { // Copy device memory to host + std::cout << __FILE__ << ":" << __LINE__ << "\t" << "read arrays\n"; hc::copy(d_a,a.begin()); hc::copy(d_b,b.begin()); hc::copy(d_c,c.begin()); @@ -103,6 +105,7 @@ template void HCStream::copy() { + std::cout << __FILE__ << ":" << __LINE__ << "\t" << "copy\n"; hc::array& device_a = this->d_a; hc::array& device_c = this->d_c; @@ -123,6 +126,7 @@ void HCStream::copy() template void HCStream::mul() { + std::cout << __FILE__ << ":" << __LINE__ << "\t" << "mul\n"; const T scalar = 0.3; hc::array& device_b = this->d_b; hc::array& device_c = this->d_c; @@ -144,7 +148,7 @@ void HCStream::mul() template void HCStream::add() { - + std::cout << __FILE__ << ":" << __LINE__ << "\t" << "add\n"; hc::array& device_a = this->d_a; hc::array& device_b = this->d_b; hc::array& device_c = this->d_c; @@ -166,6 +170,7 @@ void HCStream::add() template void HCStream::triad() { + std::cout << __FILE__ << ":" << __LINE__ << "\t" << "triad\n"; const T scalar = 0.3; hc::array& device_a = this->d_a; hc::array& device_b = this->d_b; @@ -188,16 +193,17 @@ void HCStream::triad() template T HCStream::dot() { - hc::array& device_a = this->d_a; - hc::array product = this->d_b; + std::cout << __FILE__ << ":" << __LINE__ << "\t" << "dot\n"; + hc::array_view view_a(this->d_a); + hc::array_view view_p(this->d_b); T sum = static_cast(0); try{ // launch a GPU kernel to compute the saxpy in parallel - hc::completion_future future_kernel = hc::parallel_for_each(hc::extent<1>(array_size) - , [&](hc::index<1> i) [[hc]] { - product[i] *= device_a[i]; + hc::completion_future future_kernel = hc::parallel_for_each(view_a.get_extent(), + [&](hc::index<1> i) [[hc]] { + view_p[i] = view_p[i]*view_a[i]; }); future_kernel.wait(); } @@ -206,8 +212,11 @@ T HCStream::dot() throw; } + std::cout << __FILE__ << ":" << __LINE__ << "\t" << "dot - for_each done\n"; std::vector h_product(array_size,sum); - hc::copy(product,h_product.begin()); + hc::copy(view_p,h_product.begin()); + + std::cout << __FILE__ << ":" << __LINE__ << "\t" << "dot - copy-out done\n"; sum = std::accumulate(h_product.begin(), h_product.end(),sum); From 383c5a3ae70fa767ae91382a8db9d59eab6561d4 Mon Sep 17 00:00:00 2001 From: Peter Steinbach Date: Tue, 28 Feb 2017 10:00:44 +0100 Subject: [PATCH 06/23] all required operations implemented, errors are too large --- HCStream.cpp | 108 +++++++++++++++++++++++++++++++++++---------------- 1 file changed, 74 insertions(+), 34 deletions(-) diff --git a/HCStream.cpp b/HCStream.cpp index a91f6a3..6c59fe0 100644 --- a/HCStream.cpp +++ b/HCStream.cpp @@ -78,7 +78,49 @@ HCStream::~HCStream() template void HCStream::init_arrays(T _a, T _b, T _c) { - std::cout << __FILE__ << ":" << __LINE__ << "\t" << "initializing arrays\n"; + // hc::array_view view_a(this->d_a); + // hc::array_view view_b(this->d_b); + // hc::array_view view_c(this->d_c); + + // hc::completion_future future_a= hc::parallel_for_each(hc::extent<1>(array_size) + // , [=](hc::index<1> i) [[hc]] { + // view_a[i] = _a; + // }); + + // hc::completion_future future_b= hc::parallel_for_each(hc::extent<1>(array_size) + // , [=](hc::index<1> i) [[hc]] { + // view_b[i] = _b; + // }); + // hc::completion_future future_c= hc::parallel_for_each(hc::extent<1>(array_size) + // , [=](hc::index<1> i) [[hc]] { + // view_c[i] = _c; + // }); + // try{ + // future_a.wait(); + // } + // catch(std::exception& e){ + // std::cout << __FILE__ << ":" << __LINE__ << "\t future_a " << e.what() << std::endl; + // throw; + // } + + // try{ + // future_b.wait(); + // } + // catch(std::exception& e){ + // std::cout << __FILE__ << ":" << __LINE__ << "\t future_b " << e.what() << std::endl; + // throw; + // } + + + // try{ + // future_c.wait(); + // } + // catch(std::exception& e){ + // std::cout << __FILE__ << ":" << __LINE__ << "\t future_c " << e.what() << std::endl; + // throw; + // } + + std::vector temp(array_size,_a); hc::copy(temp.begin(), temp.end(),this->d_a); @@ -93,8 +135,8 @@ void HCStream::init_arrays(T _a, T _b, T _c) template void HCStream::read_arrays(std::vector& a, std::vector& b, std::vector& c) { + // Copy device memory to host - std::cout << __FILE__ << ":" << __LINE__ << "\t" << "read arrays\n"; hc::copy(d_a,a.begin()); hc::copy(d_b,b.begin()); hc::copy(d_c,c.begin()); @@ -105,20 +147,19 @@ template void HCStream::copy() { - std::cout << __FILE__ << ":" << __LINE__ << "\t" << "copy\n"; - hc::array& device_a = this->d_a; - hc::array& device_c = this->d_c; + hc::array_view view_a = this->d_a; + hc::array_view view_c = this->d_c; try{ // launch a GPU kernel to compute the saxpy in parallel hc::completion_future future_kernel = hc::parallel_for_each(hc::extent<1>(array_size) - , [&](hc::index<1> index) [[hc]] { - device_c[index] = device_a[index]; + , [=](hc::index<1> index) [[hc]] { + view_c[index] = view_a[index]; }); future_kernel.wait(); } catch(std::exception& e){ - std::cout << __FILE__ << ":" << __LINE__ << "\t" << e.what() << std::endl; + std::cerr << __FILE__ << ":" << __LINE__ << "\t" << e.what() << std::endl; throw; } } @@ -126,21 +167,21 @@ void HCStream::copy() template void HCStream::mul() { - std::cout << __FILE__ << ":" << __LINE__ << "\t" << "mul\n"; + const T scalar = 0.3; - hc::array& device_b = this->d_b; - hc::array& device_c = this->d_c; + hc::array_view view_b = this->d_b; + hc::array_view view_c = this->d_c; try{ // launch a GPU kernel to compute the saxpy in parallel hc::completion_future future_kernel = hc::parallel_for_each(hc::extent<1>(array_size) - , [&](hc::index<1> i) [[hc]] { - device_b[i] = scalar*device_c[i]; + , [=](hc::index<1> i) [[hc]] { + view_b[i] = scalar*view_c[i]; }); future_kernel.wait(); } catch(std::exception& e){ - std::cout << __FILE__ << ":" << __LINE__ << "\t" << e.what() << std::endl; + std::cerr << __FILE__ << ":" << __LINE__ << "\t" << e.what() << std::endl; throw; } } @@ -148,21 +189,22 @@ void HCStream::mul() template void HCStream::add() { - std::cout << __FILE__ << ":" << __LINE__ << "\t" << "add\n"; - hc::array& device_a = this->d_a; - hc::array& device_b = this->d_b; - hc::array& device_c = this->d_c; + + + hc::array_view view_a(this->d_a); + hc::array_view view_b(this->d_b); + hc::array_view view_c(this->d_c); try{ // launch a GPU kernel to compute the saxpy in parallel hc::completion_future future_kernel = hc::parallel_for_each(hc::extent<1>(array_size) - , [&](hc::index<1> i) [[hc]] { - device_c[i] = device_a[i]+device_b[i]; + , [=](hc::index<1> i) [[hc]] { + view_c[i] = view_a[i]+view_b[i]; }); future_kernel.wait(); } catch(std::exception& e){ - std::cout << __FILE__ << ":" << __LINE__ << "\t" << e.what() << std::endl; + std::cerr << __FILE__ << ":" << __LINE__ << "\t" << e.what() << std::endl; throw; } } @@ -170,22 +212,22 @@ void HCStream::add() template void HCStream::triad() { - std::cout << __FILE__ << ":" << __LINE__ << "\t" << "triad\n"; + const T scalar = 0.3; - hc::array& device_a = this->d_a; - hc::array& device_b = this->d_b; - hc::array& device_c = this->d_c; + hc::array_view view_a(this->d_a); + hc::array_view view_b(this->d_b); + hc::array_view view_c(this->d_c); try{ // launch a GPU kernel to compute the saxpy in parallel hc::completion_future future_kernel = hc::parallel_for_each(hc::extent<1>(array_size) - , [&](hc::index<1> i) [[hc]] { - device_a[i] = device_b[i] + scalar*device_c[i]; + , [=](hc::index<1> i) [[hc]] { + view_a[i] = view_b[i] + scalar*view_c[i]; }); future_kernel.wait(); } catch(std::exception& e){ - std::cout << __FILE__ << ":" << __LINE__ << "\t" << e.what() << std::endl; + std::cerr << __FILE__ << ":" << __LINE__ << "\t" << e.what() << std::endl; throw; } } @@ -193,7 +235,7 @@ void HCStream::triad() template T HCStream::dot() { - std::cout << __FILE__ << ":" << __LINE__ << "\t" << "dot\n"; + hc::array_view view_a(this->d_a); hc::array_view view_p(this->d_b); @@ -202,22 +244,20 @@ T HCStream::dot() try{ // launch a GPU kernel to compute the saxpy in parallel hc::completion_future future_kernel = hc::parallel_for_each(view_a.get_extent(), - [&](hc::index<1> i) [[hc]] { + [=](hc::index<1> i) [[hc]] { view_p[i] = view_p[i]*view_a[i]; }); future_kernel.wait(); } catch(std::exception& e){ - std::cout << __FILE__ << ":" << __LINE__ << "\t" << e.what() << std::endl; + std::cerr << __FILE__ << ":" << __LINE__ << "\t" << e.what() << std::endl; throw; } - std::cout << __FILE__ << ":" << __LINE__ << "\t" << "dot - for_each done\n"; + std::vector h_product(array_size,sum); hc::copy(view_p,h_product.begin()); - std::cout << __FILE__ << ":" << __LINE__ << "\t" << "dot - copy-out done\n"; - sum = std::accumulate(h_product.begin(), h_product.end(),sum); return sum; From 3fc0b57a2c26eb628094d92abcf70d07749ddf3b Mon Sep 17 00:00:00 2001 From: Peter Steinbach Date: Tue, 28 Feb 2017 13:31:37 +0100 Subject: [PATCH 07/23] do initial assignment through parallel_for_each --- HCStream.cpp | 80 ++++++++++++++++------------------------------------ 1 file changed, 24 insertions(+), 56 deletions(-) diff --git a/HCStream.cpp b/HCStream.cpp index 6c59fe0..5de5fb4 100644 --- a/HCStream.cpp +++ b/HCStream.cpp @@ -78,65 +78,38 @@ HCStream::~HCStream() template void HCStream::init_arrays(T _a, T _b, T _c) { - // hc::array_view view_a(this->d_a); - // hc::array_view view_b(this->d_b); - // hc::array_view view_c(this->d_c); + hc::array_view view_a(this->d_a); + hc::array_view view_b(this->d_b); + hc::array_view view_c(this->d_c); - // hc::completion_future future_a= hc::parallel_for_each(hc::extent<1>(array_size) - // , [=](hc::index<1> i) [[hc]] { - // view_a[i] = _a; - // }); + hc::completion_future future_a= hc::parallel_for_each(hc::extent<1>(array_size) + , [=](hc::index<1> i) [[hc]] { + view_a[i] = _a; + }); - // hc::completion_future future_b= hc::parallel_for_each(hc::extent<1>(array_size) - // , [=](hc::index<1> i) [[hc]] { - // view_b[i] = _b; - // }); - // hc::completion_future future_c= hc::parallel_for_each(hc::extent<1>(array_size) - // , [=](hc::index<1> i) [[hc]] { - // view_c[i] = _c; - // }); - // try{ - // future_a.wait(); - // } - // catch(std::exception& e){ - // std::cout << __FILE__ << ":" << __LINE__ << "\t future_a " << e.what() << std::endl; - // throw; - // } - - // try{ - // future_b.wait(); - // } - // catch(std::exception& e){ - // std::cout << __FILE__ << ":" << __LINE__ << "\t future_b " << e.what() << std::endl; - // throw; - // } - - - // try{ - // future_c.wait(); - // } - // catch(std::exception& e){ - // std::cout << __FILE__ << ":" << __LINE__ << "\t future_c " << e.what() << std::endl; - // throw; - // } - - - std::vector temp(array_size,_a); - hc::copy(temp.begin(), temp.end(),this->d_a); - - std::fill(temp.begin(), temp.end(),_b); - hc::copy(temp.begin(), temp.end(),this->d_b); - - std::fill(temp.begin(), temp.end(),_c); - hc::copy(temp.begin(), temp.end(),this->d_c); + hc::completion_future future_b= hc::parallel_for_each(hc::extent<1>(array_size) + , [=](hc::index<1> i) [[hc]] { + view_b[i] = _b; + }); + hc::completion_future future_c= hc::parallel_for_each(hc::extent<1>(array_size) + , [=](hc::index<1> i) [[hc]] { + view_c[i] = _c; + }); + try{ + future_a.wait(); + future_b.wait(); + future_c.wait(); + } + catch(std::exception& e){ + std::cout << __FILE__ << ":" << __LINE__ << "\t future_{a,b,c} " << e.what() << std::endl; + throw; + } } template void HCStream::read_arrays(std::vector& a, std::vector& b, std::vector& c) { - - // Copy device memory to host hc::copy(d_a,a.begin()); hc::copy(d_b,b.begin()); hc::copy(d_c,c.begin()); @@ -151,7 +124,6 @@ void HCStream::copy() hc::array_view view_c = this->d_c; try{ - // launch a GPU kernel to compute the saxpy in parallel hc::completion_future future_kernel = hc::parallel_for_each(hc::extent<1>(array_size) , [=](hc::index<1> index) [[hc]] { view_c[index] = view_a[index]; @@ -173,7 +145,6 @@ void HCStream::mul() hc::array_view view_c = this->d_c; try{ - // launch a GPU kernel to compute the saxpy in parallel hc::completion_future future_kernel = hc::parallel_for_each(hc::extent<1>(array_size) , [=](hc::index<1> i) [[hc]] { view_b[i] = scalar*view_c[i]; @@ -196,7 +167,6 @@ void HCStream::add() hc::array_view view_c(this->d_c); try{ - // launch a GPU kernel to compute the saxpy in parallel hc::completion_future future_kernel = hc::parallel_for_each(hc::extent<1>(array_size) , [=](hc::index<1> i) [[hc]] { view_c[i] = view_a[i]+view_b[i]; @@ -219,7 +189,6 @@ void HCStream::triad() hc::array_view view_c(this->d_c); try{ - // launch a GPU kernel to compute the saxpy in parallel hc::completion_future future_kernel = hc::parallel_for_each(hc::extent<1>(array_size) , [=](hc::index<1> i) [[hc]] { view_a[i] = view_b[i] + scalar*view_c[i]; @@ -242,7 +211,6 @@ T HCStream::dot() T sum = static_cast(0); try{ - // launch a GPU kernel to compute the saxpy in parallel hc::completion_future future_kernel = hc::parallel_for_each(view_a.get_extent(), [=](hc::index<1> i) [[hc]] { view_p[i] = view_p[i]*view_a[i]; From 0535cbcd5b9c4fccf6d46480bf6a4cf3839de322 Mon Sep 17 00:00:00 2001 From: Peter Steinbach Date: Thu, 23 Mar 2017 15:55:23 +0100 Subject: [PATCH 08/23] renamed variables and introduced views --- HCStream.cpp | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) diff --git a/HCStream.cpp b/HCStream.cpp index 5de5fb4..10000b2 100644 --- a/HCStream.cpp +++ b/HCStream.cpp @@ -206,14 +206,18 @@ T HCStream::dot() { hc::array_view view_a(this->d_a); - hc::array_view view_p(this->d_b); + hc::array_view view_b(this->d_b); + hc::array d_sum(array_view); + hc::array_view view_s(d_sum) ; - T sum = static_cast(0); + auto ex = view_a.get_extent(); + hc::tiled_extent<1> tiled_ex = ex.tile(64); try{ - hc::completion_future future_kernel = hc::parallel_for_each(view_a.get_extent(), + hc::completion_future future_kernel = hc::parallel_for_each(tiled_ex, [=](hc::index<1> i) [[hc]] { - view_p[i] = view_p[i]*view_a[i]; + + view_s[i] = view_p[i]*view_a[i]; }); future_kernel.wait(); } @@ -222,7 +226,7 @@ T HCStream::dot() throw; } - + T sum = 0; std::vector h_product(array_size,sum); hc::copy(view_p,h_product.begin()); From 96bc566ce13494a0d237db32964a9b3af6ebfd4e Mon Sep 17 00:00:00 2001 From: Peter Steinbach Date: Fri, 24 Mar 2017 15:19:22 +0100 Subject: [PATCH 09/23] added debug flag --- HC.make | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/HC.make b/HC.make index 3ec099e..7a37576 100644 --- a/HC.make +++ b/HC.make @@ -6,7 +6,7 @@ HCC = hcc CXXFLAGS+=-hc -stdlib=libc++ -I/opt/rocm/hcc-lc/include -LDFLAGS+=-hc -L/opt/rocm/hcc-lc/lib -Wl,--rpath=/opt/rocm/hcc-lc/lib -lc++ -lc++abi -ldl -Wl,--whole-archive -lmcwamp -Wl,--no-whole-archive +LDFLAGS+=-g -fstandalone-debug -hc -L/opt/rocm/hcc-lc/lib -Wl,--rpath=/opt/rocm/hcc-lc/lib -lc++ -lc++abi -ldl -Wl,--whole-archive -lmcwamp -Wl,--no-whole-archive hc-stream: main.cpp HCStream.cpp $(HCC) $(CXXFLAGS) -DHC $^ $(LDFLAGS) $(EXTRA_FLAGS) -o $@ From 0e45f8658839e0cd8bd1c3951e0a9004dd3a76d7 Mon Sep 17 00:00:00 2001 From: Peter Steinbach Date: Fri, 24 Mar 2017 15:19:48 +0100 Subject: [PATCH 10/23] added cascaded reduction based on C++AMP book --- HCStream.cpp | 73 ++++++++++++++++++++++++++++++++++++++++++---------- 1 file changed, 59 insertions(+), 14 deletions(-) diff --git a/HCStream.cpp b/HCStream.cpp index 10000b2..4513814 100644 --- a/HCStream.cpp +++ b/HCStream.cpp @@ -205,34 +205,79 @@ template T HCStream::dot() { + //implementation adapted from + //https://ampbook.codeplex.com/SourceControl/latest + // ->Samples/CaseStudies/Reduction + // ->CascadingReduction.h + hc::array_view view_a(this->d_a); hc::array_view view_b(this->d_b); - hc::array d_sum(array_view); - hc::array_view view_s(d_sum) ; auto ex = view_a.get_extent(); - hc::tiled_extent<1> tiled_ex = ex.tile(64); + hc::tiled_extent<1> tiled_ex = ex.tile(TBSIZE); + + const size_t n_tiles = 64; + const size_t n_elements = array_size; + // hc::array d_product(array_size); + // hc::array_view view_p(d_product) ; + + hc::array partial(n_tiles*TBSIZE); + hc::array_view partialv(partial) ; + + hc::completion_future dot_kernel = hc::parallel_for_each(tiled_ex, + [=](hc::tiled_index<1> tidx) [[hc]] { + + std::size_t tid = tidx.local[0];//index in the tile + + tile_static T tileData[TBSIZE]; + + std::size_t i = (tidx.tile[0] * 2 * TBSIZE) + tid; + std::size_t stride = TBSIZE * 2 * n_tiles; + + // Load and add many elements, rather than just two + T sum = 0; + do + { + T near = view_a[i]*view_b[i]; + T far = view_a[i+TBSIZE]*view_b[i+TBSIZE]; + sum += (far + near); + i += stride; + } + while (i < n_elements); + tileData[tid] = sum; + + tidx.barrier.wait(); + + // Reduce values for data on this tile + for (stride = (TBSIZE / 2); stride > 0; stride >>= 1) + { + // Remember that this is a branch within a loop and all threads will have to execute + // this but only threads with a tid < stride will do useful work. + if (tid < stride) + tileData[tid] += tileData[tid + stride]; + + tidx.barrier.wait_with_tile_static_memory_fence(); + } + + // Write the result for this tile back to global memory + if (tid == 0) + partialv[tidx.tile[0]] = tileData[tid]; + }); try{ - hc::completion_future future_kernel = hc::parallel_for_each(tiled_ex, - [=](hc::index<1> i) [[hc]] { - view_s[i] = view_p[i]*view_a[i]; - }); - future_kernel.wait(); + dot_kernel.wait(); } catch(std::exception& e){ std::cerr << __FILE__ << ":" << __LINE__ << "\t" << e.what() << std::endl; throw; } - T sum = 0; - std::vector h_product(array_size,sum); - hc::copy(view_p,h_product.begin()); + std::vector h_partial(n_tiles); + hc::copy(partial, h_partial.begin()); + T result = std::accumulate(h_partial.begin(), h_partial.end(), 0.); - sum = std::accumulate(h_product.begin(), h_product.end(),sum); - - return sum; + return result; } From 55f467e24dd9e54268d1e4f090f974b599c20977 Mon Sep 17 00:00:00 2001 From: Peter Steinbach Date: Mon, 27 Mar 2017 14:22:56 +0200 Subject: [PATCH 11/23] moved experimental dot product implementation of dot_impl which is build only if -DHC_DEVELOP is given --- HC.make | 2 -- HCStream.cpp | 11 ++++++++++- HCStream.h | 1 + 3 files changed, 11 insertions(+), 3 deletions(-) diff --git a/HC.make b/HC.make index 7a37576..1694a35 100644 --- a/HC.make +++ b/HC.make @@ -3,8 +3,6 @@ HCC = hcc - - CXXFLAGS+=-hc -stdlib=libc++ -I/opt/rocm/hcc-lc/include LDFLAGS+=-g -fstandalone-debug -hc -L/opt/rocm/hcc-lc/lib -Wl,--rpath=/opt/rocm/hcc-lc/lib -lc++ -lc++abi -ldl -Wl,--whole-archive -lmcwamp -Wl,--no-whole-archive diff --git a/HCStream.cpp b/HCStream.cpp index 4513814..659cee1 100644 --- a/HCStream.cpp +++ b/HCStream.cpp @@ -202,7 +202,7 @@ void HCStream::triad() } template -T HCStream::dot() +T HCStream::dot_impl() { //implementation adapted from @@ -280,6 +280,15 @@ T HCStream::dot() return result; } +template +T HCStream::dot() +{ + #ifdef HC_DEVELOP + return dot_impl(); + #else + return 0.; + #endif +} template class HCStream; template class HCStream; diff --git a/HCStream.h b/HCStream.h index 950b370..4bc2b18 100644 --- a/HCStream.h +++ b/HCStream.h @@ -38,6 +38,7 @@ public: virtual void mul() override; virtual void triad() override; virtual T dot() override; + T dot_impl(); virtual void init_arrays(T initA, T initB, T initC) override; virtual void read_arrays(std::vector& a, std::vector& b, std::vector& c) override; From fd35d895d9b84d7c8e74ec25ea98b509e2652167 Mon Sep 17 00:00:00 2001 From: Peter Steinbach Date: Mon, 3 Apr 2017 14:16:06 +0200 Subject: [PATCH 12/23] added optimized flags to CXXFLAGS --- HC.make | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/HC.make b/HC.make index 1694a35..5f9deb2 100644 --- a/HC.make +++ b/HC.make @@ -3,8 +3,8 @@ HCC = hcc -CXXFLAGS+=-hc -stdlib=libc++ -I/opt/rocm/hcc-lc/include -LDFLAGS+=-g -fstandalone-debug -hc -L/opt/rocm/hcc-lc/lib -Wl,--rpath=/opt/rocm/hcc-lc/lib -lc++ -lc++abi -ldl -Wl,--whole-archive -lmcwamp -Wl,--no-whole-archive +CXXFLAGS+=-O3 -hc -stdlib=libc++ -I/opt/rocm/hcc-lc/include +LDFLAGS+=-L/opt/rocm/hcc-lc/lib -Wl,--rpath=/opt/rocm/hcc-lc/lib -lc++ -lc++abi -ldl -Wl,--whole-archive -lmcwamp -Wl,--no-whole-archive hc-stream: main.cpp HCStream.cpp $(HCC) $(CXXFLAGS) -DHC $^ $(LDFLAGS) $(EXTRA_FLAGS) -o $@ From 04589d4d4fe5f3099a335f0b59174cc0c2d94653 Mon Sep 17 00:00:00 2001 From: Peter Steinbach Date: Mon, 3 Apr 2017 14:16:25 +0200 Subject: [PATCH 13/23] added fixed bug in dot product --- HCStream.cpp | 161 ++++++++++++++++++++++----------------------------- 1 file changed, 68 insertions(+), 93 deletions(-) diff --git a/HCStream.cpp b/HCStream.cpp index 659cee1..3c2b9c8 100644 --- a/HCStream.cpp +++ b/HCStream.cpp @@ -3,14 +3,13 @@ // For full license terms please see the LICENSE file distributed with this // source code +#include "HCStream.h" #include #include #include #include -#include "HCStream.h" - #define TBSIZE 1024 std::string getDeviceName(const hc::accelerator& _acc) @@ -101,7 +100,7 @@ void HCStream::init_arrays(T _a, T _b, T _c) future_c.wait(); } catch(std::exception& e){ - std::cout << __FILE__ << ":" << __LINE__ << "\t future_{a,b,c} " << e.what() << std::endl; + std::cerr << __FILE__ << ":" << __LINE__ << "\t HCStream::init_arrays " << e.what() << std::endl; throw; } @@ -131,7 +130,7 @@ void HCStream::copy() future_kernel.wait(); } catch(std::exception& e){ - std::cerr << __FILE__ << ":" << __LINE__ << "\t" << e.what() << std::endl; + std::cerr << __FILE__ << ":" << __LINE__ << "\t HCStream::copy " << e.what() << std::endl; throw; } } @@ -140,7 +139,7 @@ template void HCStream::mul() { - const T scalar = 0.3; + const T scalar = startScalar; hc::array_view view_b = this->d_b; hc::array_view view_c = this->d_c; @@ -152,7 +151,7 @@ void HCStream::mul() future_kernel.wait(); } catch(std::exception& e){ - std::cerr << __FILE__ << ":" << __LINE__ << "\t" << e.what() << std::endl; + std::cerr << __FILE__ << ":" << __LINE__ << "\t HCStream::copy " << e.what() << std::endl; throw; } } @@ -174,7 +173,7 @@ void HCStream::add() future_kernel.wait(); } catch(std::exception& e){ - std::cerr << __FILE__ << ":" << __LINE__ << "\t" << e.what() << std::endl; + std::cerr << __FILE__ << ":" << __LINE__ << "\t HCStream::copy " << e.what() << std::endl; throw; } } @@ -183,7 +182,7 @@ template void HCStream::triad() { - const T scalar = 0.3; + const T scalar = startScalar; hc::array_view view_a(this->d_a); hc::array_view view_b(this->d_b); hc::array_view view_c(this->d_c); @@ -196,98 +195,74 @@ void HCStream::triad() future_kernel.wait(); } catch(std::exception& e){ - std::cerr << __FILE__ << ":" << __LINE__ << "\t" << e.what() << std::endl; + std::cerr << __FILE__ << ":" << __LINE__ << "\t HCStream::copy " << e.what() << std::endl; throw; } } -template -T HCStream::dot_impl() -{ - - //implementation adapted from - //https://ampbook.codeplex.com/SourceControl/latest - // ->Samples/CaseStudies/Reduction - // ->CascadingReduction.h - - hc::array_view view_a(this->d_a); - hc::array_view view_b(this->d_b); - - auto ex = view_a.get_extent(); - hc::tiled_extent<1> tiled_ex = ex.tile(TBSIZE); - - const size_t n_tiles = 64; - const size_t n_elements = array_size; - // hc::array d_product(array_size); - // hc::array_view view_p(d_product) ; - - hc::array partial(n_tiles*TBSIZE); - hc::array_view partialv(partial) ; - - hc::completion_future dot_kernel = hc::parallel_for_each(tiled_ex, - [=](hc::tiled_index<1> tidx) [[hc]] { - - std::size_t tid = tidx.local[0];//index in the tile - - tile_static T tileData[TBSIZE]; - - std::size_t i = (tidx.tile[0] * 2 * TBSIZE) + tid; - std::size_t stride = TBSIZE * 2 * n_tiles; - - // Load and add many elements, rather than just two - T sum = 0; - do - { - T near = view_a[i]*view_b[i]; - T far = view_a[i+TBSIZE]*view_b[i+TBSIZE]; - sum += (far + near); - i += stride; - } - while (i < n_elements); - tileData[tid] = sum; - - tidx.barrier.wait(); - - // Reduce values for data on this tile - for (stride = (TBSIZE / 2); stride > 0; stride >>= 1) - { - // Remember that this is a branch within a loop and all threads will have to execute - // this but only threads with a tid < stride will do useful work. - if (tid < stride) - tileData[tid] += tileData[tid + stride]; - - tidx.barrier.wait_with_tile_static_memory_fence(); - } - - // Write the result for this tile back to global memory - if (tid == 0) - partialv[tidx.tile[0]] = tileData[tid]; - }); - - try{ - - dot_kernel.wait(); - } - catch(std::exception& e){ - std::cerr << __FILE__ << ":" << __LINE__ << "\t" << e.what() << std::endl; - throw; - } - - std::vector h_partial(n_tiles); - hc::copy(partial, h_partial.begin()); - T result = std::accumulate(h_partial.begin(), h_partial.end(), 0.); - - return result; -} - template T HCStream::dot() { - #ifdef HC_DEVELOP - return dot_impl(); - #else - return 0.; - #endif + //implementation adapted from + //https://ampbook.codeplex.com/SourceControl/latest + // ->Samples/CaseStudies/Reduction + // ->CascadingReduction.h + + static constexpr std::size_t n_tiles = 64; + + const auto& view_a = this->d_a; + const auto& view_b = this->d_b; + + auto ex = view_a.get_extent(); + const auto tiled_ex = hc::extent<1>(n_tiles * TBSIZE).tile(TBSIZE); + const auto domain_sz = tiled_ex.size(); + + hc::array partial(n_tiles); + + hc::parallel_for_each(tiled_ex, + [=, + &view_a, + &view_b, + &partial](const hc::tiled_index<1>& tidx) [[hc]] { + + auto gidx = tidx.global[0]; + T r = T{0}; // Assumes reduction op is addition. + while (gidx < view_a.get_extent().size()) { + r += view_a[gidx] * view_b[gidx]; + gidx += domain_sz; + } + + tile_static T tileData[TBSIZE]; + tileData[tidx.local[0]] = r; + + tidx.barrier.wait_with_tile_static_memory_fence(); + + for (auto h = TBSIZE / 2; h; h /= 2) { + if (tidx.local[0] < h) { + tileData[tidx.local[0]] += tileData[tidx.local[0] + h]; + } + tidx.barrier.wait_with_tile_static_memory_fence(); + } + + if (tidx.global == tidx.tile_origin) partial[tidx.tile] = tileData[0]; + }); + + try { + partial.get_accelerator_view().wait(); + } + catch (std::exception& e) { + std::cerr << __FILE__ << ":" << __LINE__ << "\t HCStream::dot " << e.what() << std::endl; + throw; + } + + std::vector h_partial(n_tiles,0); + hc::copy(partial,h_partial.begin()); + + T result = std::accumulate(h_partial.begin(), h_partial.end(), 0.); + + return result; + + } template class HCStream; From b1fc309f3ac9953ddf3e8f93c2ae60ddb7ae9c66 Mon Sep 17 00:00:00 2001 From: Peter Steinbach Date: Tue, 18 Apr 2017 14:57:35 +0200 Subject: [PATCH 14/23] removed obsolete comment --- HC.make | 2 -- 1 file changed, 2 deletions(-) diff --git a/HC.make b/HC.make index 5f9deb2..72eb68a 100644 --- a/HC.make +++ b/HC.make @@ -1,6 +1,4 @@ -# TODO: HC with HCC - HCC = hcc CXXFLAGS+=-O3 -hc -stdlib=libc++ -I/opt/rocm/hcc-lc/include From f6d6874c04368a793f14939008740bc03de01901 Mon Sep 17 00:00:00 2001 From: Peter Steinbach Date: Tue, 18 Apr 2017 14:58:34 +0200 Subject: [PATCH 15/23] dropped obsolete comments --- HCStream.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/HCStream.cpp b/HCStream.cpp index 3c2b9c8..0b104a3 100644 --- a/HCStream.cpp +++ b/HCStream.cpp @@ -58,7 +58,7 @@ HCStream::HCStream(const unsigned int ARRAY_SIZE, const int device_index): throw std::runtime_error(ss.str()); } - // // Set device + // Set device std::vector accs = hc::accelerator::get_all(); auto current = accs[device_index]; From a971591cc928b904107d114a316f169ae1629314 Mon Sep 17 00:00:00 2001 From: Peter Steinbach Date: Tue, 18 Apr 2017 15:00:29 +0200 Subject: [PATCH 16/23] replaced operator[] by .at method to trigger an exception in case of out-of-range errors --- HCStream.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/HCStream.cpp b/HCStream.cpp index 0b104a3..b574f57 100644 --- a/HCStream.cpp +++ b/HCStream.cpp @@ -60,7 +60,7 @@ HCStream::HCStream(const unsigned int ARRAY_SIZE, const int device_index): // Set device std::vector accs = hc::accelerator::get_all(); - auto current = accs[device_index]; + auto current = accs.at(device_index); hc::accelerator::set_default(current.get_device_path()); From c0559c2d863d0aa338bcb941b2ae460af03c9ff6 Mon Sep 17 00:00:00 2001 From: Peter Steinbach Date: Tue, 18 Apr 2017 15:03:24 +0200 Subject: [PATCH 17/23] fixed copy and paste error in error messages --- HCStream.cpp | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/HCStream.cpp b/HCStream.cpp index b574f57..1fa620b 100644 --- a/HCStream.cpp +++ b/HCStream.cpp @@ -151,7 +151,7 @@ void HCStream::mul() future_kernel.wait(); } catch(std::exception& e){ - std::cerr << __FILE__ << ":" << __LINE__ << "\t HCStream::copy " << e.what() << std::endl; + std::cerr << __FILE__ << ":" << __LINE__ << "\t HCStream::mul " << e.what() << std::endl; throw; } } @@ -173,7 +173,7 @@ void HCStream::add() future_kernel.wait(); } catch(std::exception& e){ - std::cerr << __FILE__ << ":" << __LINE__ << "\t HCStream::copy " << e.what() << std::endl; + std::cerr << __FILE__ << ":" << __LINE__ << "\t HCStream::add " << e.what() << std::endl; throw; } } @@ -195,7 +195,7 @@ void HCStream::triad() future_kernel.wait(); } catch(std::exception& e){ - std::cerr << __FILE__ << ":" << __LINE__ << "\t HCStream::copy " << e.what() << std::endl; + std::cerr << __FILE__ << ":" << __LINE__ << "\t HCStream::triad " << e.what() << std::endl; throw; } } From 0a8176e2730660858df2b02ee906369e2a80feb9 Mon Sep 17 00:00:00 2001 From: Peter Steinbach Date: Thu, 4 May 2017 09:22:10 +0200 Subject: [PATCH 18/23] bumped to rocm 1.5 --- HC.make | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/HC.make b/HC.make index 72eb68a..f4a9d41 100644 --- a/HC.make +++ b/HC.make @@ -1,8 +1,8 @@ HCC = hcc -CXXFLAGS+=-O3 -hc -stdlib=libc++ -I/opt/rocm/hcc-lc/include -LDFLAGS+=-L/opt/rocm/hcc-lc/lib -Wl,--rpath=/opt/rocm/hcc-lc/lib -lc++ -lc++abi -ldl -Wl,--whole-archive -lmcwamp -Wl,--no-whole-archive +CXXFLAGS+=-O3 -hc -I/opt/rocm/hcc/include #-stdlib=libc++ +LDFLAGS+=-L/opt/rocm/hcc/lib -Wl,--whole-archive -lmcwamp -Wl,--no-whole-archive -ldl #-Wl,--rpath=/opt/rocm/hcc/lib -lc++ -lc++abi -ldl hc-stream: main.cpp HCStream.cpp $(HCC) $(CXXFLAGS) -DHC $^ $(LDFLAGS) $(EXTRA_FLAGS) -o $@ From bfe8996775c3c3138484862be07bb7702b5266cd Mon Sep 17 00:00:00 2001 From: Peter Steinbach Date: Thu, 4 May 2017 09:30:51 +0200 Subject: [PATCH 19/23] switched to using hcc-config directly --- HC.make | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/HC.make b/HC.make index f4a9d41..97c7701 100644 --- a/HC.make +++ b/HC.make @@ -1,8 +1,8 @@ HCC = hcc -CXXFLAGS+=-O3 -hc -I/opt/rocm/hcc/include #-stdlib=libc++ -LDFLAGS+=-L/opt/rocm/hcc/lib -Wl,--whole-archive -lmcwamp -Wl,--no-whole-archive -ldl #-Wl,--rpath=/opt/rocm/hcc/lib -lc++ -lc++abi -ldl +CXXFLAGS+=-O3 $(shell hcc-config --cxxflags) +LDFLAGS+=$(shell hcc-config --ldflags) hc-stream: main.cpp HCStream.cpp $(HCC) $(CXXFLAGS) -DHC $^ $(LDFLAGS) $(EXTRA_FLAGS) -o $@ From 78a4f0cb7348615339b9e219da0f4ead642f7092 Mon Sep 17 00:00:00 2001 From: Peter Steinbach Date: Tue, 25 Jul 2017 17:04:51 +0200 Subject: [PATCH 20/23] upon call of HC.make, the size of the virtual size can be set --- HC.make | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/HC.make b/HC.make index 97c7701..6ac5301 100644 --- a/HC.make +++ b/HC.make @@ -4,8 +4,12 @@ HCC = hcc CXXFLAGS+=-O3 $(shell hcc-config --cxxflags) LDFLAGS+=$(shell hcc-config --ldflags) +ifdef TBSIZE +CXXFLAGS+=-DVIRTUALTILESIZE=$(TBSIZE) +endif + hc-stream: main.cpp HCStream.cpp - $(HCC) $(CXXFLAGS) -DHC $^ $(LDFLAGS) $(EXTRA_FLAGS) -o $@ + $(HCC) $(CXXFLAGS) -DHC $^ $(LDFLAGS) $(EXTRA_FLAGS) -o $@ .PHONY: clean clean: From 6712e260356a34025a57acf8d64d6ba5145da2cb Mon Sep 17 00:00:00 2001 From: Peter Steinbach Date: Tue, 25 Jul 2017 17:05:22 +0200 Subject: [PATCH 21/23] added ifndef for virtual tile size (default value found empirically on R9 Nano card) --- HCStream.cpp | 16 +++++++++------- 1 file changed, 9 insertions(+), 7 deletions(-) diff --git a/HCStream.cpp b/HCStream.cpp index 1fa620b..47602fc 100644 --- a/HCStream.cpp +++ b/HCStream.cpp @@ -10,7 +10,9 @@ #include #include -#define TBSIZE 1024 +#ifndef VIRTUALTILESIZE +#define VIRTUALTILESIZE 1024 +#endif std::string getDeviceName(const hc::accelerator& _acc) { @@ -50,11 +52,11 @@ HCStream::HCStream(const unsigned int ARRAY_SIZE, const int device_index): d_c(ARRAY_SIZE) { - // The array size must be divisible by TBSIZE for kernel launches - if (ARRAY_SIZE % TBSIZE != 0) + // The array size must be divisible by VIRTUALTILESIZE for kernel launches + if (ARRAY_SIZE % VIRTUALTILESIZE != 0) { std::stringstream ss; - ss << "Array size must be a multiple of " << TBSIZE; + ss << "Array size must be a multiple of " << VIRTUALTILESIZE; throw std::runtime_error(ss.str()); } @@ -214,7 +216,7 @@ T HCStream::dot() const auto& view_b = this->d_b; auto ex = view_a.get_extent(); - const auto tiled_ex = hc::extent<1>(n_tiles * TBSIZE).tile(TBSIZE); + const auto tiled_ex = hc::extent<1>(n_tiles * VIRTUALTILESIZE).tile(VIRTUALTILESIZE); const auto domain_sz = tiled_ex.size(); hc::array partial(n_tiles); @@ -232,12 +234,12 @@ T HCStream::dot() gidx += domain_sz; } - tile_static T tileData[TBSIZE]; + tile_static T tileData[VIRTUALTILESIZE]; tileData[tidx.local[0]] = r; tidx.barrier.wait_with_tile_static_memory_fence(); - for (auto h = TBSIZE / 2; h; h /= 2) { + for (auto h = VIRTUALTILESIZE / 2; h; h /= 2) { if (tidx.local[0] < h) { tileData[tidx.local[0]] += tileData[tidx.local[0] + h]; } From 8509917dff738a5bfd26a3572cff72a94c8d15aa Mon Sep 17 00:00:00 2001 From: Peter Steinbach Date: Mon, 31 Jul 2017 14:20:59 +0200 Subject: [PATCH 22/23] refactored n_tiles into preprocessor macro --- HCStream.cpp | 19 ++++++++++++------- 1 file changed, 12 insertions(+), 7 deletions(-) diff --git a/HCStream.cpp b/HCStream.cpp index 47602fc..b1b4a9b 100644 --- a/HCStream.cpp +++ b/HCStream.cpp @@ -1,4 +1,4 @@ -// Copyright (c) 2015-16 Peter Steinbach, MPI CBG Scientific Computing Facility +// Copyright (c) 2017 Peter Steinbach, MPI CBG Scientific Computing Facility // // For full license terms please see the LICENSE file distributed with this // source code @@ -10,10 +10,17 @@ #include #include +//specific sizes were obtained through experimentation using a Fiji R9 Nano with rocm 1.6-115 #ifndef VIRTUALTILESIZE -#define VIRTUALTILESIZE 1024 +#define VIRTUALTILESIZE 256 #endif +//specific sizes were obtained through experimentation using a Fiji R9 Nano with rocm 1.6-115 +#ifndef NTILES +#define NTILES 2048 +#endif + + std::string getDeviceName(const hc::accelerator& _acc) { std::wstring_convert, wchar_t> converter; @@ -210,16 +217,14 @@ T HCStream::dot() // ->Samples/CaseStudies/Reduction // ->CascadingReduction.h - static constexpr std::size_t n_tiles = 64; - const auto& view_a = this->d_a; const auto& view_b = this->d_b; auto ex = view_a.get_extent(); - const auto tiled_ex = hc::extent<1>(n_tiles * VIRTUALTILESIZE).tile(VIRTUALTILESIZE); + const auto tiled_ex = hc::extent<1>(NTILES * VIRTUALTILESIZE).tile(VIRTUALTILESIZE); const auto domain_sz = tiled_ex.size(); - hc::array partial(n_tiles); + hc::array partial(NTILES); hc::parallel_for_each(tiled_ex, [=, @@ -257,7 +262,7 @@ T HCStream::dot() throw; } - std::vector h_partial(n_tiles,0); + std::vector h_partial(NTILES,0); hc::copy(partial,h_partial.begin()); T result = std::accumulate(h_partial.begin(), h_partial.end(), 0.); From 3b9052664f9e8e147e7998e2061c4736239253f7 Mon Sep 17 00:00:00 2001 From: Peter Steinbach Date: Mon, 31 Jul 2017 14:21:16 +0200 Subject: [PATCH 23/23] enable propagation of preprocessor macros from CLI to compiler command --- HC.make | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/HC.make b/HC.make index 6ac5301..a79acf4 100644 --- a/HC.make +++ b/HC.make @@ -8,6 +8,11 @@ ifdef TBSIZE CXXFLAGS+=-DVIRTUALTILESIZE=$(TBSIZE) endif +ifdef NTILES +CXXFLAGS+=-DNTILES=$(TBSIZE) +endif + + hc-stream: main.cpp HCStream.cpp $(HCC) $(CXXFLAGS) -DHC $^ $(LDFLAGS) $(EXTRA_FLAGS) -o $@