diff --git a/CMakeLists.txt b/CMakeLists.txt index 5551c54..c2bc5d7 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -58,24 +58,6 @@ else() message("Skipping HIP...") endif() -#------------------------------------------------------------------------------- -# HCC -#------------------------------------------------------------------------------- -find_program(HCC_BINARY hcc HINTS ${HCC_PATH}/bin PATHS ${HCC_PATH}/bin) -if(EXISTS ${HCC_BINARY}) - #can the following be tied to the target only? - set(CMAKE_CXX_COMPILER ${HCC_BINARY}) - # list(APPEND CMAKE_CXX_FLAGS -hc -I /usr/include/c++/v1 -std=c++11 -stdlib=libc++ -I/opt/rocm/hcc-lc/include) - # list(APPEND CMAKE_EXE_LINKER_FLAGS -hc -std=c++11 -L/opt/rocm/hcc-lc/lib -Wl,--rpath=/opt/rocm/hcc-lc/lib -lc++ -lc++abi -ldl -lpthread -Wl,--whole-archive -lmcwamp -Wl,--no-whole-archive -mcmodel=small ) - add_executable(gpu-stream-hc main.cpp HCStream.cpp) - set_target_properties(gpu-stream-hc PROPERTIES - COMPILE_FLAGS "-hc -I /usr/include/c++/v1 -std=c++11 -stdlib=libc++ -I/opt/rocm/hcc-lc/include" - LINK_FLAGS "-hc -std=c++11 -L/opt/rocm/hcc-lc/lib -Wl,--rpath=/opt/rocm/hcc-lc/lib -lc++ -lc++abi -ldl -lpthread -Wl,--whole-archive -lmcwamp -Wl,--no-whole-archive -mcmodel=small" - ) - target_compile_definitions(gpu-stream-hc PUBLIC HC) -else() - message("Skipping HC...") -endif() #------------------------------------------------------------------------------- @@ -110,6 +92,9 @@ else () message("Skipping OpenCL...") endif () + + + #------------------------------------------------------------------------------- # OpenACC #------------------------------------------------------------------------------- @@ -187,6 +172,37 @@ else() message("Skipping Kokkos... 
(use -DKOKKOS_PATH=/path/to/kokkos to opt in)") endif() +#------------------------------------------------------------------------------- +# HCC +#------------------------------------------------------------------------------- +find_program(HCC_BINARY hcc HINTS ${HCC_PATH}/bin PATHS ${HCC_PATH}/bin) +if(EXISTS ${HCC_BINARY}) + #can the following be tied to the target only? + set(OLD_CMAKE_CXX_COMPILER ${CMAKE_CXX_COMPILER}) + set(CMAKE_CXX_COMPILER ${HCC_BINARY}) + + # list(APPEND CMAKE_CXX_FLAGS -hc -I /usr/include/c++/v1 -std=c++11 -stdlib=libc++ -I/opt/rocm/hcc-lc/include) + # list(APPEND CMAKE_EXE_LINKER_FLAGS -hc -std=c++11 -L/opt/rocm/hcc-lc/lib -Wl,--rpath=/opt/rocm/hcc-lc/lib -lc++ -lc++abi -ldl -lpthread -Wl,--whole-archive -lmcwamp -Wl,--no-whole-archive -mcmodel=small ) + add_executable(gpu-stream-hc main.cpp HCStream.cpp) + set_target_properties(gpu-stream-hc PROPERTIES + COMPILE_FLAGS "-hc -I/usr/include/c++/v1 -std=c++11 -stdlib=libc++ -I/opt/rocm/hcc-lc/include" + LINK_FLAGS "-hc -std=c++11 -L/opt/rocm/hcc-lc/lib -Wl,--rpath=/opt/rocm/hcc-lc/lib -lc++ -lc++abi -ldl -lpthread -Wl,--whole-archive -lmcwamp -Wl,--no-whole-archive -mcmodel=small" + ) + message("OpenCL magic: ${OpenCL_LIBRARY} ${OpenCL_INCLUDE_DIR}") + # gpu-stream-ocl exists only when OpenCL was found; presumably it is now compiled by hcc too (CMAKE_CXX_COMPILER was replaced above), hence the libc++/ROCm flags. APPEND_STRING keeps the flags one space-separated string. + if(TARGET gpu-stream-ocl) + set_property(TARGET gpu-stream-ocl APPEND_STRING PROPERTY COMPILE_FLAGS " -I/opt/rocm/opencl/include/ -stdlib=libc++ -I/opt/rocm/hcc-lc/include") + set_property(TARGET gpu-stream-ocl APPEND_STRING PROPERTY LINK_FLAGS " -L/opt/rocm/opencl/lib/x86_64 -std=c++11 -L/opt/rocm/hcc-lc/lib -Wl,--rpath=/opt/rocm/hcc-lc/lib -lc++ -lc++abi") + endif() + # set_target_properties(gpu-stream-ocl PROPERTIES + # # COMPILE_FLAGS "-I/opt/rocm/opencl/include/opencl1.2" + # LINK_FLAGS "-I /opt/rocm/opencl/include/opencl1.2 -L /opt/rocm/opencl/lib/x86_64 -lOpenCL -std=c++11" + # ) + target_compile_definitions(gpu-stream-hc PUBLIC HC) +else() + message("Skipping HC...") +endif() + 
#------------------------------------------------------------------------------- # SYCL #------------------------------------------------------------------------------- diff --git a/HCStream.cpp b/HCStream.cpp index 0c55af3..8cea1dc 100644 --- a/HCStream.cpp +++ b/HCStream.cpp @@ -72,7 +72,9 @@ HCStream::HCStream(const unsigned int ARRAY_SIZE, const int device_index): // // Set device std::vector accs = hc::accelerator::get_all(); auto current = accs[device_index]; - + + hc::accelerator::set_default(current.get_device_path()); + std::cout << "Using HC device " << getDeviceName(current) << std::endl; // // The array size must be divisible by TBSIZE for kernel launches @@ -142,11 +144,15 @@ void HCStream::read_arrays(std::vector& a, std::vector& b, std::vector< template void HCStream::copy() { + + auto& device_a = this->d_a; + auto& device_c = this->d_c; + try{ - // launch a GPU kernel to compute the saxpy in parallel + // launch a GPU kernel that copies d_a into d_c in parallel hc::completion_future future_kernel = hc::parallel_for_each(hc::extent<1>(array_size) - , [&](hc::index<1> i) [[hc]] { - d_c[i] = d_a[i]; + , [&](hc::index<1> index) [[hc]] { + device_c[index] = device_a[index]; }); future_kernel.wait(); } @@ -160,11 +166,14 @@ template void HCStream::mul() { const T scalar = 0.3; + auto& device_b = this->d_b; + auto& device_c = this->d_c; + try{ // launch a GPU kernel to compute the saxpy in parallel hc::completion_future future_kernel = hc::parallel_for_each(hc::extent<1>(array_size) , [&](hc::index<1> i) [[hc]] { - d_b[i] = scalar*d_c[i]; + device_b[i] = scalar*device_c[i]; }); future_kernel.wait(); } @@ -177,11 +186,16 @@ void HCStream::mul() template void HCStream::add() { + + auto& device_a = this->d_a; + auto& device_b = this->d_b; + auto& device_c = this->d_c; + try{ // launch a GPU kernel to compute the saxpy in parallel hc::completion_future future_kernel = hc::parallel_for_each(hc::extent<1>(array_size) , 
[&](hc::index<1> i) [[hc]] { - d_c[i] = d_a[i]+d_b[i]; + device_c[i] = device_a[i]+device_b[i]; }); future_kernel.wait(); } @@ -195,11 +209,15 @@ template void HCStream::triad() { const T scalar = 0.3; + auto& device_a = this->d_a; + auto& device_b = this->d_b; + auto& device_c = this->d_c; + try{ // launch a GPU kernel to compute the saxpy in parallel hc::completion_future future_kernel = hc::parallel_for_each(hc::extent<1>(array_size) , [&](hc::index<1> i) [[hc]] { - d_a[i] = d_b[i] + scalar*d_c[i]; + device_a[i] = device_b[i] + scalar*device_c[i]; }); future_kernel.wait(); }