From 369785c96a551693222b476a52be42ebb27933ef Mon Sep 17 00:00:00 2001 From: Tom Lin Date: Mon, 25 Sep 2023 01:41:06 +0100 Subject: [PATCH] Add HIP managed memory support, resolves #162 --- CHANGELOG.md | 1 + src/ci-test-compile.sh | 2 ++ src/hip/HIPStream.cpp | 43 ++++++++++++++++++++++++++++++++++++++---- src/hip/model.cmake | 7 +++++++ 4 files changed, 49 insertions(+), 4 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index c862837..605d327 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -5,6 +5,7 @@ All notable changes to this project will be documented in this file. ### Added - Ability to build Kokkos and RAJA versions against existing packages. - Thrust managed memory. +- HIP managed memory. - New implementation using SYCL2020 USM (sycl2020-acc) and renamed original `sycl2020` to `sycl2020-acc`. ### Changed diff --git a/src/ci-test-compile.sh b/src/ci-test-compile.sh index 610c3f0..a67303c 100755 --- a/src/ci-test-compile.sh +++ b/src/ci-test-compile.sh @@ -289,6 +289,8 @@ build_hip() { local name="hip_build" run_build $name "${HIP_CXX:?}" hip "-DCMAKE_CXX_COMPILER=${HIP_CXX:?}" + run_build $name "${HIP_CXX:?}" hip "-DCMAKE_CXX_COMPILER=${HIP_CXX:?} -DMEM=MANAGED" + run_build $name "${HIP_CXX:?}" hip "-DCMAKE_CXX_COMPILER=${HIP_CXX:?} -DMEM=PAGEFAULT" run_build $name "${GCC_CXX:?}" thrust "-DCMAKE_CXX_COMPILER=${HIP_CXX:?} -DSDK_DIR=$ROCM_PATH -DTHRUST_IMPL=ROCM" } diff --git a/src/hip/HIPStream.cpp b/src/hip/HIPStream.cpp index 0db8485..56c46ed 100644 --- a/src/hip/HIPStream.cpp +++ b/src/hip/HIPStream.cpp @@ -45,11 +45,22 @@ HIPStream::HIPStream(const int ARRAY_SIZE, const int device_index) // Print out device information std::cout << "Using HIP device " << getDeviceName(device_index) << std::endl; std::cout << "Driver: " << getDeviceDriver(device_index) << std::endl; +#if defined(MANAGED) + std::cout << "Memory: MANAGED" << std::endl; +#elif defined(PAGEFAULT) + std::cout << "Memory: PAGEFAULT" << std::endl; +#else + std::cout << "Memory: DEFAULT" << std::endl; +#endif array_size = ARRAY_SIZE; // Round dot_num_blocks up to next multiple of (TBSIZE * dot_elements_per_lane) dot_num_blocks = (array_size + (TBSIZE * dot_elements_per_lane - 1)) / (TBSIZE * dot_elements_per_lane); + size_t array_bytes = sizeof(T); + array_bytes *= ARRAY_SIZE; + size_t total_bytes = array_bytes * 3; + // Allocate the host array for partial sums for dot kernels using hipHostMalloc. // This creates an array on the host which is visible to the device. However, it requires // synchronization (e.g. hipDeviceSynchronize) for the result to be available on the host @@ -63,13 +74,26 @@ HIPStream::HIPStream(const int ARRAY_SIZE, const int device_index) if (props.totalGlobalMem < std::size_t{3}*ARRAY_SIZE*sizeof(T)) throw std::runtime_error("Device does not have enough memory for all 3 buffers"); - // Create device buffers - hipMalloc(&d_a, ARRAY_SIZE*sizeof(T)); + // Create device buffers +#if defined(MANAGED) + hipMallocManaged(&d_a, array_bytes); check_error(); - hipMalloc(&d_b, ARRAY_SIZE*sizeof(T)); + hipMallocManaged(&d_b, array_bytes); check_error(); - hipMalloc(&d_c, ARRAY_SIZE*sizeof(T)); + hipMallocManaged(&d_c, array_bytes); check_error(); +#elif defined(PAGEFAULT) + d_a = (T*)malloc(array_bytes); + d_b = (T*)malloc(array_bytes); + d_c = (T*)malloc(array_bytes); +#else + hipMalloc(&d_a, array_bytes); + check_error(); + hipMalloc(&d_b, array_bytes); + check_error(); + hipMalloc(&d_c, array_bytes); + check_error(); +#endif } @@ -109,13 +133,24 @@ void HIPStream::init_arrays(T initA, T initB, T initC) template void HIPStream::read_arrays(std::vector& a, std::vector& b, std::vector& c) { + // Copy device memory to host +#if defined(PAGEFAULT) || defined(MANAGED) + hipDeviceSynchronize(); + for (int i = 0; i < array_size; i++) + { + a[i] = d_a[i]; + b[i] = d_b[i]; + c[i] = d_c[i]; + } +#else hipMemcpy(a.data(), d_a, a.size()*sizeof(T), hipMemcpyDeviceToHost); check_error(); hipMemcpy(b.data(), d_b, b.size()*sizeof(T), hipMemcpyDeviceToHost); check_error(); hipMemcpy(c.data(), d_c, c.size()*sizeof(T), hipMemcpyDeviceToHost); check_error(); +#endif } template diff --git a/src/hip/model.cmake b/src/hip/model.cmake index 78150c4..a63efec 100644 --- a/src/hip/model.cmake +++ b/src/hip/model.cmake @@ -2,6 +2,13 @@ register_flag_required(CMAKE_CXX_COMPILER "Absolute path to the AMD HIP C++ compiler") +register_flag_optional(MEM "Device memory mode: + DEFAULT - allocate host and device memory pointers. + MANAGED - use HIP Managed Memory. + PAGEFAULT - shared memory, only host pointers allocated." + "DEFAULT") + macro(setup) # nothing to do here as hipcc does everything correctly, what a surprise! + register_definitions(${MEM}) endmacro() \ No newline at end of file