Add HIP managed memory support, resolves #162

This commit is contained in:
Tom Lin 2023-09-25 01:41:06 +01:00
parent bd6bb09b5d
commit 369785c96a
4 changed files with 49 additions and 4 deletions

View File

@ -5,6 +5,7 @@ All notable changes to this project will be documented in this file.
### Added ### Added
- Ability to build Kokkos and RAJA versions against existing packages. - Ability to build Kokkos and RAJA versions against existing packages.
- Thrust managed memory. - Thrust managed memory.
- HIP managed memory.
- New implementation using SYCL2020 USM (sycl2020-acc) and renamed original `sycl2020` to `sycl2020-acc`. - New implementation using SYCL2020 USM (sycl2020-acc) and renamed original `sycl2020` to `sycl2020-acc`.
### Changed ### Changed

View File

@ -289,6 +289,8 @@ build_hip() {
local name="hip_build" local name="hip_build"
run_build $name "${HIP_CXX:?}" hip "-DCMAKE_CXX_COMPILER=${HIP_CXX:?}" run_build $name "${HIP_CXX:?}" hip "-DCMAKE_CXX_COMPILER=${HIP_CXX:?}"
run_build $name "${HIP_CXX:?}" hip "-DCMAKE_CXX_COMPILER=${HIP_CXX:?} -DMEM=MANAGED"
run_build $name "${HIP_CXX:?}" hip "-DCMAKE_CXX_COMPILER=${HIP_CXX:?} -DMEM=PAGEFAULT"
run_build $name "${GCC_CXX:?}" thrust "-DCMAKE_CXX_COMPILER=${HIP_CXX:?} -DSDK_DIR=$ROCM_PATH -DTHRUST_IMPL=ROCM" run_build $name "${GCC_CXX:?}" thrust "-DCMAKE_CXX_COMPILER=${HIP_CXX:?} -DSDK_DIR=$ROCM_PATH -DTHRUST_IMPL=ROCM"
} }

View File

@ -45,11 +45,22 @@ HIPStream<T>::HIPStream(const int ARRAY_SIZE, const int device_index)
// Print out device information // Print out device information
std::cout << "Using HIP device " << getDeviceName(device_index) << std::endl; std::cout << "Using HIP device " << getDeviceName(device_index) << std::endl;
std::cout << "Driver: " << getDeviceDriver(device_index) << std::endl; std::cout << "Driver: " << getDeviceDriver(device_index) << std::endl;
#if defined(MANAGED)
std::cout << "Memory: MANAGED" << std::endl;
#elif defined(PAGEFAULT)
std::cout << "Memory: PAGEFAULT" << std::endl;
#else
std::cout << "Memory: DEFAULT" << std::endl;
#endif
array_size = ARRAY_SIZE; array_size = ARRAY_SIZE;
// Round dot_num_blocks up to next multiple of (TBSIZE * dot_elements_per_lane) // Round dot_num_blocks up to next multiple of (TBSIZE * dot_elements_per_lane)
dot_num_blocks = (array_size + (TBSIZE * dot_elements_per_lane - 1)) / (TBSIZE * dot_elements_per_lane); dot_num_blocks = (array_size + (TBSIZE * dot_elements_per_lane - 1)) / (TBSIZE * dot_elements_per_lane);
size_t array_bytes = sizeof(T);
array_bytes *= ARRAY_SIZE;
size_t total_bytes = array_bytes * 3;
// Allocate the host array for partial sums for dot kernels using hipHostMalloc. // Allocate the host array for partial sums for dot kernels using hipHostMalloc.
// This creates an array on the host which is visible to the device. However, it requires // This creates an array on the host which is visible to the device. However, it requires
// synchronization (e.g. hipDeviceSynchronize) for the result to be available on the host // synchronization (e.g. hipDeviceSynchronize) for the result to be available on the host
@ -63,13 +74,26 @@ HIPStream<T>::HIPStream(const int ARRAY_SIZE, const int device_index)
if (props.totalGlobalMem < std::size_t{3}*ARRAY_SIZE*sizeof(T)) if (props.totalGlobalMem < std::size_t{3}*ARRAY_SIZE*sizeof(T))
throw std::runtime_error("Device does not have enough memory for all 3 buffers"); throw std::runtime_error("Device does not have enough memory for all 3 buffers");
// Create device buffers // Create device buffers
hipMalloc(&d_a, ARRAY_SIZE*sizeof(T)); #if defined(MANAGED)
hipMallocManaged(&d_a, array_bytes);
check_error(); check_error();
hipMalloc(&d_b, ARRAY_SIZE*sizeof(T)); hipMallocManaged(&d_b, array_bytes);
check_error(); check_error();
hipMalloc(&d_c, ARRAY_SIZE*sizeof(T)); hipMallocManaged(&d_c, array_bytes);
check_error(); check_error();
#elif defined(PAGEFAULT)
d_a = (T*)malloc(array_bytes);
d_b = (T*)malloc(array_bytes);
d_c = (T*)malloc(array_bytes);
#else
hipMalloc(&d_a, array_bytes);
check_error();
hipMalloc(&d_b, array_bytes);
check_error();
hipMalloc(&d_c, array_bytes);
check_error();
#endif
} }
@ -109,13 +133,24 @@ void HIPStream<T>::init_arrays(T initA, T initB, T initC)
template <class T> template <class T>
void HIPStream<T>::read_arrays(std::vector<T>& a, std::vector<T>& b, std::vector<T>& c) void HIPStream<T>::read_arrays(std::vector<T>& a, std::vector<T>& b, std::vector<T>& c)
{ {
// Copy device memory to host // Copy device memory to host
#if defined(PAGEFAULT) || defined(MANAGED)
hipDeviceSynchronize();
for (int i = 0; i < array_size; i++)
{
a[i] = d_a[i];
b[i] = d_b[i];
c[i] = d_c[i];
}
#else
hipMemcpy(a.data(), d_a, a.size()*sizeof(T), hipMemcpyDeviceToHost); hipMemcpy(a.data(), d_a, a.size()*sizeof(T), hipMemcpyDeviceToHost);
check_error(); check_error();
hipMemcpy(b.data(), d_b, b.size()*sizeof(T), hipMemcpyDeviceToHost); hipMemcpy(b.data(), d_b, b.size()*sizeof(T), hipMemcpyDeviceToHost);
check_error(); check_error();
hipMemcpy(c.data(), d_c, c.size()*sizeof(T), hipMemcpyDeviceToHost); hipMemcpy(c.data(), d_c, c.size()*sizeof(T), hipMemcpyDeviceToHost);
check_error(); check_error();
#endif
} }
template <typename T> template <typename T>

View File

@ -2,6 +2,13 @@
register_flag_required(CMAKE_CXX_COMPILER register_flag_required(CMAKE_CXX_COMPILER
"Absolute path to the AMD HIP C++ compiler") "Absolute path to the AMD HIP C++ compiler")
register_flag_optional(MEM "Device memory mode:
DEFAULT - allocate host and device memory pointers.
MANAGED - use HIP Managed Memory.
PAGEFAULT - shared memory, only host pointers allocated."
"DEFAULT")
macro(setup) macro(setup)
# nothing to do here as hipcc does everything correctly, what a surprise! # nothing to do here as hipcc does everything correctly, what a surprise!
register_definitions(${MEM})
endmacro() endmacro()