Add clarifying comment and further clean-up

This commit is contained in:
Thomas Gibson 2022-08-11 10:32:20 -05:00
parent f98aedf64d
commit de93c06e78
3 changed files with 4 additions and 6 deletions

View File

@ -27,8 +27,6 @@ HIPStream<T>::HIPStream(const int ARRAY_SIZE, const int device_index)
block_count(array_size / (TBSIZE * elements_per_lane)) block_count(array_size / (TBSIZE * elements_per_lane))
{ {
std::cerr << "Elements per lane: " << elements_per_lane << std::endl;
std::cerr << "Chunks per block: " << chunks_per_block << std::endl;
// The array size must be divisible by total number of elements // The array size must be divisible by total number of elements
// moved per block for kernel launches // moved per block for kernel launches
if (ARRAY_SIZE % (TBSIZE * elements_per_lane) != 0) if (ARRAY_SIZE % (TBSIZE * elements_per_lane) != 0)
@ -39,7 +37,6 @@ HIPStream<T>::HIPStream(const int ARRAY_SIZE, const int device_index)
<< ")."; << ").";
throw std::runtime_error(ss.str()); throw std::runtime_error(ss.str());
} }
std::cerr << "block count " << block_count << std::endl;
// Set device // Set device
int count; int count;
@ -56,7 +53,10 @@ HIPStream<T>::HIPStream(const int ARRAY_SIZE, const int device_index)
array_size = ARRAY_SIZE; array_size = ARRAY_SIZE;
// Allocate the host array for partial sums for dot kernels // Allocate the host array for partial sums for dot kernels using hipHostMalloc.
// This creates an array on the host which is visible to the device. However, once the array
// has been passed to a kernel, synchronization (e.g. hipDeviceSynchronize) is required
// before the result is available on the host.
hipHostMalloc(&sums, sizeof(T) * block_count, hipHostMallocNonCoherent); hipHostMalloc(&sums, sizeof(T) * block_count, hipHostMallocNonCoherent);
check_error(); check_error();

View File

@ -55,7 +55,6 @@ class HIPStream : public Stream<T>
T *d_a; T *d_a;
T *d_b; T *d_b;
T *d_c; T *d_c;
T *d_sum;
public: public:

View File

@ -5,6 +5,5 @@ register_flag_required(CMAKE_CXX_COMPILER
register_flag_optional(DWORDS_PER_LANE "Flag indicating the number of dwords to process per wavefront lane." 4) register_flag_optional(DWORDS_PER_LANE "Flag indicating the number of dwords to process per wavefront lane." 4)
macro(setup) macro(setup)
# Ensure we set the proper preprocessor directives
register_definitions(DWORDS_PER_LANE=${DWORDS_PER_LANE}) register_definitions(DWORDS_PER_LANE=${DWORDS_PER_LANE})
endmacro() endmacro()