Merge branch 'master' into cuda-memory
This commit is contained in:
commit
3bd65a0716
19
.gitignore
vendored
19
.gitignore
vendored
@ -1,18 +1,19 @@
|
||||
|
||||
gpu-stream-cuda
|
||||
gpu-stream-ocl
|
||||
gpu-stream-acc
|
||||
gpu-stream-omp3
|
||||
gpu-stream-omp45
|
||||
gpu-stream-sycl
|
||||
cuda-stream
|
||||
ocl-stream
|
||||
omp-stream
|
||||
acc-stream
|
||||
raja-stream
|
||||
kokkos-stream
|
||||
sycl-stream
|
||||
hip-stream
|
||||
|
||||
*.o
|
||||
*.bc
|
||||
*.sycl
|
||||
*.tar
|
||||
*.gz
|
||||
|
||||
.DS_Store
|
||||
|
||||
CMakeCache.txt
|
||||
CMakeFiles/
|
||||
cmake_install.cmake
|
||||
Makefile
|
||||
|
||||
198
CMakeLists.txt
198
CMakeLists.txt
@ -1,198 +0,0 @@
|
||||
|
||||
cmake_minimum_required(VERSION 3.2)
|
||||
|
||||
if(NOT DEFINED HIP_PATH)
|
||||
if(NOT DEFINED ENV{HIP_PATH})
|
||||
set(HIP_PATH "/opt/rocm/hip" CACHE PATH "Path to which HIP has been installed")
|
||||
else()
|
||||
set(HIP_PATH $ENV{HIP_PATH} CACHE PATH "Path to which HIP has been installed")
|
||||
endif()
|
||||
endif()
|
||||
set(CMAKE_MODULE_PATH "${HIP_PATH}/cmake" ${CMAKE_MODULE_PATH})
|
||||
|
||||
|
||||
set(CMAKE_CXX_STANDARD 11)
|
||||
set(CMAKE_CXX_STANDARD_REQUIRED ON)
|
||||
|
||||
project(gpu-stream)
|
||||
|
||||
include(CheckIncludeFileCXX)
|
||||
include(CheckCXXCompilerFlag)
|
||||
|
||||
set(gpu-stream_VERSION_MAJOR 2)
|
||||
set(gpu-stream_VERSION_MINOR 2)
|
||||
|
||||
configure_file(common.h.in common.h)
|
||||
include_directories(${CMAKE_BINARY_DIR})
|
||||
|
||||
# Use 'Release' if no build type specified
|
||||
if (NOT CMAKE_BUILD_TYPE)
|
||||
message("No CMAKE_BUILD_TYPE specified, defaulting to 'Release'")
|
||||
set(CMAKE_BUILD_TYPE "Release")
|
||||
endif()
|
||||
|
||||
# If using the Cray compiler, manually add the C++11 flag because setting the
|
||||
# standard through CMake as above doesn't set this flag with Cray
|
||||
if ("${CMAKE_CXX_COMPILER_ID}" MATCHES "Cray")
|
||||
list(APPEND CMAKE_CXX_FLAGS -hstd=c++11)
|
||||
endif ()
|
||||
|
||||
#-------------------------------------------------------------------------------
|
||||
# HIP
|
||||
#-------------------------------------------------------------------------------
|
||||
find_package(HIP QUIET)
|
||||
if(${HIP_FOUND})
|
||||
list(APPEND HIP_HIPCC_FLAGS --std=c++11)
|
||||
hip_add_executable(gpu-stream-hip main.cpp HIPStream.cu)
|
||||
target_compile_definitions(gpu-stream-hip PUBLIC HIP)
|
||||
else()
|
||||
message("Skipping HIP...")
|
||||
endif()
|
||||
|
||||
#-------------------------------------------------------------------------------
|
||||
# CUDA
|
||||
#-------------------------------------------------------------------------------
|
||||
find_package(CUDA 7.0 QUIET)
|
||||
set(FLAG True)
|
||||
if ("${CMAKE_SYSTEM_NAME}" MATCHES "Darwin")
|
||||
execute_process(COMMAND xcodebuild -version COMMAND head -n 1 OUTPUT_VARIABLE XCODE_VERSION)
|
||||
if ("${XCODE_VERSION}" MATCHES "Xcode 7.3.1")
|
||||
message("Xcode version not supported by CUDA")
|
||||
set(FLAG False)
|
||||
endif ()
|
||||
endif ()
|
||||
if (${FLAG} AND ${CUDA_FOUND})
|
||||
list(APPEND CUDA_NVCC_FLAGS --std=c++11)
|
||||
cuda_add_executable(gpu-stream-cuda main.cpp CUDAStream.cu)
|
||||
target_compile_definitions(gpu-stream-cuda PUBLIC CUDA)
|
||||
else ()
|
||||
message("Skipping CUDA...")
|
||||
endif ()
|
||||
|
||||
#-------------------------------------------------------------------------------
|
||||
# OpenCL
|
||||
#-------------------------------------------------------------------------------
|
||||
find_package(OpenCL QUIET)
|
||||
if (${OpenCL_FOUND})
|
||||
add_executable(gpu-stream-ocl main.cpp OCLStream.cpp)
|
||||
target_compile_definitions(gpu-stream-ocl PUBLIC OCL)
|
||||
target_link_libraries(gpu-stream-ocl ${OpenCL_LIBRARY})
|
||||
else ()
|
||||
message("Skipping OpenCL...")
|
||||
endif ()
|
||||
|
||||
#-------------------------------------------------------------------------------
|
||||
# OpenACC
|
||||
#-------------------------------------------------------------------------------
|
||||
# Check compiler supports an OpenACC flag
|
||||
include(CheckCXXCompilerFlag)
|
||||
if ("${CMAKE_CXX_COMPILER_ID}" MATCHES "GNU")
|
||||
CHECK_CXX_COMPILER_FLAG(-fopenacc OPENACC)
|
||||
if (OPENACC)
|
||||
list (APPEND CMAKE_EXE_LINKER_FLAGS -fopenacc)
|
||||
endif ()
|
||||
elseif ("${CMAKE_CXX_COMPILER_ID}" MATCHES "PGI")
|
||||
CHECK_CXX_COMPILER_FLAG(-acc OPENACC)
|
||||
elseif ("${CMAKE_CXX_COMPILER_ID}" MATCHES "Cray")
|
||||
CHECK_CXX_COMPILER_FLAG(-hacc=openacc OPENACC)
|
||||
endif ()
|
||||
|
||||
if (OPENACC)
|
||||
add_executable(gpu-stream-acc main.cpp ACCStream.cpp)
|
||||
target_compile_definitions(gpu-stream-acc PUBLIC ACC)
|
||||
else ()
|
||||
message("Skipping OpenACC...")
|
||||
endif ()
|
||||
|
||||
#-------------------------------------------------------------------------------
|
||||
# OpenMP 3.0
|
||||
#-------------------------------------------------------------------------------
|
||||
find_package(OpenMP QUIET)
|
||||
if (${OpenMP_FOUND})
|
||||
add_executable(gpu-stream-omp3 main.cpp OMP3Stream.cpp)
|
||||
target_compile_definitions(gpu-stream-omp3 PUBLIC OMP3)
|
||||
else ()
|
||||
message("Skipping OpenMP 3...")
|
||||
endif ()
|
||||
|
||||
#-------------------------------------------------------------------------------
|
||||
# OpenMP 4.5
|
||||
#-------------------------------------------------------------------------------
|
||||
if ("${CMAKE_CXX_COMPILER_ID}" MATCHES "Cray")
|
||||
if (CMAKE_CXX_COMPILER_VERSION VERSION_GREATER 4.5)
|
||||
add_executable(gpu-stream-omp45 main.cpp OMP45Stream.cpp)
|
||||
target_compile_definitions(gpu-stream-omp45 PUBLIC OMP45)
|
||||
endif ()
|
||||
endif ()
|
||||
|
||||
#-------------------------------------------------------------------------------
|
||||
# RAJA
|
||||
#-------------------------------------------------------------------------------
|
||||
|
||||
if (RAJA_PATH)
|
||||
find_package(OpenMP)
|
||||
find_package(CUDA 7.5)
|
||||
list(APPEND CUDA_NVCC_FLAGS "-arch compute_35")
|
||||
list(APPEND CUDA_NVCC_FLAGS --expt-extended-lambda)
|
||||
list(APPEND CUDA_NVCC_FLAGS -Xcompiler ${OpenMP_CXX_FLAGS})
|
||||
list(APPEND CUDA_NVCC_FLAGS -DUSE_RAJA)
|
||||
cuda_include_directories(${RAJA_PATH}/include)
|
||||
set_source_files_properties(RAJAStream.cpp PROPERTIES CUDA_SOURCE_PROPERTY_FORMAT OBJ)
|
||||
cuda_add_executable(gpu-stream-raja main.cpp RAJAStream.cpp)
|
||||
target_compile_definitions(gpu-stream-raja PUBLIC USE_RAJA)
|
||||
target_link_libraries(gpu-stream-raja "-L${RAJA_PATH}/lib -lRAJA")
|
||||
else()
|
||||
message("Skipping RAJA... (use -DRAJA_PATH=/path/to/raja to opt in)")
|
||||
endif()
|
||||
|
||||
#-------------------------------------------------------------------------------
|
||||
# Kokkos
|
||||
#-------------------------------------------------------------------------------
|
||||
if (KOKKOS_PATH)
|
||||
if ("${CMAKE_SYSTEM_NAME}" MATCHES "Linux")
|
||||
add_custom_target(gpu-stream-kokkos COMMAND make -f KokkosMakefile KOKKOS_PATH=${KOKKOS_PATH})
|
||||
else()
|
||||
message("Skipping Kokkos (requires Linux)")
|
||||
endif()
|
||||
else()
|
||||
message("Skipping Kokkos... (use -DKOKKOS_PATH=/path/to/kokkos to opt in)")
|
||||
endif()
|
||||
|
||||
#-------------------------------------------------------------------------------
|
||||
# SYCL
|
||||
#-------------------------------------------------------------------------------
|
||||
if ("${CMAKE_CXX_COMPILER_ID}" MATCHES "Clang" OR
|
||||
"${CMAKE_CXX_COMPILER_ID}" MATCHES "GNU")
|
||||
# Use C++14 if available, otherwise drop back to C++11
|
||||
check_cxx_compiler_flag("-std=c++14" CXX14)
|
||||
if (CXX14)
|
||||
set(CMAKE_REQUIRED_FLAGS "-std=c++14")
|
||||
else()
|
||||
set(CMAKE_REQUIRED_FLAGS "-std=c++11")
|
||||
endif()
|
||||
endif()
|
||||
|
||||
check_include_file_cxx("CL/sycl.hpp" HAS_SYCL)
|
||||
if (HAS_SYCL)
|
||||
add_executable(gpu-stream-sycl main.cpp SYCLStream.cpp)
|
||||
target_compile_definitions(gpu-stream-sycl PUBLIC SYCL)
|
||||
|
||||
# The user must define this in order to use FindComputeCpp
|
||||
if (COMPUTECPP_PACKAGE_ROOT_DIR)
|
||||
message(STATUS "Using ComputeCpp for SYCL compilation")
|
||||
include(FindComputeCpp)
|
||||
|
||||
include_directories(${COMPUTECPP_INCLUDE_DIRECTORY})
|
||||
|
||||
set(SOURCE_NAME "SYCLStream")
|
||||
|
||||
target_compile_options(gpu-stream-sycl PUBLIC ${HOST_COMPILER_OPTIONS})
|
||||
add_sycl_to_target(gpu-stream-sycl ${CMAKE_CURRENT_SOURCE_DIR}/${SOURCE_NAME}.cpp
|
||||
${CMAKE_CURRENT_BINARY_DIR})
|
||||
else()
|
||||
message(STATUS "Using header-only SYCL implementation")
|
||||
set_property(TARGET gpu-stream-sycl PROPERTY CXX_STANDARD 14)
|
||||
endif()
|
||||
else ()
|
||||
message("Skipping SYCL...")
|
||||
endif (HAS_SYCL)
|
||||
10
CUDA.make
Normal file
10
CUDA.make
Normal file
@ -0,0 +1,10 @@
|
||||
CXXFLAGS=-O3
|
||||
CUDA_CXX=nvcc
|
||||
|
||||
cuda-stream: main.cpp CUDAStream.cu
|
||||
$(CUDA_CXX) -std=c++11 $(CXXFLAGS) -DCUDA $^ $(EXTRA_FLAGS) -o $@
|
||||
|
||||
.PHONY: clean
|
||||
clean:
|
||||
rm -f cuda-stream
|
||||
|
||||
@ -215,9 +215,7 @@ void CUDAStream<T>::triad()
|
||||
template <class T>
|
||||
__global__ void dot_kernel(const T * a, const T * b, T * sum, unsigned int array_size)
|
||||
{
|
||||
|
||||
extern __shared__ __align__(sizeof(T)) unsigned char smem[];
|
||||
T *tb_sum = reinterpret_cast<T*>(smem);
|
||||
__shared__ T tb_sum[TBSIZE];
|
||||
|
||||
int i = blockDim.x * blockIdx.x + threadIdx.x;
|
||||
const size_t local_i = threadIdx.x;
|
||||
@ -242,7 +240,7 @@ __global__ void dot_kernel(const T * a, const T * b, T * sum, unsigned int array
|
||||
template <class T>
|
||||
T CUDAStream<T>::dot()
|
||||
{
|
||||
dot_kernel<<<DOT_NUM_BLOCKS, TBSIZE, sizeof(T)*TBSIZE>>>(d_a, d_b, d_sum, array_size);
|
||||
dot_kernel<<<DOT_NUM_BLOCKS, TBSIZE>>>(d_a, d_b, d_sum, array_size);
|
||||
check_error();
|
||||
|
||||
#if defined(MANAGED) || defined(PAGEFAULT)
|
||||
|
||||
12
HIP.make
Normal file
12
HIP.make
Normal file
@ -0,0 +1,12 @@
|
||||
|
||||
# TODO: HIP with HCC
|
||||
|
||||
HIPCC = hipcc
|
||||
|
||||
hip-stream: main.cpp HIPStream.cpp
|
||||
$(HIPCC) $(CXXFLAGS) -std=c++11 -DHIP $^ $(EXTRA_FLAGS) -o $@
|
||||
|
||||
.PHONY: clean
|
||||
clean:
|
||||
rm -f hip-stream
|
||||
|
||||
@ -9,6 +9,7 @@
|
||||
#include "hip/hip_runtime.h"
|
||||
|
||||
#define TBSIZE 1024
|
||||
#define DOT_NUM_BLOCKS 256
|
||||
|
||||
void check_error(void)
|
||||
{
|
||||
@ -47,6 +48,9 @@ HIPStream<T>::HIPStream(const unsigned int ARRAY_SIZE, const int device_index)
|
||||
|
||||
array_size = ARRAY_SIZE;
|
||||
|
||||
// Allocate the host array for partial sums for dot kernels
|
||||
sums = (T*)malloc(sizeof(T) * DOT_NUM_BLOCKS);
|
||||
|
||||
// Check buffers fit on the device
|
||||
hipDeviceProp_t props;
|
||||
hipGetDeviceProperties(&props, 0);
|
||||
@ -60,20 +64,27 @@ HIPStream<T>::HIPStream(const unsigned int ARRAY_SIZE, const int device_index)
|
||||
check_error();
|
||||
hipMalloc(&d_c, ARRAY_SIZE*sizeof(T));
|
||||
check_error();
|
||||
hipMalloc(&d_sum, DOT_NUM_BLOCKS*sizeof(T));
|
||||
check_error();
|
||||
}
|
||||
|
||||
|
||||
template <class T>
|
||||
HIPStream<T>::~HIPStream()
|
||||
{
|
||||
free(sums);
|
||||
|
||||
hipFree(d_a);
|
||||
check_error();
|
||||
hipFree(d_b);
|
||||
check_error();
|
||||
hipFree(d_c);
|
||||
check_error();
|
||||
hipFree(d_sum);
|
||||
check_error();
|
||||
}
|
||||
|
||||
|
||||
template <typename T>
|
||||
__global__ void init_kernel(hipLaunchParm lp, T * a, T * b, T * c, T initA, T initB, T initC)
|
||||
{
|
||||
@ -171,6 +182,46 @@ void HIPStream<T>::triad()
|
||||
check_error();
|
||||
}
|
||||
|
||||
template <class T>
|
||||
__global__ void dot_kernel(hipLaunchParm lp, const T * a, const T * b, T * sum, unsigned int array_size)
|
||||
{
|
||||
__shared__ T tb_sum[TBSIZE];
|
||||
|
||||
int i = hipBlockDim_x * hipBlockIdx_x + hipThreadIdx_x;
|
||||
const size_t local_i = hipThreadIdx_x;
|
||||
|
||||
tb_sum[local_i] = 0.0;
|
||||
for (; i < array_size; i += hipBlockDim_x*hipGridDim_x)
|
||||
tb_sum[local_i] += a[i] * b[i];
|
||||
|
||||
for (int offset = hipBlockDim_x / 2; offset > 0; offset /= 2)
|
||||
{
|
||||
__syncthreads();
|
||||
if (local_i < offset)
|
||||
{
|
||||
tb_sum[local_i] += tb_sum[local_i+offset];
|
||||
}
|
||||
}
|
||||
|
||||
if (local_i == 0)
|
||||
sum[hipBlockIdx_x] = tb_sum[local_i];
|
||||
}
|
||||
|
||||
template <class T>
|
||||
T HIPStream<T>::dot()
|
||||
{
|
||||
hipLaunchKernel(HIP_KERNEL_NAME(dot_kernel), dim3(DOT_NUM_BLOCKS), dim3(TBSIZE), 0, 0, d_a, d_b, d_sum, array_size);
|
||||
check_error();
|
||||
|
||||
hipMemcpy(sums, d_sum, DOT_NUM_BLOCKS*sizeof(T), hipMemcpyDeviceToHost);
|
||||
check_error();
|
||||
|
||||
T sum = 0.0;
|
||||
for (int i = 0; i < DOT_NUM_BLOCKS; i++)
|
||||
sum += sums[i];
|
||||
|
||||
return sum;
|
||||
}
|
||||
|
||||
void listDevices(void)
|
||||
{
|
||||
@ -21,10 +21,15 @@ class HIPStream : public Stream<T>
|
||||
protected:
|
||||
// Size of arrays
|
||||
unsigned int array_size;
|
||||
|
||||
// Host array for partial sums for dot kernel
|
||||
T *sums;
|
||||
|
||||
// Device side pointers to arrays
|
||||
T *d_a;
|
||||
T *d_b;
|
||||
T *d_c;
|
||||
T *d_sum;
|
||||
|
||||
|
||||
public:
|
||||
@ -36,6 +41,7 @@ class HIPStream : public Stream<T>
|
||||
virtual void add() override;
|
||||
virtual void mul() override;
|
||||
virtual void triad() override;
|
||||
virtual T dot() override;
|
||||
|
||||
virtual void init_arrays(T initA, T initB, T initC) override;
|
||||
virtual void read_arrays(std::vector<T>& a, std::vector<T>& b, std::vector<T>& c) override;
|
||||
|
||||
@ -39,10 +39,10 @@ void KOKKOSStream<T>::init_arrays(T initA, T initB, T initC)
|
||||
View<double*, DEVICE> a(*d_a);
|
||||
View<double*, DEVICE> b(*d_b);
|
||||
View<double*, DEVICE> c(*d_c);
|
||||
parallel_for(array_size, KOKKOS_LAMBDA (const int index)
|
||||
parallel_for(array_size, KOKKOS_LAMBDA (const long index)
|
||||
{
|
||||
a[index] = initA;
|
||||
b[index] - initB;
|
||||
b[index] = initB;
|
||||
c[index] = initC;
|
||||
});
|
||||
Kokkos::fence();
|
||||
@ -70,7 +70,7 @@ void KOKKOSStream<T>::copy()
|
||||
View<double*, DEVICE> b(*d_b);
|
||||
View<double*, DEVICE> c(*d_c);
|
||||
|
||||
parallel_for(array_size, KOKKOS_LAMBDA (const int index)
|
||||
parallel_for(array_size, KOKKOS_LAMBDA (const long index)
|
||||
{
|
||||
c[index] = a[index];
|
||||
});
|
||||
@ -85,7 +85,7 @@ void KOKKOSStream<T>::mul()
|
||||
View<double*, DEVICE> c(*d_c);
|
||||
|
||||
const T scalar = startScalar;
|
||||
parallel_for(array_size, KOKKOS_LAMBDA (const int index)
|
||||
parallel_for(array_size, KOKKOS_LAMBDA (const long index)
|
||||
{
|
||||
b[index] = scalar*c[index];
|
||||
});
|
||||
@ -99,7 +99,7 @@ void KOKKOSStream<T>::add()
|
||||
View<double*, DEVICE> b(*d_b);
|
||||
View<double*, DEVICE> c(*d_c);
|
||||
|
||||
parallel_for(array_size, KOKKOS_LAMBDA (const int index)
|
||||
parallel_for(array_size, KOKKOS_LAMBDA (const long index)
|
||||
{
|
||||
c[index] = a[index] + b[index];
|
||||
});
|
||||
@ -114,7 +114,7 @@ void KOKKOSStream<T>::triad()
|
||||
View<double*, DEVICE> c(*d_c);
|
||||
|
||||
const T scalar = startScalar;
|
||||
parallel_for(array_size, KOKKOS_LAMBDA (const int index)
|
||||
parallel_for(array_size, KOKKOS_LAMBDA (const long index)
|
||||
{
|
||||
a[index] = b[index] + scalar*c[index];
|
||||
});
|
||||
@ -129,7 +129,7 @@ T KOKKOSStream<T>::dot()
|
||||
|
||||
T sum = 0.0;
|
||||
|
||||
parallel_reduce(array_size, KOKKOS_LAMBDA (const int index, double &tmp)
|
||||
parallel_reduce(array_size, KOKKOS_LAMBDA (const long index, double &tmp)
|
||||
{
|
||||
tmp += a[index] * b[index];
|
||||
}, sum);
|
||||
|
||||
45
Kokkos.make
Normal file
45
Kokkos.make
Normal file
@ -0,0 +1,45 @@
|
||||
|
||||
default: kokkos-stream
|
||||
|
||||
include $(KOKKOS_PATH)/Makefile.kokkos
|
||||
|
||||
ifndef COMPILER
|
||||
define compiler_help
|
||||
Set COMPILER to change flags (defaulting to GNU).
|
||||
Available compilers are:
|
||||
GNU INTEL
|
||||
|
||||
endef
|
||||
$(info $(compiler_help))
|
||||
COMPILER=GNU
|
||||
endif
|
||||
|
||||
COMPILER_GNU = g++
|
||||
COMPILER_INTEL = icpc -qopt-streaming-stores=always
|
||||
CXX = $(COMPILER_$(COMPILER))
|
||||
|
||||
ifndef TARGET
|
||||
define target_help
|
||||
Set TARGET to change to offload device. Defaulting to CPU.
|
||||
Available targets are:
|
||||
CPU (default)
|
||||
GPU
|
||||
endef
|
||||
$(info $(target_help))
|
||||
TARGET=CPU
|
||||
endif
|
||||
|
||||
ifeq ($(TARGET), CPU)
|
||||
TARGET_DEF = -DKOKKOS_TARGET_CPU
|
||||
else ifeq ($(TARGET), GPU)
|
||||
CXX = $(NVCC_WRAPPER)
|
||||
TARGET_DEF =
|
||||
endif
|
||||
|
||||
kokkos-stream: main.cpp KOKKOSStream.cpp $(KOKKOS_CPP_DEPENDS)
|
||||
$(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(KOKKOS_LDFLAGS) main.cpp KOKKOSStream.cpp $(KOKKOS_LIBS) -o $@ -DKOKKOS $(TARGET_DEF) -O3 $(EXTRA_FLAGS)
|
||||
|
||||
.PHONY: clean
|
||||
clean:
|
||||
rm -f kokkos-stream
|
||||
|
||||
@ -1,10 +0,0 @@
|
||||
|
||||
default: gpu-stream-kokkos
|
||||
|
||||
include $(KOKKOS_PATH)/Makefile.kokkos
|
||||
|
||||
gpu-stream-kokkos: main.o KOKKOSStream.o
|
||||
$(CXX) $(KOKKOS_LDFLAGS) $^ $(KOKKOS_LIBS) -o $@ -DKOKKOS -DKOKKOS_TARGET_CPU -O3
|
||||
|
||||
%.o:%.cpp $(KOKKOS_CPP_DEPENDS)
|
||||
$(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) -c $< -DKOKKOS -DKOKKOS_TARGET_CPU -O3
|
||||
@ -1,11 +0,0 @@
|
||||
|
||||
default: gpu-stream-kokkos
|
||||
|
||||
include $(KOKKOS_PATH)/Makefile.kokkos
|
||||
|
||||
gpu-stream-kokkos: main.o KOKKOSStream.o
|
||||
$(CXX) $(KOKKOS_LDFLAGS) $^ $(KOKKOS_LIBS) -o $@ -DKOKKOS -O3
|
||||
|
||||
%.o:%.cpp $(KOKKOS_CPP_DEPENDS)
|
||||
$(NVCC_WRAPPER) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) -c $< -DKOKKOS -O3
|
||||
|
||||
14
LICENSE
14
LICENSE
@ -12,22 +12,22 @@
|
||||
* 3. You are free to publish results obtained from running this
|
||||
* program, or from works that you derive from this program,
|
||||
* with the following limitations:
|
||||
* 3a. In order to be referred to as "GPU-STREAM benchmark results",
|
||||
* published results must be in conformance to the GPU-STREAM
|
||||
* 3a. In order to be referred to as "BabelStream benchmark results",
|
||||
* published results must be in conformance to the BabelStream
|
||||
* Run Rules published at
|
||||
* http://github.com/UoB-HPC/GPU-STREAM/wiki/Run-Rules
|
||||
* http://github.com/UoB-HPC/BabelStream/wiki/Run-Rules
|
||||
* and incorporated herein by reference.
|
||||
* The copyright holders retain the
|
||||
* right to determine conformity with the Run Rules.
|
||||
* 3b. Results based on modified source code or on runs not in
|
||||
* accordance with the GPU-STREAM Run Rules must be clearly
|
||||
* accordance with the BabelStream Run Rules must be clearly
|
||||
* labelled whenever they are published. Examples of
|
||||
* proper labelling include:
|
||||
* "tuned GPU-STREAM benchmark results"
|
||||
* "based on a variant of the GPU-STREAM benchmark code"
|
||||
* "tuned BabelStream benchmark results"
|
||||
* "based on a variant of the BabelStream benchmark code"
|
||||
* Other comparable, clear and reasonable labelling is
|
||||
* acceptable.
|
||||
* 3c. Submission of results to the GPU-STREAM benchmark web site
|
||||
* 3c. Submission of results to the BabelStream benchmark web site
|
||||
* is encouraged, but not required.
|
||||
* 4. Use of this program or creation of derived works based on this
|
||||
* program constitutes acceptance of these licensing restrictions.
|
||||
|
||||
@ -7,6 +7,10 @@
|
||||
|
||||
#include "OMPStream.h"
|
||||
|
||||
#ifndef ALIGNMENT
|
||||
#define ALIGNMENT (2*1024*1024) // 2MB
|
||||
#endif
|
||||
|
||||
template <class T>
|
||||
OMPStream<T>::OMPStream(const unsigned int ARRAY_SIZE, T *a, T *b, T *c, int device)
|
||||
{
|
||||
@ -22,9 +26,9 @@ OMPStream<T>::OMPStream(const unsigned int ARRAY_SIZE, T *a, T *b, T *c, int dev
|
||||
{}
|
||||
#else
|
||||
// Allocate on the host
|
||||
this->a = (T*)malloc(sizeof(T)*array_size);
|
||||
this->b = (T*)malloc(sizeof(T)*array_size);
|
||||
this->c = (T*)malloc(sizeof(T)*array_size);
|
||||
this->a = (T*)aligned_alloc(ALIGNMENT, sizeof(T)*array_size);
|
||||
this->b = (T*)aligned_alloc(ALIGNMENT, sizeof(T)*array_size);
|
||||
this->c = (T*)aligned_alloc(ALIGNMENT, sizeof(T)*array_size);
|
||||
#endif
|
||||
}
|
||||
|
||||
|
||||
52
OpenACC.make
Normal file
52
OpenACC.make
Normal file
@ -0,0 +1,52 @@
|
||||
|
||||
ifndef COMPILER
|
||||
define compiler_help
|
||||
Set COMPILER to ensure correct flags are set.
|
||||
Available compilers are:
|
||||
PGI CRAY
|
||||
endef
|
||||
$(info $(compiler_help))
|
||||
endif
|
||||
|
||||
COMPILER_ = $(CXX)
|
||||
COMPILER_PGI = pgc++
|
||||
COMPILER_CRAY = CC
|
||||
|
||||
FLAGS_ = -O3 -std=c++11
|
||||
|
||||
FLAGS_PGI = -std=c++11 -O3 -acc
|
||||
ifeq ($(COMPILER), PGI)
|
||||
define target_help
|
||||
Set a TARGET to ensure PGI targets the correct offload device.
|
||||
Available targets are:
|
||||
SNB, IVB, HSW
|
||||
KEPLER, MAXWELL, PASCAL
|
||||
HAWAII
|
||||
endef
|
||||
ifndef TARGET
|
||||
$(error $(target_help))
|
||||
endif
|
||||
TARGET_FLAGS_SNB = -ta=multicore -tp=sandybridge
|
||||
TARGET_FLAGS_IVB = -ta=multicore -tp=ivybridge
|
||||
TARGET_FLAGS_HSW = -ta=multicore -tp=haswell
|
||||
TARGET_FLAGS_KEPLER = -ta=nvidia:cc35
|
||||
TARGET_FLAGS_MAXWELL = -ta=nvidia:cc50
|
||||
TARGET_FLAGS_PASCAL = -ta=nvidia:cc60
|
||||
TARGET_FLAGS_HAWAII = -ta=radeon:hawaii
|
||||
ifeq ($(TARGET_FLAGS_$(TARGET)),)
|
||||
$(error $(target_help))
|
||||
endif
|
||||
|
||||
FLAGS_PGI += $(TARGET_FLAGS_$(TARGET))
|
||||
|
||||
endif
|
||||
|
||||
FLAGS_CRAY = -hstd=c++11
|
||||
CXXFLAGS = $(FLAGS_$(COMPILER))
|
||||
|
||||
acc-stream: main.cpp ACCStream.cpp
|
||||
$(COMPILER_$(COMPILER)) $(CXXFLAGS) -DACC $^ $(EXTRA_FLAGS) -o $@
|
||||
|
||||
.PHONY: clean
|
||||
clean:
|
||||
rm -f acc-stream main.o ACCStream.o
|
||||
39
OpenCL.make
Normal file
39
OpenCL.make
Normal file
@ -0,0 +1,39 @@
|
||||
|
||||
ifndef COMPILER
|
||||
define compiler_help
|
||||
Set COMPILER to change flags (defaulting to GNU).
|
||||
Available compilers are:
|
||||
GNU CLANG INTEL CRAY
|
||||
|
||||
endef
|
||||
$(info $(compiler_help))
|
||||
COMPILER=GNU
|
||||
endif
|
||||
|
||||
COMPILER_GNU = g++
|
||||
COMPILER_CLANG = clang++
|
||||
COMPILER_INTEL = icpc
|
||||
COMPILER_CRAY = CC
|
||||
CXX = $(COMPILER_$(COMPILER))
|
||||
|
||||
FLAGS_ = -O3 -std=c++11
|
||||
FLAGS_GNU = -O3 -std=c++11
|
||||
FLAGS_CLANG = -O3 -std=c++11
|
||||
FLAGS_INTEL = -O3 -std=c++11
|
||||
FLAGS_CRAY = -O3 -hstd=c++11
|
||||
CXXFLAGS=$(FLAGS_$(COMPILER))
|
||||
|
||||
PLATFORM = $(shell uname -s)
|
||||
ifeq ($(PLATFORM), Darwin)
|
||||
LIBS = -framework OpenCL
|
||||
else
|
||||
LIBS = -lOpenCL
|
||||
endif
|
||||
|
||||
ocl-stream: main.cpp OCLStream.cpp
|
||||
$(CXX) $(CXXFLAGS) -DOCL $^ $(EXTRA_FLAGS) $(LIBS) -o $@
|
||||
|
||||
.PHONY: clean
|
||||
clean:
|
||||
rm -f ocl-stream
|
||||
|
||||
60
OpenMP.make
Normal file
60
OpenMP.make
Normal file
@ -0,0 +1,60 @@
|
||||
|
||||
ifndef COMPILER
|
||||
define compiler_help
|
||||
Set COMPILER to change flags (defaulting to GNU).
|
||||
Available compilers are:
|
||||
CLANG CRAY GNU INTEL XL
|
||||
|
||||
endef
|
||||
$(info $(compiler_help))
|
||||
COMPILER=GNU
|
||||
endif
|
||||
|
||||
ifndef TARGET
|
||||
define target_help
|
||||
Set TARGET to change device (defaulting to CPU).
|
||||
Available targets are:
|
||||
CPU NVIDIA
|
||||
|
||||
endef
|
||||
$(info $(target_help))
|
||||
TARGET=CPU
|
||||
endif
|
||||
|
||||
COMPILER_GNU = g++
|
||||
COMPILER_INTEL = icpc
|
||||
COMPILER_CRAY = CC
|
||||
COMPILER_CLANG = clang++
|
||||
COMPILER_XL = xlc++
|
||||
CXX = $(COMPILER_$(COMPILER))
|
||||
|
||||
FLAGS_GNU = -O3 -std=c++11
|
||||
FLAGS_INTEL = -O3 -std=c++11 -xHOST -qopt-streaming-stores=always
|
||||
FLAGS_CRAY = -O3 -hstd=c++11
|
||||
FLAGS_CLANG = -O3 -std=c++11
|
||||
FLAGS_XL = -O5 -qarch=pwr8 -qtune=pwr8 -std=c++11
|
||||
CXXFLAGS = $(FLAGS_$(COMPILER))
|
||||
|
||||
# OpenMP flags for CPUs
|
||||
OMP_GNU_CPU = -fopenmp
|
||||
OMP_INTEL_CPU = -qopenmp
|
||||
OMP_CRAY_CPU = -homp
|
||||
OMP_CLANG_CPU = -fopenmp=libomp
|
||||
OMP_XL_CPU = -qsmp=omp -qthreaded
|
||||
|
||||
# OpenMP flags for NVIDIA
|
||||
OMP_CRAY_NVIDIA = -DOMP_TARGET_GPU
|
||||
OMP_CLANG_NVIDIA = -DOMP_TARGET_GPU -fopenmp=libomp -fopenmp-targets=nvptx64-nvidia-cuda
|
||||
|
||||
ifndef OMP_$(COMPILER)_$(TARGET)
|
||||
$(error Targeting $(TARGET) with $(COMPILER) not supported)
|
||||
endif
|
||||
|
||||
OMP = $(OMP_$(COMPILER)_$(TARGET))
|
||||
|
||||
omp-stream: main.cpp OMPStream.cpp
|
||||
$(CXX) $(CXXFLAGS) -DOMP $^ $(OMP) $(EXTRA_FLAGS) -o $@
|
||||
|
||||
.PHONY: clean
|
||||
clean:
|
||||
rm -f omp-stream
|
||||
58
RAJA.make
Normal file
58
RAJA.make
Normal file
@ -0,0 +1,58 @@
|
||||
|
||||
ifndef TARGET
|
||||
define target_help
|
||||
Set TARGET to change to offload device. Defaulting to CPU.
|
||||
Available targets are:
|
||||
CPU (default)
|
||||
GPU
|
||||
endef
|
||||
$(info $(target_help))
|
||||
TARGET=CPU
|
||||
endif
|
||||
|
||||
ifeq ($(TARGET), CPU)
|
||||
|
||||
ifndef COMPILER
|
||||
define compiler_help
|
||||
Set COMPILER to change flags (defaulting to GNU).
|
||||
Available compilers are:
|
||||
INTEL GNU CRAY XL
|
||||
endef
|
||||
$(info $(compiler_help))
|
||||
COMPILER=GNU
|
||||
endif
|
||||
|
||||
CXX_INTEL = icpc
|
||||
CXX_GNU = g++
|
||||
CXX_CRAY = CC
|
||||
CXX_XL = xlc++
|
||||
|
||||
CXXFLAGS_INTEL = -O3 -std=c++11 -qopenmp -xHost -qopt-streaming-stores=always
|
||||
CXXFLAGS_GNU = -O3 -std=c++11 -fopenmp
|
||||
CXXFLAGS_CRAY = -O3 -hstd=c++11
|
||||
CXXFLAGS_XL = -O5 -std=c++11 -qarch=pwr8 -qtune=pwr8 -qsmp=omp -qthreaded
|
||||
|
||||
CXX = $(CXX_$(COMPILER))
|
||||
CXXFLAGS = -DRAJA_TARGET_CPU $(CXXFLAGS_$(COMPILER))
|
||||
|
||||
else ifeq ($(TARGET), GPU)
|
||||
CXX = nvcc
|
||||
|
||||
ifndef ARCH
|
||||
define arch_help
|
||||
Set ARCH to ensure correct GPU architecture.
|
||||
Example:
|
||||
ARCH=sm_35
|
||||
endef
|
||||
$(error $(arch_help))
|
||||
endif
|
||||
CXXFLAGS = --expt-extended-lambda -O3 -std=c++11 -x cu -Xcompiler -fopenmp -arch $(ARCH)
|
||||
endif
|
||||
|
||||
raja-stream: main.cpp RAJAStream.cpp
|
||||
$(CXX) $(CXXFLAGS) -DUSE_RAJA -I$(RAJA_PATH)/include $^ $(EXTRA_FLAGS) -L$(RAJA_PATH)/lib -lRAJA -o $@
|
||||
|
||||
.PHONY: clean
|
||||
clean:
|
||||
rm -f raja-stream
|
||||
|
||||
@ -10,6 +10,10 @@
|
||||
using RAJA::forall;
|
||||
using RAJA::RangeSegment;
|
||||
|
||||
#ifndef ALIGNMENT
|
||||
#define ALIGNMENT (2*1024*1024) // 2MB
|
||||
#endif
|
||||
|
||||
template <class T>
|
||||
RAJAStream<T>::RAJAStream(const unsigned int ARRAY_SIZE, const int device_index)
|
||||
: array_size(ARRAY_SIZE)
|
||||
@ -18,9 +22,9 @@ RAJAStream<T>::RAJAStream(const unsigned int ARRAY_SIZE, const int device_index)
|
||||
index_set.push_back(seg);
|
||||
|
||||
#ifdef RAJA_TARGET_CPU
|
||||
d_a = new T[ARRAY_SIZE];
|
||||
d_b = new T[ARRAY_SIZE];
|
||||
d_c = new T[ARRAY_SIZE];
|
||||
d_a = (T*)aligned_alloc(ALIGNMENT, sizeof(T)*array_size);
|
||||
d_b = (T*)aligned_alloc(ALIGNMENT, sizeof(T)*array_size);
|
||||
d_c = (T*)aligned_alloc(ALIGNMENT, sizeof(T)*array_size);
|
||||
#else
|
||||
cudaMallocManaged((void**)&d_a, sizeof(T)*ARRAY_SIZE, cudaMemAttachGlobal);
|
||||
cudaMallocManaged((void**)&d_b, sizeof(T)*ARRAY_SIZE, cudaMemAttachGlobal);
|
||||
@ -33,9 +37,9 @@ template <class T>
|
||||
RAJAStream<T>::~RAJAStream()
|
||||
{
|
||||
#ifdef RAJA_TARGET_CPU
|
||||
delete[] d_a;
|
||||
delete[] d_b;
|
||||
delete[] d_c;
|
||||
free(d_a);
|
||||
free(d_b);
|
||||
free(d_c);
|
||||
#else
|
||||
cudaFree(d_a);
|
||||
cudaFree(d_b);
|
||||
@ -46,10 +50,10 @@ RAJAStream<T>::~RAJAStream()
|
||||
template <class T>
|
||||
void RAJAStream<T>::init_arrays(T initA, T initB, T initC)
|
||||
{
|
||||
T* a = d_a;
|
||||
T* b = d_b;
|
||||
T* c = d_c;
|
||||
forall<policy>(index_set, [=] RAJA_DEVICE (int index)
|
||||
T* RAJA_RESTRICT a = d_a;
|
||||
T* RAJA_RESTRICT b = d_b;
|
||||
T* RAJA_RESTRICT c = d_c;
|
||||
forall<policy>(index_set, [=] RAJA_DEVICE (RAJA::Index_type index)
|
||||
{
|
||||
a[index] = initA;
|
||||
b[index] = initB;
|
||||
@ -69,9 +73,9 @@ void RAJAStream<T>::read_arrays(
|
||||
template <class T>
|
||||
void RAJAStream<T>::copy()
|
||||
{
|
||||
T* a = d_a;
|
||||
T* c = d_c;
|
||||
forall<policy>(index_set, [=] RAJA_DEVICE (int index)
|
||||
T* RAJA_RESTRICT a = d_a;
|
||||
T* RAJA_RESTRICT c = d_c;
|
||||
forall<policy>(index_set, [=] RAJA_DEVICE (RAJA::Index_type index)
|
||||
{
|
||||
c[index] = a[index];
|
||||
});
|
||||
@ -80,10 +84,10 @@ void RAJAStream<T>::copy()
|
||||
template <class T>
|
||||
void RAJAStream<T>::mul()
|
||||
{
|
||||
T* b = d_b;
|
||||
T* c = d_c;
|
||||
T* RAJA_RESTRICT b = d_b;
|
||||
T* RAJA_RESTRICT c = d_c;
|
||||
const T scalar = startScalar;
|
||||
forall<policy>(index_set, [=] RAJA_DEVICE (int index)
|
||||
forall<policy>(index_set, [=] RAJA_DEVICE (RAJA::Index_type index)
|
||||
{
|
||||
b[index] = scalar*c[index];
|
||||
});
|
||||
@ -92,10 +96,10 @@ void RAJAStream<T>::mul()
|
||||
template <class T>
|
||||
void RAJAStream<T>::add()
|
||||
{
|
||||
T* a = d_a;
|
||||
T* b = d_b;
|
||||
T* c = d_c;
|
||||
forall<policy>(index_set, [=] RAJA_DEVICE (int index)
|
||||
T* RAJA_RESTRICT a = d_a;
|
||||
T* RAJA_RESTRICT b = d_b;
|
||||
T* RAJA_RESTRICT c = d_c;
|
||||
forall<policy>(index_set, [=] RAJA_DEVICE (RAJA::Index_type index)
|
||||
{
|
||||
c[index] = a[index] + b[index];
|
||||
});
|
||||
@ -104,11 +108,11 @@ void RAJAStream<T>::add()
|
||||
template <class T>
|
||||
void RAJAStream<T>::triad()
|
||||
{
|
||||
T* a = d_a;
|
||||
T* b = d_b;
|
||||
T* c = d_c;
|
||||
T* RAJA_RESTRICT a = d_a;
|
||||
T* RAJA_RESTRICT b = d_b;
|
||||
T* RAJA_RESTRICT c = d_c;
|
||||
const T scalar = startScalar;
|
||||
forall<policy>(index_set, [=] RAJA_DEVICE (int index)
|
||||
forall<policy>(index_set, [=] RAJA_DEVICE (RAJA::Index_type index)
|
||||
{
|
||||
a[index] = b[index] + scalar*c[index];
|
||||
});
|
||||
@ -117,12 +121,12 @@ void RAJAStream<T>::triad()
|
||||
template <class T>
|
||||
T RAJAStream<T>::dot()
|
||||
{
|
||||
T* a = d_a;
|
||||
T* b = d_b;
|
||||
T* RAJA_RESTRICT a = d_a;
|
||||
T* RAJA_RESTRICT b = d_b;
|
||||
|
||||
RAJA::ReduceSum<reduce_policy, T> sum(0.0);
|
||||
|
||||
forall<policy>(index_set, [=] RAJA_DEVICE (int index)
|
||||
forall<policy>(index_set, [=] RAJA_DEVICE (RAJA::Index_type index)
|
||||
{
|
||||
sum += a[index] * b[index];
|
||||
});
|
||||
|
||||
36
README.android
Normal file
36
README.android
Normal file
@ -0,0 +1,36 @@
|
||||
Android (outdated instructions)
|
||||
------------------
|
||||
|
||||
Assuming you have a recent Android NDK available, you can use the
|
||||
toolchain that it provides to build GPU-STREAM. You should first
|
||||
use the NDK to generate a standalone toolchain:
|
||||
|
||||
# Select a directory to install the toolchain to
|
||||
ANDROID_NATIVE_TOOLCHAIN=/path/to/toolchain
|
||||
|
||||
${NDK}/build/tools/make-standalone-toolchain.sh \
|
||||
--platform=android-14 \
|
||||
--toolchain=arm-linux-androideabi-4.8 \
|
||||
--install-dir=${ANDROID_NATIVE_TOOLCHAIN}
|
||||
|
||||
Make sure that the OpenCL headers and library (libOpenCL.so) are
|
||||
available in `${ANDROID_NATIVE_TOOLCHAIN}/sysroot/usr/`.
|
||||
|
||||
You should then be able to build GPU-STREAM:
|
||||
|
||||
make CXX=${ANDROID_NATIVE_TOOLCHAIN}/bin/arm-linux-androideabi-g++
|
||||
|
||||
Copy the executable and OpenCL kernels to the device:
|
||||
|
||||
adb push gpu-stream-ocl /data/local/tmp
|
||||
adb push ocl-stream-kernels.cl /data/local/tmp
|
||||
|
||||
Run GPU-STREAM from an adb shell:
|
||||
|
||||
adb shell
|
||||
cd /data/local/tmp
|
||||
|
||||
# Use float if device doesn't support double, and reduce array size
|
||||
./gpu-stream-ocl --float -n 6 -s 10000000
|
||||
|
||||
|
||||
74
README.md
74
README.md
@ -1,4 +1,4 @@
|
||||
GPU-STREAM
|
||||
BabelStream
|
||||
==========
|
||||
|
||||
Measure memory transfer rates to/from global device memory on GPUs.
|
||||
@ -16,53 +16,51 @@ Currently implemented are:
|
||||
- RAJA
|
||||
- SYCL
|
||||
|
||||
This code was previously called GPU-STREAM.
|
||||
|
||||
Website
|
||||
-------
|
||||
[uob-hpc.github.io/GPU-STREAM/](uob-hpc.github.io/GPU-STREAM/)
|
||||
[uob-hpc.github.io/BabelStream/](https://uob-hpc.github.io/BabelStream/)
|
||||
|
||||
Usage
|
||||
-----
|
||||
|
||||
CMake 3.2 or above is required.
|
||||
Drivers, compiler and software applicable to whichever implementation you would like to build against. Our build system is designed to only build implementations in programming models that your system supports.
|
||||
Drivers, compiler and software applicable to whichever implementation you would like to build against is required.
|
||||
|
||||
Generate the Makefile with `cmake .`
|
||||
We have supplied a series of Makefiles, one for each programming model, to assist with building.
|
||||
The Makefiles contain common build options, and should be simple to customise for your needs too.
|
||||
|
||||
Android (outdated instructions)
|
||||
------------------
|
||||
General usage is `make -f <Model>.make`
|
||||
Common compiler flags and names can be set by passing a `COMPILER` option to Make, e.g. `make COMPILER=GNU`.
|
||||
Some models allow specifying a CPU or GPU style target, and this can be set by passing a `TARGET` option to Make, e.g. `make TARGET=GPU`.
|
||||
|
||||
Assuming you have a recent Android NDK available, you can use the
|
||||
toolchain that it provides to build GPU-STREAM. You should first
|
||||
use the NDK to generate a standalone toolchain:
|
||||
Pass in extra flags via the `EXTRA_FLAGS` option.
|
||||
|
||||
# Select a directory to install the toolchain to
|
||||
ANDROID_NATIVE_TOOLCHAIN=/path/to/toolchain
|
||||
The binaries are named in the form `<model>-stream`.
|
||||
|
||||
${NDK}/build/tools/make-standalone-toolchain.sh \
|
||||
--platform=android-14 \
|
||||
--toolchain=arm-linux-androideabi-4.8 \
|
||||
--install-dir=${ANDROID_NATIVE_TOOLCHAIN}
|
||||
Building Kokkos
|
||||
---------------
|
||||
|
||||
Make sure that the OpenCL headers and library (libOpenCL.so) are
|
||||
available in `${ANDROID_NATIVE_TOOLCHAIN}/sysroot/usr/`.
|
||||
We use the following command to build Kokkos using the Intel Compiler, specifying the `arch` appropriately, e.g. `KNL`.
|
||||
```
|
||||
../generate_makefile.bash --prefix=<prefix> --with-openmp --with-pthread --arch=<arch> --compiler=icpc --cxxflags=-DKOKKOS_MEMORY_ALIGNMENT=2097152
|
||||
```
|
||||
For building with CUDA support, we use the following command, specifying the `arch` appropriately, e.g. `Kepler35`.
|
||||
```
|
||||
../generate_makefile.bash --prefix=<prefix> --with-cuda --with-openmp --with-pthread --arch=<arch> --with-cuda-options=enable_lambda
|
||||
```
|
||||
|
||||
You should then be able to build GPU-STREAM:
|
||||
|
||||
make CXX=${ANDROID_NATIVE_TOOLCHAIN}/bin/arm-linux-androideabi-g++
|
||||
|
||||
Copy the executable and OpenCL kernels to the device:
|
||||
|
||||
adb push gpu-stream-ocl /data/local/tmp
|
||||
adb push ocl-stream-kernels.cl /data/local/tmp
|
||||
|
||||
Run GPU-STREAM from an adb shell:
|
||||
|
||||
adb shell
|
||||
cd /data/local/tmp
|
||||
|
||||
# Use float if device doesn't support double, and reduce array size
|
||||
./gpu-stream-ocl --float -n 6 -s 10000000
|
||||
Building RAJA
|
||||
-------------
|
||||
|
||||
We use the following command to build RAJA using the Intel Compiler.
|
||||
```
|
||||
cmake .. -DCMAKE_INSTALL_PREFIX=<prefix> -DCMAKE_C_COMPILER=icc -DCMAKE_CXX_COMPILER=icpc -DRAJA_PTR="RAJA_USE_RESTRICT_ALIGNED_PTR" -DCMAKE_BUILD_TYPE=ICCBuild -DRAJA_ENABLE_TESTS=Off
|
||||
```
|
||||
For building with CUDA support, we use the following command.
|
||||
```
|
||||
cmake .. -DCMAKE_INSTALL_PREFIX=<prefix> -DRAJA_PTR="RAJA_USE_RESTRICT_ALIGNED_PTR" -DRAJA_ENABLE_CUDA=1 -DRAJA_ENABLE_TESTS=Off
|
||||
```
|
||||
|
||||
Results
|
||||
-------
|
||||
@ -72,13 +70,17 @@ Sample results can be found in the `results` subdirectory. If you would like to
|
||||
Citing
|
||||
------
|
||||
|
||||
You can view the [Poster and Extended Abstract](http://sc15.supercomputing.org/sites/all/themes/SC15images/tech_poster/tech_poster_pages/post150.html) on GPU-STREAM presented at SC'15. Please cite GPU-STREAM via this reference:
|
||||
Please cite BabelStream via this reference:
|
||||
|
||||
> Deakin T, Price J, Martineau M, McIntosh-Smith S. GPU-STREAM v2.0: Benchmarking the achievable memory bandwidth of many-core processors across diverse parallel programming models. 2016. Paper presented at P^3MA Workshop at ISC High Performance, Frankfurt, Germany.
|
||||
|
||||
**Other GPU-STREAM publications:**
|
||||
**Other BabelStream publications:**
|
||||
|
||||
> Deakin T, McIntosh-Smith S. GPU-STREAM: Benchmarking the achievable memory bandwidth of Graphics Processing Units. 2015. Poster session presented at IEEE/ACM SuperComputing, Austin, United States.
|
||||
You can view the [Poster and Extended Abstract](http://sc15.supercomputing.org/sites/all/themes/SC15images/tech_poster/tech_poster_pages/post150.html).
|
||||
|
||||
> Deakin T, Price J, Martineau M, McIntosh-Smith S. GPU-STREAM: Now in 2D!. 2016. Poster session presented at IEEE/ACM SuperComputing, Salt Lake City, United States.
|
||||
You can view the [Poster and Extended Abstract](http://sc16.supercomputing.org/sc-archive/tech_poster/tech_poster_pages/post139.html).
|
||||
|
||||
|
||||
|
||||
|
||||
12
SYCL.make
Normal file
12
SYCL.make
Normal file
@ -0,0 +1,12 @@
|
||||
|
||||
COMPUTECPP_FLAGS = $(shell computecpp_info --dump-device-compiler-flags)
|
||||
|
||||
sycl-stream: main.cpp SYCLStream.cpp SYCLStream.sycl
|
||||
$(CXX) -O3 -std=c++11 -DSYCL main.cpp SYCLStream.cpp -include SYCLStream.sycl $(EXTRA_FLAGS) -lComputeCpp -lOpenCL -o $@
|
||||
|
||||
SYCLStream.sycl: SYCLStream.cpp
|
||||
compute++ SYCLStream.cpp $(COMPUTECPP_FLAGS) -c
|
||||
|
||||
.PHONY: clean
|
||||
clean:
|
||||
rm -f sycl-stream SYCLStream.sycl SYCLStream.bc
|
||||
@ -1,248 +0,0 @@
|
||||
#.rst:
|
||||
# FindComputeCpp
|
||||
#---------------
|
||||
#
|
||||
# Copyright 2016 Codeplay Software Ltd.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use these files except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
#########################
|
||||
# FindComputeCpp.cmake
|
||||
#########################
|
||||
#
|
||||
# Tools for finding and building with ComputeCpp.
|
||||
#
|
||||
# User must define COMPUTECPP_PACKAGE_ROOT_DIR pointing to the ComputeCpp
|
||||
# installation.
|
||||
#
|
||||
# Latest version of this file can be found at:
|
||||
# https://github.com/codeplaysoftware/computecpp-sdk
|
||||
|
||||
# Require CMake version 3.2.2 or higher
|
||||
cmake_minimum_required(VERSION 3.2.2)
|
||||
|
||||
# Check that a supported host compiler can be found
|
||||
if(CMAKE_COMPILER_IS_GNUCXX)
|
||||
# Require at least gcc 4.8
|
||||
if (CMAKE_CXX_COMPILER_VERSION VERSION_LESS 4.8)
|
||||
message(FATAL_ERROR
|
||||
"host compiler - Not found! (gcc version must be at least 4.8)")
|
||||
# Require the GCC dual ABI to be disabled for 5.1 or higher
|
||||
elseif (CMAKE_CXX_COMPILER_VERSION VERSION_GREATER 5.1)
|
||||
set(COMPUTECPP_DISABLE_GCC_DUAL_ABI "True")
|
||||
message(STATUS
|
||||
"host compiler - gcc ${CMAKE_CXX_COMPILER_VERSION} (note pre 5.1 gcc ABI enabled)")
|
||||
else()
|
||||
message(STATUS "host compiler - gcc ${CMAKE_CXX_COMPILER_VERSION}")
|
||||
endif()
|
||||
elseif ("${CMAKE_CXX_COMPILER_ID}" STREQUAL "Clang")
|
||||
# Require at least clang 3.6
|
||||
if (${CMAKE_CXX_COMPILER_VERSION} VERSION_LESS 3.6)
|
||||
message(FATAL_ERROR
|
||||
"host compiler - Not found! (clang version must be at least 3.6)")
|
||||
else()
|
||||
message(STATUS "host compiler - clang ${CMAKE_CXX_COMPILER_VERSION}")
|
||||
endif()
|
||||
else()
|
||||
message(WARNING
|
||||
"host compiler - Not found! (ComputeCpp supports GCC and Clang, see readme)")
|
||||
endif()
|
||||
|
||||
set(COMPUTECPP_64_BIT_DEFAULT ON)
|
||||
option(COMPUTECPP_64_BIT_CODE "Compile device code in 64 bit mode"
|
||||
${COMPUTECPP_64_BIT_DEFAULT})
|
||||
mark_as_advanced(COMPUTECPP_64_BIT_CODE)
|
||||
|
||||
# Find OpenCL package
|
||||
find_package(OpenCL REQUIRED)
|
||||
|
||||
# Find ComputeCpp packagee
|
||||
if(NOT COMPUTECPP_PACKAGE_ROOT_DIR)
|
||||
message(FATAL_ERROR
|
||||
"ComputeCpp package - Not found! (please set COMPUTECPP_PACKAGE_ROOT_DIR")
|
||||
else()
|
||||
message(STATUS "ComputeCpp package - Found")
|
||||
endif()
|
||||
option(COMPUTECPP_PACKAGE_ROOT_DIR "Path to the ComputeCpp Package")
|
||||
|
||||
# Obtain the path to compute++
|
||||
find_program(COMPUTECPP_DEVICE_COMPILER compute++ PATHS
|
||||
${COMPUTECPP_PACKAGE_ROOT_DIR} PATH_SUFFIXES bin)
|
||||
if (EXISTS ${COMPUTECPP_DEVICE_COMPILER})
|
||||
mark_as_advanced(COMPUTECPP_DEVICE_COMPILER)
|
||||
message(STATUS "compute++ - Found")
|
||||
else()
|
||||
message(FATAL_ERROR "compute++ - Not found! (${COMPUTECPP_DEVICE_COMPILER})")
|
||||
endif()
|
||||
|
||||
# Obtain the path to computecpp_info
|
||||
find_program(COMPUTECPP_INFO_TOOL computecpp_info PATHS
|
||||
${COMPUTECPP_PACKAGE_ROOT_DIR} PATH_SUFFIXES bin)
|
||||
if (EXISTS ${COMPUTECPP_INFO_TOOL})
|
||||
mark_as_advanced(${COMPUTECPP_INFO_TOOL})
|
||||
message(STATUS "computecpp_info - Found")
|
||||
else()
|
||||
message(FATAL_ERROR "computecpp_info - Not found! (${COMPUTECPP_INFO_TOOL})")
|
||||
endif()
|
||||
|
||||
# Obtain the path to the ComputeCpp runtime library
|
||||
find_library(COMPUTECPP_RUNTIME_LIBRARY ComputeCpp PATHS ${COMPUTECPP_PACKAGE_ROOT_DIR}
|
||||
HINTS ${COMPUTECPP_PACKAGE_ROOT_DIR}/lib PATH_SUFFIXES lib
|
||||
DOC "ComputeCpp Runtime Library" NO_DEFAULT_PATH)
|
||||
|
||||
if (EXISTS ${COMPUTECPP_RUNTIME_LIBRARY})
|
||||
mark_as_advanced(COMPUTECPP_RUNTIME_LIBRARY)
|
||||
message(STATUS "libComputeCpp.so - Found")
|
||||
else()
|
||||
message(FATAL_ERROR "libComputeCpp.so - Not found!")
|
||||
endif()
|
||||
|
||||
# Obtain the ComputeCpp include directory
|
||||
set(COMPUTECPP_INCLUDE_DIRECTORY ${COMPUTECPP_PACKAGE_ROOT_DIR}/include/)
|
||||
if (NOT EXISTS ${COMPUTECPP_INCLUDE_DIRECTORY})
|
||||
message(FATAL_ERROR "ComputeCpp includes - Not found!")
|
||||
else()
|
||||
message(STATUS "ComputeCpp includes - Found")
|
||||
endif()
|
||||
|
||||
# Obtain the package version
|
||||
execute_process(COMMAND ${COMPUTECPP_INFO_TOOL} "--dump-version"
|
||||
OUTPUT_VARIABLE COMPUTECPP_PACKAGE_VERSION
|
||||
RESULT_VARIABLE COMPUTECPP_INFO_TOOL_RESULT OUTPUT_STRIP_TRAILING_WHITESPACE)
|
||||
if(NOT COMPUTECPP_INFO_TOOL_RESULT EQUAL "0")
|
||||
message(FATAL_ERROR "Package version - Error obtaining version!")
|
||||
else()
|
||||
mark_as_advanced(COMPUTECPP_PACKAGE_VERSION)
|
||||
message(STATUS "Package version - ${COMPUTECPP_PACKAGE_VERSION}")
|
||||
endif()
|
||||
|
||||
# Obtain the device compiler flags
|
||||
execute_process(COMMAND ${COMPUTECPP_INFO_TOOL} "--dump-device-compiler-flags"
|
||||
OUTPUT_VARIABLE COMPUTECPP_DEVICE_COMPILER_FLAGS
|
||||
RESULT_VARIABLE COMPUTECPP_INFO_TOOL_RESULT OUTPUT_STRIP_TRAILING_WHITESPACE)
|
||||
if(NOT COMPUTECPP_INFO_TOOL_RESULT EQUAL "0")
|
||||
message(FATAL_ERROR "compute++ flags - Error obtaining compute++ flags!")
|
||||
else()
|
||||
mark_as_advanced(COMPUTECPP_COMPILER_FLAGS)
|
||||
message(STATUS "compute++ flags - ${COMPUTECPP_DEVICE_COMPILER_FLAGS}")
|
||||
endif()
|
||||
|
||||
# Check if the platform is supported
|
||||
execute_process(COMMAND ${COMPUTECPP_INFO_TOOL} "--dump-is-supported"
|
||||
OUTPUT_VARIABLE COMPUTECPP_PLATFORM_IS_SUPPORTED
|
||||
RESULT_VARIABLE COMPUTECPP_INFO_TOOL_RESULT OUTPUT_STRIP_TRAILING_WHITESPACE)
|
||||
if(NOT COMPUTECPP_INFO_TOOL_RESULT EQUAL "0")
|
||||
message(FATAL_ERROR "platform - Error checking platform support!")
|
||||
else()
|
||||
mark_as_advanced(COMPUTECPP_PLATFORM_IS_SUPPORTED)
|
||||
if (COMPUTECPP_PLATFORM_IS_SUPPORTED)
|
||||
message(STATUS "platform - your system can support ComputeCpp")
|
||||
else()
|
||||
message(STATUS "platform - your system CANNOT support ComputeCpp")
|
||||
endif()
|
||||
endif()
|
||||
|
||||
####################
|
||||
# __build_sycl
|
||||
####################
|
||||
#
|
||||
# Adds a custom target for running compute++ and adding a dependency for the
|
||||
# resulting integration header.
|
||||
#
|
||||
# targetName : Name of the target.
|
||||
# sourceFile : Source file to be compiled.
|
||||
# binaryDir : Intermediate directory to output the integration header.
|
||||
#
|
||||
function(__build_spir targetName sourceFile binaryDir)
|
||||
|
||||
# Retrieve source file name.
|
||||
get_filename_component(sourceFileName ${sourceFile} NAME)
|
||||
|
||||
# Set the path to the Sycl file.
|
||||
set(outputSyclFile ${binaryDir}/${sourceFileName}.sycl)
|
||||
|
||||
# Add any user-defined include to the device compiler
|
||||
get_property(includeDirectories DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR} PROPERTY
|
||||
INCLUDE_DIRECTORIES)
|
||||
set(device_compiler_includes "")
|
||||
foreach(directory ${includeDirectories})
|
||||
set(device_compiler_includes "-I${directory}" ${device_compiler_includes})
|
||||
endforeach()
|
||||
if (CMAKE_INCLUDE_PATH)
|
||||
foreach(directory ${CMAKE_INCLUDE_PATH})
|
||||
set(device_compiler_includes "-I${directory}"
|
||||
${device_compiler_includes})
|
||||
endforeach()
|
||||
endif()
|
||||
|
||||
# Convert argument list format
|
||||
separate_arguments(COMPUTECPP_DEVICE_COMPILER_FLAGS)
|
||||
|
||||
# Add custom command for running compute++
|
||||
add_custom_command(
|
||||
OUTPUT ${outputSyclFile}
|
||||
COMMAND ${COMPUTECPP_DEVICE_COMPILER}
|
||||
${COMPUTECPP_DEVICE_COMPILER_FLAGS}
|
||||
-isystem ${COMPUTECPP_INCLUDE_DIRECTORY}
|
||||
${COMPUTECPP_PLATFORM_SPECIFIC_ARGS}
|
||||
${device_compiler_includes}
|
||||
-o ${outputSyclFile}
|
||||
-c ${sourceFile}
|
||||
DEPENDS ${sourceFile}
|
||||
WORKING_DIRECTORY ${binaryDir}
|
||||
COMMENT "Building ComputeCpp integration header file ${outputSyclFile}")
|
||||
|
||||
# Add a custom target for the generated integration header
|
||||
add_custom_target(${targetName}_integration_header DEPENDS ${outputSyclFile})
|
||||
|
||||
# Add a dependency on the integration header
|
||||
add_dependencies(${targetName} ${targetName}_integration_header)
|
||||
|
||||
# Force inclusion of the integration header for the host compiler
|
||||
set(compileFlags -include ${outputSyclFile} "-Wall")
|
||||
target_compile_options(${targetName} PUBLIC ${compileFlags})
|
||||
|
||||
# Set the host compiler C++ standard to C++11
|
||||
set_property(TARGET ${targetName} PROPERTY CXX_STANDARD 11)
|
||||
|
||||
# Disable GCC dual ABI on GCC 5.1 and higher
|
||||
if(COMPUTECPP_DISABLE_GCC_DUAL_ABI)
|
||||
set_property(TARGET ${targetName} APPEND PROPERTY COMPILE_DEFINITIONS
|
||||
"_GLIBCXX_USE_CXX11_ABI=0")
|
||||
endif()
|
||||
|
||||
endfunction()
|
||||
|
||||
#######################
|
||||
# add_sycl_to_target
|
||||
#######################
|
||||
#
|
||||
# Adds a SYCL compilation custom command associated with an existing
|
||||
# target and sets a dependancy on that new command.
|
||||
#
|
||||
# targetName : Name of the target to add a SYCL to.
|
||||
# sourceFile : Source file to be compiled for SYCL.
|
||||
# binaryDir : Intermediate directory to output the integration header.
|
||||
#
|
||||
function(add_sycl_to_target targetName sourceFile binaryDir)
|
||||
|
||||
# Add custom target to run compute++ and generate the integration header
|
||||
__build_spir(${targetName} ${sourceFile} ${binaryDir})
|
||||
|
||||
# Link with the ComputeCpp runtime library
|
||||
target_link_libraries(${targetName} PUBLIC ${COMPUTECPP_RUNTIME_LIBRARY}
|
||||
PUBLIC ${OpenCL_LIBRARIES})
|
||||
|
||||
endfunction(add_sycl_to_target)
|
||||
|
||||
4
main.cpp
4
main.cpp
@ -15,7 +15,7 @@
|
||||
#include <iomanip>
|
||||
#include <cstring>
|
||||
|
||||
#define VERSION_STRING "devel"
|
||||
#define VERSION_STRING "3.2"
|
||||
|
||||
#include "Stream.h"
|
||||
|
||||
@ -54,7 +54,7 @@ void parseArguments(int argc, char *argv[]);
|
||||
int main(int argc, char *argv[])
|
||||
{
|
||||
std::cout
|
||||
<< "GPU-STREAM" << std::endl
|
||||
<< "BabelStream" << std::endl
|
||||
<< "Version: " << VERSION_STRING << std::endl
|
||||
<< "Implementation: " << IMPLEMENTATION_STRING << std::endl;
|
||||
|
||||
|
||||
Loading…
Reference in New Issue
Block a user