Merge branch 'master' into cuda-memory
This commit is contained in:
commit
3bd65a0716
19
.gitignore
vendored
19
.gitignore
vendored
@ -1,18 +1,19 @@
|
|||||||
|
|
||||||
gpu-stream-cuda
|
cuda-stream
|
||||||
gpu-stream-ocl
|
ocl-stream
|
||||||
gpu-stream-acc
|
omp-stream
|
||||||
gpu-stream-omp3
|
acc-stream
|
||||||
gpu-stream-omp45
|
raja-stream
|
||||||
gpu-stream-sycl
|
kokkos-stream
|
||||||
|
sycl-stream
|
||||||
|
hip-stream
|
||||||
|
|
||||||
*.o
|
*.o
|
||||||
|
*.bc
|
||||||
|
*.sycl
|
||||||
*.tar
|
*.tar
|
||||||
*.gz
|
*.gz
|
||||||
|
|
||||||
.DS_Store
|
.DS_Store
|
||||||
|
|
||||||
CMakeCache.txt
|
|
||||||
CMakeFiles/
|
|
||||||
cmake_install.cmake
|
|
||||||
Makefile
|
Makefile
|
||||||
|
|||||||
198
CMakeLists.txt
198
CMakeLists.txt
@ -1,198 +0,0 @@
|
|||||||
|
|
||||||
cmake_minimum_required(VERSION 3.2)
|
|
||||||
|
|
||||||
if(NOT DEFINED HIP_PATH)
|
|
||||||
if(NOT DEFINED ENV{HIP_PATH})
|
|
||||||
set(HIP_PATH "/opt/rocm/hip" CACHE PATH "Path to which HIP has been installed")
|
|
||||||
else()
|
|
||||||
set(HIP_PATH $ENV{HIP_PATH} CACHE PATH "Path to which HIP has been installed")
|
|
||||||
endif()
|
|
||||||
endif()
|
|
||||||
set(CMAKE_MODULE_PATH "${HIP_PATH}/cmake" ${CMAKE_MODULE_PATH})
|
|
||||||
|
|
||||||
|
|
||||||
set(CMAKE_CXX_STANDARD 11)
|
|
||||||
set(CMAKE_CXX_STANDARD_REQUIRED ON)
|
|
||||||
|
|
||||||
project(gpu-stream)
|
|
||||||
|
|
||||||
include(CheckIncludeFileCXX)
|
|
||||||
include(CheckCXXCompilerFlag)
|
|
||||||
|
|
||||||
set(gpu-stream_VERSION_MAJOR 2)
|
|
||||||
set(gpu-stream_VERSION_MINOR 2)
|
|
||||||
|
|
||||||
configure_file(common.h.in common.h)
|
|
||||||
include_directories(${CMAKE_BINARY_DIR})
|
|
||||||
|
|
||||||
# Use 'Release' if no build type specified
|
|
||||||
if (NOT CMAKE_BUILD_TYPE)
|
|
||||||
message("No CMAKE_BUILD_TYPE specified, defaulting to 'Release'")
|
|
||||||
set(CMAKE_BUILD_TYPE "Release")
|
|
||||||
endif()
|
|
||||||
|
|
||||||
# If using the Cray compiler, manually add the C++11 flag because setting the
|
|
||||||
# standard through CMake as above doesn't set this flag with Cray
|
|
||||||
if ("${CMAKE_CXX_COMPILER_ID}" MATCHES "Cray")
|
|
||||||
list(APPEND CMAKE_CXX_FLAGS -hstd=c++11)
|
|
||||||
endif ()
|
|
||||||
|
|
||||||
#-------------------------------------------------------------------------------
|
|
||||||
# HIP
|
|
||||||
#-------------------------------------------------------------------------------
|
|
||||||
find_package(HIP QUIET)
|
|
||||||
if(${HIP_FOUND})
|
|
||||||
list(APPEND HIP_HIPCC_FLAGS --std=c++11)
|
|
||||||
hip_add_executable(gpu-stream-hip main.cpp HIPStream.cu)
|
|
||||||
target_compile_definitions(gpu-stream-hip PUBLIC HIP)
|
|
||||||
else()
|
|
||||||
message("Skipping HIP...")
|
|
||||||
endif()
|
|
||||||
|
|
||||||
#-------------------------------------------------------------------------------
|
|
||||||
# CUDA
|
|
||||||
#-------------------------------------------------------------------------------
|
|
||||||
find_package(CUDA 7.0 QUIET)
|
|
||||||
set(FLAG True)
|
|
||||||
if ("${CMAKE_SYSTEM_NAME}" MATCHES "Darwin")
|
|
||||||
execute_process(COMMAND xcodebuild -version COMMAND head -n 1 OUTPUT_VARIABLE XCODE_VERSION)
|
|
||||||
if ("${XCODE_VERSION}" MATCHES "Xcode 7.3.1")
|
|
||||||
message("Xcode version not supported by CUDA")
|
|
||||||
set(FLAG False)
|
|
||||||
endif ()
|
|
||||||
endif ()
|
|
||||||
if (${FLAG} AND ${CUDA_FOUND})
|
|
||||||
list(APPEND CUDA_NVCC_FLAGS --std=c++11)
|
|
||||||
cuda_add_executable(gpu-stream-cuda main.cpp CUDAStream.cu)
|
|
||||||
target_compile_definitions(gpu-stream-cuda PUBLIC CUDA)
|
|
||||||
else ()
|
|
||||||
message("Skipping CUDA...")
|
|
||||||
endif ()
|
|
||||||
|
|
||||||
#-------------------------------------------------------------------------------
|
|
||||||
# OpenCL
|
|
||||||
#-------------------------------------------------------------------------------
|
|
||||||
find_package(OpenCL QUIET)
|
|
||||||
if (${OpenCL_FOUND})
|
|
||||||
add_executable(gpu-stream-ocl main.cpp OCLStream.cpp)
|
|
||||||
target_compile_definitions(gpu-stream-ocl PUBLIC OCL)
|
|
||||||
target_link_libraries(gpu-stream-ocl ${OpenCL_LIBRARY})
|
|
||||||
else ()
|
|
||||||
message("Skipping OpenCL...")
|
|
||||||
endif ()
|
|
||||||
|
|
||||||
#-------------------------------------------------------------------------------
|
|
||||||
# OpenACC
|
|
||||||
#-------------------------------------------------------------------------------
|
|
||||||
# Check compiler supports an OpenACC flag
|
|
||||||
include(CheckCXXCompilerFlag)
|
|
||||||
if ("${CMAKE_CXX_COMPILER_ID}" MATCHES "GNU")
|
|
||||||
CHECK_CXX_COMPILER_FLAG(-fopenacc OPENACC)
|
|
||||||
if (OPENACC)
|
|
||||||
list (APPEND CMAKE_EXE_LINKER_FLAGS -fopenacc)
|
|
||||||
endif ()
|
|
||||||
elseif ("${CMAKE_CXX_COMPILER_ID}" MATCHES "PGI")
|
|
||||||
CHECK_CXX_COMPILER_FLAG(-acc OPENACC)
|
|
||||||
elseif ("${CMAKE_CXX_COMPILER_ID}" MATCHES "Cray")
|
|
||||||
CHECK_CXX_COMPILER_FLAG(-hacc=openacc OPENACC)
|
|
||||||
endif ()
|
|
||||||
|
|
||||||
if (OPENACC)
|
|
||||||
add_executable(gpu-stream-acc main.cpp ACCStream.cpp)
|
|
||||||
target_compile_definitions(gpu-stream-acc PUBLIC ACC)
|
|
||||||
else ()
|
|
||||||
message("Skipping OpenACC...")
|
|
||||||
endif ()
|
|
||||||
|
|
||||||
#-------------------------------------------------------------------------------
|
|
||||||
# OpenMP 3.0
|
|
||||||
#-------------------------------------------------------------------------------
|
|
||||||
find_package(OpenMP QUIET)
|
|
||||||
if (${OpenMP_FOUND})
|
|
||||||
add_executable(gpu-stream-omp3 main.cpp OMP3Stream.cpp)
|
|
||||||
target_compile_definitions(gpu-stream-omp3 PUBLIC OMP3)
|
|
||||||
else ()
|
|
||||||
message("Skipping OpenMP 3...")
|
|
||||||
endif ()
|
|
||||||
|
|
||||||
#-------------------------------------------------------------------------------
|
|
||||||
# OpenMP 4.5
|
|
||||||
#-------------------------------------------------------------------------------
|
|
||||||
if ("${CMAKE_CXX_COMPILER_ID}" MATCHES "Cray")
|
|
||||||
if (CMAKE_CXX_COMPILER_VERSION VERSION_GREATER 4.5)
|
|
||||||
add_executable(gpu-stream-omp45 main.cpp OMP45Stream.cpp)
|
|
||||||
target_compile_definitions(gpu-stream-omp45 PUBLIC OMP45)
|
|
||||||
endif ()
|
|
||||||
endif ()
|
|
||||||
|
|
||||||
#-------------------------------------------------------------------------------
|
|
||||||
# RAJA
|
|
||||||
#-------------------------------------------------------------------------------
|
|
||||||
|
|
||||||
if (RAJA_PATH)
|
|
||||||
find_package(OpenMP)
|
|
||||||
find_package(CUDA 7.5)
|
|
||||||
list(APPEND CUDA_NVCC_FLAGS "-arch compute_35")
|
|
||||||
list(APPEND CUDA_NVCC_FLAGS --expt-extended-lambda)
|
|
||||||
list(APPEND CUDA_NVCC_FLAGS -Xcompiler ${OpenMP_CXX_FLAGS})
|
|
||||||
list(APPEND CUDA_NVCC_FLAGS -DUSE_RAJA)
|
|
||||||
cuda_include_directories(${RAJA_PATH}/include)
|
|
||||||
set_source_files_properties(RAJAStream.cpp PROPERTIES CUDA_SOURCE_PROPERTY_FORMAT OBJ)
|
|
||||||
cuda_add_executable(gpu-stream-raja main.cpp RAJAStream.cpp)
|
|
||||||
target_compile_definitions(gpu-stream-raja PUBLIC USE_RAJA)
|
|
||||||
target_link_libraries(gpu-stream-raja "-L${RAJA_PATH}/lib -lRAJA")
|
|
||||||
else()
|
|
||||||
message("Skipping RAJA... (use -DRAJA_PATH=/path/to/raja to opt in)")
|
|
||||||
endif()
|
|
||||||
|
|
||||||
#-------------------------------------------------------------------------------
|
|
||||||
# Kokkos
|
|
||||||
#-------------------------------------------------------------------------------
|
|
||||||
if (KOKKOS_PATH)
|
|
||||||
if ("${CMAKE_SYSTEM_NAME}" MATCHES "Linux")
|
|
||||||
add_custom_target(gpu-stream-kokkos COMMAND make -f KokkosMakefile KOKKOS_PATH=${KOKKOS_PATH})
|
|
||||||
else()
|
|
||||||
message("Skipping Kokkos (requires Linux)")
|
|
||||||
endif()
|
|
||||||
else()
|
|
||||||
message("Skipping Kokkos... (use -DKOKKOS_PATH=/path/to/kokkos to opt in)")
|
|
||||||
endif()
|
|
||||||
|
|
||||||
#-------------------------------------------------------------------------------
|
|
||||||
# SYCL
|
|
||||||
#-------------------------------------------------------------------------------
|
|
||||||
if ("${CMAKE_CXX_COMPILER_ID}" MATCHES "Clang" OR
|
|
||||||
"${CMAKE_CXX_COMPILER_ID}" MATCHES "GNU")
|
|
||||||
# Use C++14 if available, otherwise drop back to C++11
|
|
||||||
check_cxx_compiler_flag("-std=c++14" CXX14)
|
|
||||||
if (CXX14)
|
|
||||||
set(CMAKE_REQUIRED_FLAGS "-std=c++14")
|
|
||||||
else()
|
|
||||||
set(CMAKE_REQUIRED_FLAGS "-std=c++11")
|
|
||||||
endif()
|
|
||||||
endif()
|
|
||||||
|
|
||||||
check_include_file_cxx("CL/sycl.hpp" HAS_SYCL)
|
|
||||||
if (HAS_SYCL)
|
|
||||||
add_executable(gpu-stream-sycl main.cpp SYCLStream.cpp)
|
|
||||||
target_compile_definitions(gpu-stream-sycl PUBLIC SYCL)
|
|
||||||
|
|
||||||
# The user must define this in order to use FindComputeCpp
|
|
||||||
if (COMPUTECPP_PACKAGE_ROOT_DIR)
|
|
||||||
message(STATUS "Using ComputeCpp for SYCL compilation")
|
|
||||||
include(FindComputeCpp)
|
|
||||||
|
|
||||||
include_directories(${COMPUTECPP_INCLUDE_DIRECTORY})
|
|
||||||
|
|
||||||
set(SOURCE_NAME "SYCLStream")
|
|
||||||
|
|
||||||
target_compile_options(gpu-stream-sycl PUBLIC ${HOST_COMPILER_OPTIONS})
|
|
||||||
add_sycl_to_target(gpu-stream-sycl ${CMAKE_CURRENT_SOURCE_DIR}/${SOURCE_NAME}.cpp
|
|
||||||
${CMAKE_CURRENT_BINARY_DIR})
|
|
||||||
else()
|
|
||||||
message(STATUS "Using header-only SYCL implementation")
|
|
||||||
set_property(TARGET gpu-stream-sycl PROPERTY CXX_STANDARD 14)
|
|
||||||
endif()
|
|
||||||
else ()
|
|
||||||
message("Skipping SYCL...")
|
|
||||||
endif (HAS_SYCL)
|
|
||||||
10
CUDA.make
Normal file
10
CUDA.make
Normal file
@ -0,0 +1,10 @@
|
|||||||
|
CXXFLAGS=-O3
|
||||||
|
CUDA_CXX=nvcc
|
||||||
|
|
||||||
|
cuda-stream: main.cpp CUDAStream.cu
|
||||||
|
$(CUDA_CXX) -std=c++11 $(CXXFLAGS) -DCUDA $^ $(EXTRA_FLAGS) -o $@
|
||||||
|
|
||||||
|
.PHONY: clean
|
||||||
|
clean:
|
||||||
|
rm -f cuda-stream
|
||||||
|
|
||||||
@ -215,9 +215,7 @@ void CUDAStream<T>::triad()
|
|||||||
template <class T>
|
template <class T>
|
||||||
__global__ void dot_kernel(const T * a, const T * b, T * sum, unsigned int array_size)
|
__global__ void dot_kernel(const T * a, const T * b, T * sum, unsigned int array_size)
|
||||||
{
|
{
|
||||||
|
__shared__ T tb_sum[TBSIZE];
|
||||||
extern __shared__ __align__(sizeof(T)) unsigned char smem[];
|
|
||||||
T *tb_sum = reinterpret_cast<T*>(smem);
|
|
||||||
|
|
||||||
int i = blockDim.x * blockIdx.x + threadIdx.x;
|
int i = blockDim.x * blockIdx.x + threadIdx.x;
|
||||||
const size_t local_i = threadIdx.x;
|
const size_t local_i = threadIdx.x;
|
||||||
@ -242,7 +240,7 @@ __global__ void dot_kernel(const T * a, const T * b, T * sum, unsigned int array
|
|||||||
template <class T>
|
template <class T>
|
||||||
T CUDAStream<T>::dot()
|
T CUDAStream<T>::dot()
|
||||||
{
|
{
|
||||||
dot_kernel<<<DOT_NUM_BLOCKS, TBSIZE, sizeof(T)*TBSIZE>>>(d_a, d_b, d_sum, array_size);
|
dot_kernel<<<DOT_NUM_BLOCKS, TBSIZE>>>(d_a, d_b, d_sum, array_size);
|
||||||
check_error();
|
check_error();
|
||||||
|
|
||||||
#if defined(MANAGED) || defined(PAGEFAULT)
|
#if defined(MANAGED) || defined(PAGEFAULT)
|
||||||
|
|||||||
12
HIP.make
Normal file
12
HIP.make
Normal file
@ -0,0 +1,12 @@
|
|||||||
|
|
||||||
|
# TODO: HIP with HCC
|
||||||
|
|
||||||
|
HIPCC = hipcc
|
||||||
|
|
||||||
|
hip-stream: main.cpp HIPStream.cpp
|
||||||
|
$(HIPCC) $(CXXFLAGS) -std=c++11 -DHIP $^ $(EXTRA_FLAGS) -o $@
|
||||||
|
|
||||||
|
.PHONY: clean
|
||||||
|
clean:
|
||||||
|
rm -f hip-stream
|
||||||
|
|
||||||
@ -9,6 +9,7 @@
|
|||||||
#include "hip/hip_runtime.h"
|
#include "hip/hip_runtime.h"
|
||||||
|
|
||||||
#define TBSIZE 1024
|
#define TBSIZE 1024
|
||||||
|
#define DOT_NUM_BLOCKS 256
|
||||||
|
|
||||||
void check_error(void)
|
void check_error(void)
|
||||||
{
|
{
|
||||||
@ -47,6 +48,9 @@ HIPStream<T>::HIPStream(const unsigned int ARRAY_SIZE, const int device_index)
|
|||||||
|
|
||||||
array_size = ARRAY_SIZE;
|
array_size = ARRAY_SIZE;
|
||||||
|
|
||||||
|
// Allocate the host array for partial sums for dot kernels
|
||||||
|
sums = (T*)malloc(sizeof(T) * DOT_NUM_BLOCKS);
|
||||||
|
|
||||||
// Check buffers fit on the device
|
// Check buffers fit on the device
|
||||||
hipDeviceProp_t props;
|
hipDeviceProp_t props;
|
||||||
hipGetDeviceProperties(&props, 0);
|
hipGetDeviceProperties(&props, 0);
|
||||||
@ -60,20 +64,27 @@ HIPStream<T>::HIPStream(const unsigned int ARRAY_SIZE, const int device_index)
|
|||||||
check_error();
|
check_error();
|
||||||
hipMalloc(&d_c, ARRAY_SIZE*sizeof(T));
|
hipMalloc(&d_c, ARRAY_SIZE*sizeof(T));
|
||||||
check_error();
|
check_error();
|
||||||
|
hipMalloc(&d_sum, DOT_NUM_BLOCKS*sizeof(T));
|
||||||
|
check_error();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
template <class T>
|
template <class T>
|
||||||
HIPStream<T>::~HIPStream()
|
HIPStream<T>::~HIPStream()
|
||||||
{
|
{
|
||||||
|
free(sums);
|
||||||
|
|
||||||
hipFree(d_a);
|
hipFree(d_a);
|
||||||
check_error();
|
check_error();
|
||||||
hipFree(d_b);
|
hipFree(d_b);
|
||||||
check_error();
|
check_error();
|
||||||
hipFree(d_c);
|
hipFree(d_c);
|
||||||
check_error();
|
check_error();
|
||||||
|
hipFree(d_sum);
|
||||||
|
check_error();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
template <typename T>
|
template <typename T>
|
||||||
__global__ void init_kernel(hipLaunchParm lp, T * a, T * b, T * c, T initA, T initB, T initC)
|
__global__ void init_kernel(hipLaunchParm lp, T * a, T * b, T * c, T initA, T initB, T initC)
|
||||||
{
|
{
|
||||||
@ -171,6 +182,46 @@ void HIPStream<T>::triad()
|
|||||||
check_error();
|
check_error();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
template <class T>
|
||||||
|
__global__ void dot_kernel(hipLaunchParm lp, const T * a, const T * b, T * sum, unsigned int array_size)
|
||||||
|
{
|
||||||
|
__shared__ T tb_sum[TBSIZE];
|
||||||
|
|
||||||
|
int i = hipBlockDim_x * hipBlockIdx_x + hipThreadIdx_x;
|
||||||
|
const size_t local_i = hipThreadIdx_x;
|
||||||
|
|
||||||
|
tb_sum[local_i] = 0.0;
|
||||||
|
for (; i < array_size; i += hipBlockDim_x*hipGridDim_x)
|
||||||
|
tb_sum[local_i] += a[i] * b[i];
|
||||||
|
|
||||||
|
for (int offset = hipBlockDim_x / 2; offset > 0; offset /= 2)
|
||||||
|
{
|
||||||
|
__syncthreads();
|
||||||
|
if (local_i < offset)
|
||||||
|
{
|
||||||
|
tb_sum[local_i] += tb_sum[local_i+offset];
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
if (local_i == 0)
|
||||||
|
sum[hipBlockIdx_x] = tb_sum[local_i];
|
||||||
|
}
|
||||||
|
|
||||||
|
template <class T>
|
||||||
|
T HIPStream<T>::dot()
|
||||||
|
{
|
||||||
|
hipLaunchKernel(HIP_KERNEL_NAME(dot_kernel), dim3(DOT_NUM_BLOCKS), dim3(TBSIZE), 0, 0, d_a, d_b, d_sum, array_size);
|
||||||
|
check_error();
|
||||||
|
|
||||||
|
hipMemcpy(sums, d_sum, DOT_NUM_BLOCKS*sizeof(T), hipMemcpyDeviceToHost);
|
||||||
|
check_error();
|
||||||
|
|
||||||
|
T sum = 0.0;
|
||||||
|
for (int i = 0; i < DOT_NUM_BLOCKS; i++)
|
||||||
|
sum += sums[i];
|
||||||
|
|
||||||
|
return sum;
|
||||||
|
}
|
||||||
|
|
||||||
void listDevices(void)
|
void listDevices(void)
|
||||||
{
|
{
|
||||||
@ -21,10 +21,15 @@ class HIPStream : public Stream<T>
|
|||||||
protected:
|
protected:
|
||||||
// Size of arrays
|
// Size of arrays
|
||||||
unsigned int array_size;
|
unsigned int array_size;
|
||||||
|
|
||||||
|
// Host array for partial sums for dot kernel
|
||||||
|
T *sums;
|
||||||
|
|
||||||
// Device side pointers to arrays
|
// Device side pointers to arrays
|
||||||
T *d_a;
|
T *d_a;
|
||||||
T *d_b;
|
T *d_b;
|
||||||
T *d_c;
|
T *d_c;
|
||||||
|
T *d_sum;
|
||||||
|
|
||||||
|
|
||||||
public:
|
public:
|
||||||
@ -36,6 +41,7 @@ class HIPStream : public Stream<T>
|
|||||||
virtual void add() override;
|
virtual void add() override;
|
||||||
virtual void mul() override;
|
virtual void mul() override;
|
||||||
virtual void triad() override;
|
virtual void triad() override;
|
||||||
|
virtual T dot() override;
|
||||||
|
|
||||||
virtual void init_arrays(T initA, T initB, T initC) override;
|
virtual void init_arrays(T initA, T initB, T initC) override;
|
||||||
virtual void read_arrays(std::vector<T>& a, std::vector<T>& b, std::vector<T>& c) override;
|
virtual void read_arrays(std::vector<T>& a, std::vector<T>& b, std::vector<T>& c) override;
|
||||||
|
|||||||
@ -39,10 +39,10 @@ void KOKKOSStream<T>::init_arrays(T initA, T initB, T initC)
|
|||||||
View<double*, DEVICE> a(*d_a);
|
View<double*, DEVICE> a(*d_a);
|
||||||
View<double*, DEVICE> b(*d_b);
|
View<double*, DEVICE> b(*d_b);
|
||||||
View<double*, DEVICE> c(*d_c);
|
View<double*, DEVICE> c(*d_c);
|
||||||
parallel_for(array_size, KOKKOS_LAMBDA (const int index)
|
parallel_for(array_size, KOKKOS_LAMBDA (const long index)
|
||||||
{
|
{
|
||||||
a[index] = initA;
|
a[index] = initA;
|
||||||
b[index] - initB;
|
b[index] = initB;
|
||||||
c[index] = initC;
|
c[index] = initC;
|
||||||
});
|
});
|
||||||
Kokkos::fence();
|
Kokkos::fence();
|
||||||
@ -70,7 +70,7 @@ void KOKKOSStream<T>::copy()
|
|||||||
View<double*, DEVICE> b(*d_b);
|
View<double*, DEVICE> b(*d_b);
|
||||||
View<double*, DEVICE> c(*d_c);
|
View<double*, DEVICE> c(*d_c);
|
||||||
|
|
||||||
parallel_for(array_size, KOKKOS_LAMBDA (const int index)
|
parallel_for(array_size, KOKKOS_LAMBDA (const long index)
|
||||||
{
|
{
|
||||||
c[index] = a[index];
|
c[index] = a[index];
|
||||||
});
|
});
|
||||||
@ -85,7 +85,7 @@ void KOKKOSStream<T>::mul()
|
|||||||
View<double*, DEVICE> c(*d_c);
|
View<double*, DEVICE> c(*d_c);
|
||||||
|
|
||||||
const T scalar = startScalar;
|
const T scalar = startScalar;
|
||||||
parallel_for(array_size, KOKKOS_LAMBDA (const int index)
|
parallel_for(array_size, KOKKOS_LAMBDA (const long index)
|
||||||
{
|
{
|
||||||
b[index] = scalar*c[index];
|
b[index] = scalar*c[index];
|
||||||
});
|
});
|
||||||
@ -99,7 +99,7 @@ void KOKKOSStream<T>::add()
|
|||||||
View<double*, DEVICE> b(*d_b);
|
View<double*, DEVICE> b(*d_b);
|
||||||
View<double*, DEVICE> c(*d_c);
|
View<double*, DEVICE> c(*d_c);
|
||||||
|
|
||||||
parallel_for(array_size, KOKKOS_LAMBDA (const int index)
|
parallel_for(array_size, KOKKOS_LAMBDA (const long index)
|
||||||
{
|
{
|
||||||
c[index] = a[index] + b[index];
|
c[index] = a[index] + b[index];
|
||||||
});
|
});
|
||||||
@ -114,7 +114,7 @@ void KOKKOSStream<T>::triad()
|
|||||||
View<double*, DEVICE> c(*d_c);
|
View<double*, DEVICE> c(*d_c);
|
||||||
|
|
||||||
const T scalar = startScalar;
|
const T scalar = startScalar;
|
||||||
parallel_for(array_size, KOKKOS_LAMBDA (const int index)
|
parallel_for(array_size, KOKKOS_LAMBDA (const long index)
|
||||||
{
|
{
|
||||||
a[index] = b[index] + scalar*c[index];
|
a[index] = b[index] + scalar*c[index];
|
||||||
});
|
});
|
||||||
@ -129,7 +129,7 @@ T KOKKOSStream<T>::dot()
|
|||||||
|
|
||||||
T sum = 0.0;
|
T sum = 0.0;
|
||||||
|
|
||||||
parallel_reduce(array_size, KOKKOS_LAMBDA (const int index, double &tmp)
|
parallel_reduce(array_size, KOKKOS_LAMBDA (const long index, double &tmp)
|
||||||
{
|
{
|
||||||
tmp += a[index] * b[index];
|
tmp += a[index] * b[index];
|
||||||
}, sum);
|
}, sum);
|
||||||
|
|||||||
45
Kokkos.make
Normal file
45
Kokkos.make
Normal file
@ -0,0 +1,45 @@
|
|||||||
|
|
||||||
|
default: kokkos-stream
|
||||||
|
|
||||||
|
include $(KOKKOS_PATH)/Makefile.kokkos
|
||||||
|
|
||||||
|
ifndef COMPILER
|
||||||
|
define compiler_help
|
||||||
|
Set COMPILER to change flags (defaulting to GNU).
|
||||||
|
Available compilers are:
|
||||||
|
GNU INTEL
|
||||||
|
|
||||||
|
endef
|
||||||
|
$(info $(compiler_help))
|
||||||
|
COMPILER=GNU
|
||||||
|
endif
|
||||||
|
|
||||||
|
COMPILER_GNU = g++
|
||||||
|
COMPILER_INTEL = icpc -qopt-streaming-stores=always
|
||||||
|
CXX = $(COMPILER_$(COMPILER))
|
||||||
|
|
||||||
|
ifndef TARGET
|
||||||
|
define target_help
|
||||||
|
Set TARGET to change to offload device. Defaulting to CPU.
|
||||||
|
Available targets are:
|
||||||
|
CPU (default)
|
||||||
|
GPU
|
||||||
|
endef
|
||||||
|
$(info $(target_help))
|
||||||
|
TARGET=CPU
|
||||||
|
endif
|
||||||
|
|
||||||
|
ifeq ($(TARGET), CPU)
|
||||||
|
TARGET_DEF = -DKOKKOS_TARGET_CPU
|
||||||
|
else ifeq ($(TARGET), GPU)
|
||||||
|
CXX = $(NVCC_WRAPPER)
|
||||||
|
TARGET_DEF =
|
||||||
|
endif
|
||||||
|
|
||||||
|
kokkos-stream: main.cpp KOKKOSStream.cpp $(KOKKOS_CPP_DEPENDS)
|
||||||
|
$(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(KOKKOS_LDFLAGS) main.cpp KOKKOSStream.cpp $(KOKKOS_LIBS) -o $@ -DKOKKOS $(TARGET_DEF) -O3 $(EXTRA_FLAGS)
|
||||||
|
|
||||||
|
.PHONY: clean
|
||||||
|
clean:
|
||||||
|
rm -f kokkos-stream
|
||||||
|
|
||||||
@ -1,10 +0,0 @@
|
|||||||
|
|
||||||
default: gpu-stream-kokkos
|
|
||||||
|
|
||||||
include $(KOKKOS_PATH)/Makefile.kokkos
|
|
||||||
|
|
||||||
gpu-stream-kokkos: main.o KOKKOSStream.o
|
|
||||||
$(CXX) $(KOKKOS_LDFLAGS) $^ $(KOKKOS_LIBS) -o $@ -DKOKKOS -DKOKKOS_TARGET_CPU -O3
|
|
||||||
|
|
||||||
%.o:%.cpp $(KOKKOS_CPP_DEPENDS)
|
|
||||||
$(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) -c $< -DKOKKOS -DKOKKOS_TARGET_CPU -O3
|
|
||||||
@ -1,11 +0,0 @@
|
|||||||
|
|
||||||
default: gpu-stream-kokkos
|
|
||||||
|
|
||||||
include $(KOKKOS_PATH)/Makefile.kokkos
|
|
||||||
|
|
||||||
gpu-stream-kokkos: main.o KOKKOSStream.o
|
|
||||||
$(CXX) $(KOKKOS_LDFLAGS) $^ $(KOKKOS_LIBS) -o $@ -DKOKKOS -O3
|
|
||||||
|
|
||||||
%.o:%.cpp $(KOKKOS_CPP_DEPENDS)
|
|
||||||
$(NVCC_WRAPPER) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) -c $< -DKOKKOS -O3
|
|
||||||
|
|
||||||
14
LICENSE
14
LICENSE
@ -12,22 +12,22 @@
|
|||||||
* 3. You are free to publish results obtained from running this
|
* 3. You are free to publish results obtained from running this
|
||||||
* program, or from works that you derive from this program,
|
* program, or from works that you derive from this program,
|
||||||
* with the following limitations:
|
* with the following limitations:
|
||||||
* 3a. In order to be referred to as "GPU-STREAM benchmark results",
|
* 3a. In order to be referred to as "BabelStream benchmark results",
|
||||||
* published results must be in conformance to the GPU-STREAM
|
* published results must be in conformance to the BabelStream
|
||||||
* Run Rules published at
|
* Run Rules published at
|
||||||
* http://github.com/UoB-HPC/GPU-STREAM/wiki/Run-Rules
|
* http://github.com/UoB-HPC/BabelStream/wiki/Run-Rules
|
||||||
* and incorporated herein by reference.
|
* and incorporated herein by reference.
|
||||||
* The copyright holders retain the
|
* The copyright holders retain the
|
||||||
* right to determine conformity with the Run Rules.
|
* right to determine conformity with the Run Rules.
|
||||||
* 3b. Results based on modified source code or on runs not in
|
* 3b. Results based on modified source code or on runs not in
|
||||||
* accordance with the GPU-STREAM Run Rules must be clearly
|
* accordance with the BabelStream Run Rules must be clearly
|
||||||
* labelled whenever they are published. Examples of
|
* labelled whenever they are published. Examples of
|
||||||
* proper labelling include:
|
* proper labelling include:
|
||||||
* "tuned GPU-STREAM benchmark results"
|
* "tuned BabelStream benchmark results"
|
||||||
* "based on a variant of the GPU-STREAM benchmark code"
|
* "based on a variant of the BabelStream benchmark code"
|
||||||
* Other comparable, clear and reasonable labelling is
|
* Other comparable, clear and reasonable labelling is
|
||||||
* acceptable.
|
* acceptable.
|
||||||
* 3c. Submission of results to the GPU-STREAM benchmark web site
|
* 3c. Submission of results to the BabelStream benchmark web site
|
||||||
* is encouraged, but not required.
|
* is encouraged, but not required.
|
||||||
* 4. Use of this program or creation of derived works based on this
|
* 4. Use of this program or creation of derived works based on this
|
||||||
* program constitutes acceptance of these licensing restrictions.
|
* program constitutes acceptance of these licensing restrictions.
|
||||||
|
|||||||
@ -7,6 +7,10 @@
|
|||||||
|
|
||||||
#include "OMPStream.h"
|
#include "OMPStream.h"
|
||||||
|
|
||||||
|
#ifndef ALIGNMENT
|
||||||
|
#define ALIGNMENT (2*1024*1024) // 2MB
|
||||||
|
#endif
|
||||||
|
|
||||||
template <class T>
|
template <class T>
|
||||||
OMPStream<T>::OMPStream(const unsigned int ARRAY_SIZE, T *a, T *b, T *c, int device)
|
OMPStream<T>::OMPStream(const unsigned int ARRAY_SIZE, T *a, T *b, T *c, int device)
|
||||||
{
|
{
|
||||||
@ -22,9 +26,9 @@ OMPStream<T>::OMPStream(const unsigned int ARRAY_SIZE, T *a, T *b, T *c, int dev
|
|||||||
{}
|
{}
|
||||||
#else
|
#else
|
||||||
// Allocate on the host
|
// Allocate on the host
|
||||||
this->a = (T*)malloc(sizeof(T)*array_size);
|
this->a = (T*)aligned_alloc(ALIGNMENT, sizeof(T)*array_size);
|
||||||
this->b = (T*)malloc(sizeof(T)*array_size);
|
this->b = (T*)aligned_alloc(ALIGNMENT, sizeof(T)*array_size);
|
||||||
this->c = (T*)malloc(sizeof(T)*array_size);
|
this->c = (T*)aligned_alloc(ALIGNMENT, sizeof(T)*array_size);
|
||||||
#endif
|
#endif
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
52
OpenACC.make
Normal file
52
OpenACC.make
Normal file
@ -0,0 +1,52 @@
|
|||||||
|
|
||||||
|
ifndef COMPILER
|
||||||
|
define compiler_help
|
||||||
|
Set COMPILER to ensure correct flags are set.
|
||||||
|
Available compilers are:
|
||||||
|
PGI CRAY
|
||||||
|
endef
|
||||||
|
$(info $(compiler_help))
|
||||||
|
endif
|
||||||
|
|
||||||
|
COMPILER_ = $(CXX)
|
||||||
|
COMPILER_PGI = pgc++
|
||||||
|
COMPILER_CRAY = CC
|
||||||
|
|
||||||
|
FLAGS_ = -O3 -std=c++11
|
||||||
|
|
||||||
|
FLAGS_PGI = -std=c++11 -O3 -acc
|
||||||
|
ifeq ($(COMPILER), PGI)
|
||||||
|
define target_help
|
||||||
|
Set a TARGET to ensure PGI targets the correct offload device.
|
||||||
|
Available targets are:
|
||||||
|
SNB, IVB, HSW
|
||||||
|
KEPLER, MAXWELL, PASCAL
|
||||||
|
HAWAII
|
||||||
|
endef
|
||||||
|
ifndef TARGET
|
||||||
|
$(error $(target_help))
|
||||||
|
endif
|
||||||
|
TARGET_FLAGS_SNB = -ta=multicore -tp=sandybridge
|
||||||
|
TARGET_FLAGS_IVB = -ta=multicore -tp=ivybridge
|
||||||
|
TARGET_FLAGS_HSW = -ta=multicore -tp=haswell
|
||||||
|
TARGET_FLAGS_KEPLER = -ta=nvidia:cc35
|
||||||
|
TARGET_FLAGS_MAXWELL = -ta=nvidia:cc50
|
||||||
|
TARGET_FLAGS_PASCAL = -ta=nvidia:cc60
|
||||||
|
TARGET_FLAGS_HAWAII = -ta=radeon:hawaii
|
||||||
|
ifeq ($(TARGET_FLAGS_$(TARGET)),)
|
||||||
|
$(error $(target_help))
|
||||||
|
endif
|
||||||
|
|
||||||
|
FLAGS_PGI += $(TARGET_FLAGS_$(TARGET))
|
||||||
|
|
||||||
|
endif
|
||||||
|
|
||||||
|
FLAGS_CRAY = -hstd=c++11
|
||||||
|
CXXFLAGS = $(FLAGS_$(COMPILER))
|
||||||
|
|
||||||
|
acc-stream: main.cpp ACCStream.cpp
|
||||||
|
$(COMPILER_$(COMPILER)) $(CXXFLAGS) -DACC $^ $(EXTRA_FLAGS) -o $@
|
||||||
|
|
||||||
|
.PHONY: clean
|
||||||
|
clean:
|
||||||
|
rm -f acc-stream main.o ACCStream.o
|
||||||
39
OpenCL.make
Normal file
39
OpenCL.make
Normal file
@ -0,0 +1,39 @@
|
|||||||
|
|
||||||
|
ifndef COMPILER
|
||||||
|
define compiler_help
|
||||||
|
Set COMPILER to change flags (defaulting to GNU).
|
||||||
|
Available compilers are:
|
||||||
|
GNU CLANG INTEL CRAY
|
||||||
|
|
||||||
|
endef
|
||||||
|
$(info $(compiler_help))
|
||||||
|
COMPILER=GNU
|
||||||
|
endif
|
||||||
|
|
||||||
|
COMPILER_GNU = g++
|
||||||
|
COMPILER_CLANG = clang++
|
||||||
|
COMPILER_INTEL = icpc
|
||||||
|
COMPILER_CRAY = CC
|
||||||
|
CXX = $(COMPILER_$(COMPILER))
|
||||||
|
|
||||||
|
FLAGS_ = -O3 -std=c++11
|
||||||
|
FLAGS_GNU = -O3 -std=c++11
|
||||||
|
FLAGS_CLANG = -O3 -std=c++11
|
||||||
|
FLAGS_INTEL = -O3 -std=c++11
|
||||||
|
FLAGS_CRAY = -O3 -hstd=c++11
|
||||||
|
CXXFLAGS=$(FLAGS_$(COMPILER))
|
||||||
|
|
||||||
|
PLATFORM = $(shell uname -s)
|
||||||
|
ifeq ($(PLATFORM), Darwin)
|
||||||
|
LIBS = -framework OpenCL
|
||||||
|
else
|
||||||
|
LIBS = -lOpenCL
|
||||||
|
endif
|
||||||
|
|
||||||
|
ocl-stream: main.cpp OCLStream.cpp
|
||||||
|
$(CXX) $(CXXFLAGS) -DOCL $^ $(EXTRA_FLAGS) $(LIBS) -o $@
|
||||||
|
|
||||||
|
.PHONY: clean
|
||||||
|
clean:
|
||||||
|
rm -f ocl-stream
|
||||||
|
|
||||||
60
OpenMP.make
Normal file
60
OpenMP.make
Normal file
@ -0,0 +1,60 @@
|
|||||||
|
|
||||||
|
ifndef COMPILER
|
||||||
|
define compiler_help
|
||||||
|
Set COMPILER to change flags (defaulting to GNU).
|
||||||
|
Available compilers are:
|
||||||
|
CLANG CRAY GNU INTEL XL
|
||||||
|
|
||||||
|
endef
|
||||||
|
$(info $(compiler_help))
|
||||||
|
COMPILER=GNU
|
||||||
|
endif
|
||||||
|
|
||||||
|
ifndef TARGET
|
||||||
|
define target_help
|
||||||
|
Set TARGET to change device (defaulting to CPU).
|
||||||
|
Available targets are:
|
||||||
|
CPU NVIDIA
|
||||||
|
|
||||||
|
endef
|
||||||
|
$(info $(target_help))
|
||||||
|
TARGET=CPU
|
||||||
|
endif
|
||||||
|
|
||||||
|
COMPILER_GNU = g++
|
||||||
|
COMPILER_INTEL = icpc
|
||||||
|
COMPILER_CRAY = CC
|
||||||
|
COMPILER_CLANG = clang++
|
||||||
|
COMPILER_XL = xlc++
|
||||||
|
CXX = $(COMPILER_$(COMPILER))
|
||||||
|
|
||||||
|
FLAGS_GNU = -O3 -std=c++11
|
||||||
|
FLAGS_INTEL = -O3 -std=c++11 -xHOST -qopt-streaming-stores=always
|
||||||
|
FLAGS_CRAY = -O3 -hstd=c++11
|
||||||
|
FLAGS_CLANG = -O3 -std=c++11
|
||||||
|
FLAGS_XL = -O5 -qarch=pwr8 -qtune=pwr8 -std=c++11
|
||||||
|
CXXFLAGS = $(FLAGS_$(COMPILER))
|
||||||
|
|
||||||
|
# OpenMP flags for CPUs
|
||||||
|
OMP_GNU_CPU = -fopenmp
|
||||||
|
OMP_INTEL_CPU = -qopenmp
|
||||||
|
OMP_CRAY_CPU = -homp
|
||||||
|
OMP_CLANG_CPU = -fopenmp=libomp
|
||||||
|
OMP_XL_CPU = -qsmp=omp -qthreaded
|
||||||
|
|
||||||
|
# OpenMP flags for NVIDIA
|
||||||
|
OMP_CRAY_NVIDIA = -DOMP_TARGET_GPU
|
||||||
|
OMP_CLANG_NVIDIA = -DOMP_TARGET_GPU -fopenmp=libomp -fopenmp-targets=nvptx64-nvidia-cuda
|
||||||
|
|
||||||
|
ifndef OMP_$(COMPILER)_$(TARGET)
|
||||||
|
$(error Targeting $(TARGET) with $(COMPILER) not supported)
|
||||||
|
endif
|
||||||
|
|
||||||
|
OMP = $(OMP_$(COMPILER)_$(TARGET))
|
||||||
|
|
||||||
|
omp-stream: main.cpp OMPStream.cpp
|
||||||
|
$(CXX) $(CXXFLAGS) -DOMP $^ $(OMP) $(EXTRA_FLAGS) -o $@
|
||||||
|
|
||||||
|
.PHONY: clean
|
||||||
|
clean:
|
||||||
|
rm -f omp-stream
|
||||||
58
RAJA.make
Normal file
58
RAJA.make
Normal file
@ -0,0 +1,58 @@
|
|||||||
|
|
||||||
|
ifndef TARGET
|
||||||
|
define target_help
|
||||||
|
Set TARGET to change to offload device. Defaulting to CPU.
|
||||||
|
Available targets are:
|
||||||
|
CPU (default)
|
||||||
|
GPU
|
||||||
|
endef
|
||||||
|
$(info $(target_help))
|
||||||
|
TARGET=CPU
|
||||||
|
endif
|
||||||
|
|
||||||
|
ifeq ($(TARGET), CPU)
|
||||||
|
|
||||||
|
ifndef COMPILER
|
||||||
|
define compiler_help
|
||||||
|
Set COMPILER to change flags (defaulting to GNU).
|
||||||
|
Available compilers are:
|
||||||
|
INTEL GNU CRAY XL
|
||||||
|
endef
|
||||||
|
$(info $(compiler_help))
|
||||||
|
COMPILER=GNU
|
||||||
|
endif
|
||||||
|
|
||||||
|
CXX_INTEL = icpc
|
||||||
|
CXX_GNU = g++
|
||||||
|
CXX_CRAY = CC
|
||||||
|
CXX_XL = xlc++
|
||||||
|
|
||||||
|
CXXFLAGS_INTEL = -O3 -std=c++11 -qopenmp -xHost -qopt-streaming-stores=always
|
||||||
|
CXXFLAGS_GNU = -O3 -std=c++11 -fopenmp
|
||||||
|
CXXFLAGS_CRAY = -O3 -hstd=c++11
|
||||||
|
CXXFLAGS_XL = -O5 -std=c++11 -qarch=pwr8 -qtune=pwr8 -qsmp=omp -qthreaded
|
||||||
|
|
||||||
|
CXX = $(CXX_$(COMPILER))
|
||||||
|
CXXFLAGS = -DRAJA_TARGET_CPU $(CXXFLAGS_$(COMPILER))
|
||||||
|
|
||||||
|
else ifeq ($(TARGET), GPU)
|
||||||
|
CXX = nvcc
|
||||||
|
|
||||||
|
ifndef ARCH
|
||||||
|
define arch_help
|
||||||
|
Set ARCH to ensure correct GPU architecture.
|
||||||
|
Example:
|
||||||
|
ARCH=sm_35
|
||||||
|
endef
|
||||||
|
$(error $(arch_help))
|
||||||
|
endif
|
||||||
|
CXXFLAGS = --expt-extended-lambda -O3 -std=c++11 -x cu -Xcompiler -fopenmp -arch $(ARCH)
|
||||||
|
endif
|
||||||
|
|
||||||
|
raja-stream: main.cpp RAJAStream.cpp
|
||||||
|
$(CXX) $(CXXFLAGS) -DUSE_RAJA -I$(RAJA_PATH)/include $^ $(EXTRA_FLAGS) -L$(RAJA_PATH)/lib -lRAJA -o $@
|
||||||
|
|
||||||
|
.PHONY: clean
|
||||||
|
clean:
|
||||||
|
rm -f raja-stream
|
||||||
|
|
||||||
@ -10,6 +10,10 @@
|
|||||||
using RAJA::forall;
|
using RAJA::forall;
|
||||||
using RAJA::RangeSegment;
|
using RAJA::RangeSegment;
|
||||||
|
|
||||||
|
#ifndef ALIGNMENT
|
||||||
|
#define ALIGNMENT (2*1024*1024) // 2MB
|
||||||
|
#endif
|
||||||
|
|
||||||
template <class T>
|
template <class T>
|
||||||
RAJAStream<T>::RAJAStream(const unsigned int ARRAY_SIZE, const int device_index)
|
RAJAStream<T>::RAJAStream(const unsigned int ARRAY_SIZE, const int device_index)
|
||||||
: array_size(ARRAY_SIZE)
|
: array_size(ARRAY_SIZE)
|
||||||
@ -18,9 +22,9 @@ RAJAStream<T>::RAJAStream(const unsigned int ARRAY_SIZE, const int device_index)
|
|||||||
index_set.push_back(seg);
|
index_set.push_back(seg);
|
||||||
|
|
||||||
#ifdef RAJA_TARGET_CPU
|
#ifdef RAJA_TARGET_CPU
|
||||||
d_a = new T[ARRAY_SIZE];
|
d_a = (T*)aligned_alloc(ALIGNMENT, sizeof(T)*array_size);
|
||||||
d_b = new T[ARRAY_SIZE];
|
d_b = (T*)aligned_alloc(ALIGNMENT, sizeof(T)*array_size);
|
||||||
d_c = new T[ARRAY_SIZE];
|
d_c = (T*)aligned_alloc(ALIGNMENT, sizeof(T)*array_size);
|
||||||
#else
|
#else
|
||||||
cudaMallocManaged((void**)&d_a, sizeof(T)*ARRAY_SIZE, cudaMemAttachGlobal);
|
cudaMallocManaged((void**)&d_a, sizeof(T)*ARRAY_SIZE, cudaMemAttachGlobal);
|
||||||
cudaMallocManaged((void**)&d_b, sizeof(T)*ARRAY_SIZE, cudaMemAttachGlobal);
|
cudaMallocManaged((void**)&d_b, sizeof(T)*ARRAY_SIZE, cudaMemAttachGlobal);
|
||||||
@ -33,9 +37,9 @@ template <class T>
|
|||||||
RAJAStream<T>::~RAJAStream()
|
RAJAStream<T>::~RAJAStream()
|
||||||
{
|
{
|
||||||
#ifdef RAJA_TARGET_CPU
|
#ifdef RAJA_TARGET_CPU
|
||||||
delete[] d_a;
|
free(d_a);
|
||||||
delete[] d_b;
|
free(d_b);
|
||||||
delete[] d_c;
|
free(d_c);
|
||||||
#else
|
#else
|
||||||
cudaFree(d_a);
|
cudaFree(d_a);
|
||||||
cudaFree(d_b);
|
cudaFree(d_b);
|
||||||
@ -46,10 +50,10 @@ RAJAStream<T>::~RAJAStream()
|
|||||||
template <class T>
|
template <class T>
|
||||||
void RAJAStream<T>::init_arrays(T initA, T initB, T initC)
|
void RAJAStream<T>::init_arrays(T initA, T initB, T initC)
|
||||||
{
|
{
|
||||||
T* a = d_a;
|
T* RAJA_RESTRICT a = d_a;
|
||||||
T* b = d_b;
|
T* RAJA_RESTRICT b = d_b;
|
||||||
T* c = d_c;
|
T* RAJA_RESTRICT c = d_c;
|
||||||
forall<policy>(index_set, [=] RAJA_DEVICE (int index)
|
forall<policy>(index_set, [=] RAJA_DEVICE (RAJA::Index_type index)
|
||||||
{
|
{
|
||||||
a[index] = initA;
|
a[index] = initA;
|
||||||
b[index] = initB;
|
b[index] = initB;
|
||||||
@ -69,9 +73,9 @@ void RAJAStream<T>::read_arrays(
|
|||||||
template <class T>
|
template <class T>
|
||||||
void RAJAStream<T>::copy()
|
void RAJAStream<T>::copy()
|
||||||
{
|
{
|
||||||
T* a = d_a;
|
T* RAJA_RESTRICT a = d_a;
|
||||||
T* c = d_c;
|
T* RAJA_RESTRICT c = d_c;
|
||||||
forall<policy>(index_set, [=] RAJA_DEVICE (int index)
|
forall<policy>(index_set, [=] RAJA_DEVICE (RAJA::Index_type index)
|
||||||
{
|
{
|
||||||
c[index] = a[index];
|
c[index] = a[index];
|
||||||
});
|
});
|
||||||
@ -80,10 +84,10 @@ void RAJAStream<T>::copy()
|
|||||||
template <class T>
|
template <class T>
|
||||||
void RAJAStream<T>::mul()
|
void RAJAStream<T>::mul()
|
||||||
{
|
{
|
||||||
T* b = d_b;
|
T* RAJA_RESTRICT b = d_b;
|
||||||
T* c = d_c;
|
T* RAJA_RESTRICT c = d_c;
|
||||||
const T scalar = startScalar;
|
const T scalar = startScalar;
|
||||||
forall<policy>(index_set, [=] RAJA_DEVICE (int index)
|
forall<policy>(index_set, [=] RAJA_DEVICE (RAJA::Index_type index)
|
||||||
{
|
{
|
||||||
b[index] = scalar*c[index];
|
b[index] = scalar*c[index];
|
||||||
});
|
});
|
||||||
@ -92,10 +96,10 @@ void RAJAStream<T>::mul()
|
|||||||
template <class T>
|
template <class T>
|
||||||
void RAJAStream<T>::add()
|
void RAJAStream<T>::add()
|
||||||
{
|
{
|
||||||
T* a = d_a;
|
T* RAJA_RESTRICT a = d_a;
|
||||||
T* b = d_b;
|
T* RAJA_RESTRICT b = d_b;
|
||||||
T* c = d_c;
|
T* RAJA_RESTRICT c = d_c;
|
||||||
forall<policy>(index_set, [=] RAJA_DEVICE (int index)
|
forall<policy>(index_set, [=] RAJA_DEVICE (RAJA::Index_type index)
|
||||||
{
|
{
|
||||||
c[index] = a[index] + b[index];
|
c[index] = a[index] + b[index];
|
||||||
});
|
});
|
||||||
@ -104,11 +108,11 @@ void RAJAStream<T>::add()
|
|||||||
template <class T>
|
template <class T>
|
||||||
void RAJAStream<T>::triad()
|
void RAJAStream<T>::triad()
|
||||||
{
|
{
|
||||||
T* a = d_a;
|
T* RAJA_RESTRICT a = d_a;
|
||||||
T* b = d_b;
|
T* RAJA_RESTRICT b = d_b;
|
||||||
T* c = d_c;
|
T* RAJA_RESTRICT c = d_c;
|
||||||
const T scalar = startScalar;
|
const T scalar = startScalar;
|
||||||
forall<policy>(index_set, [=] RAJA_DEVICE (int index)
|
forall<policy>(index_set, [=] RAJA_DEVICE (RAJA::Index_type index)
|
||||||
{
|
{
|
||||||
a[index] = b[index] + scalar*c[index];
|
a[index] = b[index] + scalar*c[index];
|
||||||
});
|
});
|
||||||
@ -117,12 +121,12 @@ void RAJAStream<T>::triad()
|
|||||||
template <class T>
|
template <class T>
|
||||||
T RAJAStream<T>::dot()
|
T RAJAStream<T>::dot()
|
||||||
{
|
{
|
||||||
T* a = d_a;
|
T* RAJA_RESTRICT a = d_a;
|
||||||
T* b = d_b;
|
T* RAJA_RESTRICT b = d_b;
|
||||||
|
|
||||||
RAJA::ReduceSum<reduce_policy, T> sum(0.0);
|
RAJA::ReduceSum<reduce_policy, T> sum(0.0);
|
||||||
|
|
||||||
forall<policy>(index_set, [=] RAJA_DEVICE (int index)
|
forall<policy>(index_set, [=] RAJA_DEVICE (RAJA::Index_type index)
|
||||||
{
|
{
|
||||||
sum += a[index] * b[index];
|
sum += a[index] * b[index];
|
||||||
});
|
});
|
||||||
|
|||||||
36
README.android
Normal file
36
README.android
Normal file
@ -0,0 +1,36 @@
|
|||||||
|
Android (outdated instructions)
|
||||||
|
------------------
|
||||||
|
|
||||||
|
Assuming you have a recent Android NDK available, you can use the
|
||||||
|
toolchain that it provides to build GPU-STREAM. You should first
|
||||||
|
use the NDK to generate a standalone toolchain:
|
||||||
|
|
||||||
|
# Select a directory to install the toolchain to
|
||||||
|
ANDROID_NATIVE_TOOLCHAIN=/path/to/toolchain
|
||||||
|
|
||||||
|
${NDK}/build/tools/make-standalone-toolchain.sh \
|
||||||
|
--platform=android-14 \
|
||||||
|
--toolchain=arm-linux-androideabi-4.8 \
|
||||||
|
--install-dir=${ANDROID_NATIVE_TOOLCHAIN}
|
||||||
|
|
||||||
|
Make sure that the OpenCL headers and library (libOpenCL.so) are
|
||||||
|
available in `${ANDROID_NATIVE_TOOLCHAIN}/sysroot/usr/`.
|
||||||
|
|
||||||
|
You should then be able to build GPU-STREAM:
|
||||||
|
|
||||||
|
make CXX=${ANDROID_NATIVE_TOOLCHAIN}/bin/arm-linux-androideabi-g++
|
||||||
|
|
||||||
|
Copy the executable and OpenCL kernels to the device:
|
||||||
|
|
||||||
|
adb push gpu-stream-ocl /data/local/tmp
|
||||||
|
adb push ocl-stream-kernels.cl /data/local/tmp
|
||||||
|
|
||||||
|
Run GPU-STREAM from an adb shell:
|
||||||
|
|
||||||
|
adb shell
|
||||||
|
cd /data/local/tmp
|
||||||
|
|
||||||
|
# Use float if device doesn't support double, and reduce array size
|
||||||
|
./gpu-stream-ocl --float -n 6 -s 10000000
|
||||||
|
|
||||||
|
|
||||||
74
README.md
74
README.md
@ -1,4 +1,4 @@
|
|||||||
GPU-STREAM
|
BabelStream
|
||||||
==========
|
==========
|
||||||
|
|
||||||
Measure memory transfer rates to/from global device memory on GPUs.
|
Measure memory transfer rates to/from global device memory on GPUs.
|
||||||
@ -16,53 +16,51 @@ Currently implemented are:
|
|||||||
- RAJA
|
- RAJA
|
||||||
- SYCL
|
- SYCL
|
||||||
|
|
||||||
|
This code was previously called GPU-STREAM.
|
||||||
|
|
||||||
Website
|
Website
|
||||||
-------
|
-------
|
||||||
[uob-hpc.github.io/GPU-STREAM/](uob-hpc.github.io/GPU-STREAM/)
|
[uob-hpc.github.io/BabelStream/](https://uob-hpc.github.io/BabelStream/)
|
||||||
|
|
||||||
Usage
|
Usage
|
||||||
-----
|
-----
|
||||||
|
|
||||||
CMake 3.2 or above is required.
|
Drivers, compiler and software applicable to whichever implementation you would like to build against is required.
|
||||||
Drivers, compiler and software applicable to whichever implementation you would like to build against. Our build system is designed to only build implementations in programming models that your system supports.
|
|
||||||
|
|
||||||
Generate the Makefile with `cmake .`
|
We have supplied a series of Makefiles, one for each programming model, to assist with building.
|
||||||
|
The Makefiles contain common build options, and should be simple to customise for your needs too.
|
||||||
|
|
||||||
Android (outdated instructions)
|
General usage is `make -f <Model>.make`
|
||||||
------------------
|
Common compiler flags and names can be set by passing a `COMPILER` option to Make, e.g. `make COMPILER=GNU`.
|
||||||
|
Some models allow specifying a CPU or GPU style target, and this can be set by passing a `TARGET` option to Make, e.g. `make TARGET=GPU`.
|
||||||
|
|
||||||
Assuming you have a recent Android NDK available, you can use the
|
Pass in extra flags via the `EXTRA_FLAGS` option.
|
||||||
toolchain that it provides to build GPU-STREAM. You should first
|
|
||||||
use the NDK to generate a standalone toolchain:
|
|
||||||
|
|
||||||
# Select a directory to install the toolchain to
|
The binaries are named in the form `<model>-stream`.
|
||||||
ANDROID_NATIVE_TOOLCHAIN=/path/to/toolchain
|
|
||||||
|
|
||||||
${NDK}/build/tools/make-standalone-toolchain.sh \
|
Building Kokkos
|
||||||
--platform=android-14 \
|
---------------
|
||||||
--toolchain=arm-linux-androideabi-4.8 \
|
|
||||||
--install-dir=${ANDROID_NATIVE_TOOLCHAIN}
|
|
||||||
|
|
||||||
Make sure that the OpenCL headers and library (libOpenCL.so) are
|
We use the following command to build Kokkos using the Intel Compiler, specifying the `arch` appropriately, e.g. `KNL`.
|
||||||
available in `${ANDROID_NATIVE_TOOLCHAIN}/sysroot/usr/`.
|
```
|
||||||
|
../generate_makefile.bash --prefix=<prefix> --with-openmp --with-pthread --arch=<arch> --compiler=icpc --cxxflags=-DKOKKOS_MEMORY_ALIGNMENT=2097152
|
||||||
|
```
|
||||||
|
For building with CUDA support, we use the following command, specifying the `arch` appropriately, e.g. `Kepler35`.
|
||||||
|
```
|
||||||
|
../generate_makefile.bash --prefix=<prefix> --with-cuda --with-openmp --with-pthread --arch=<arch> --with-cuda-options=enable_lambda
|
||||||
|
```
|
||||||
|
|
||||||
You should then be able to build GPU-STREAM:
|
Building RAJA
|
||||||
|
-------------
|
||||||
make CXX=${ANDROID_NATIVE_TOOLCHAIN}/bin/arm-linux-androideabi-g++
|
|
||||||
|
|
||||||
Copy the executable and OpenCL kernels to the device:
|
|
||||||
|
|
||||||
adb push gpu-stream-ocl /data/local/tmp
|
|
||||||
adb push ocl-stream-kernels.cl /data/local/tmp
|
|
||||||
|
|
||||||
Run GPU-STREAM from an adb shell:
|
|
||||||
|
|
||||||
adb shell
|
|
||||||
cd /data/local/tmp
|
|
||||||
|
|
||||||
# Use float if device doesn't support double, and reduce array size
|
|
||||||
./gpu-stream-ocl --float -n 6 -s 10000000
|
|
||||||
|
|
||||||
|
We use the following command to build RAJA using the Intel Compiler.
|
||||||
|
```
|
||||||
|
cmake .. -DCMAKE_INSTALL_PREFIX=<prefix> -DCMAKE_C_COMPILER=icc -DCMAKE_CXX_COMPILER=icpc -DRAJA_PTR="RAJA_USE_RESTRICT_ALIGNED_PTR" -DCMAKE_BUILD_TYPE=ICCBuild -DRAJA_ENABLE_TESTS=Off
|
||||||
|
```
|
||||||
|
For building with CUDA support, we use the following command.
|
||||||
|
```
|
||||||
|
cmake .. -DCMAKE_INSTALL_PREFIX=<prefix> -DRAJA_PTR="RAJA_USE_RESTRICT_ALIGNED_PTR" -DRAJA_ENABLE_CUDA=1 -DRAJA_ENABLE_TESTS=Off
|
||||||
|
```
|
||||||
|
|
||||||
Results
|
Results
|
||||||
-------
|
-------
|
||||||
@ -72,13 +70,17 @@ Sample results can be found in the `results` subdirectory. If you would like to
|
|||||||
Citing
|
Citing
|
||||||
------
|
------
|
||||||
|
|
||||||
You can view the [Poster and Extended Abstract](http://sc15.supercomputing.org/sites/all/themes/SC15images/tech_poster/tech_poster_pages/post150.html) on GPU-STREAM presented at SC'15. Please cite GPU-STREAM via this reference:
|
Please cite BabelStream via this reference:
|
||||||
|
|
||||||
> Deakin T, Price J, Martineau M, McIntosh-Smith S. GPU-STREAM v2.0: Benchmarking the achievable memory bandwidth of many-core processors across diverse parallel programming models. 2016. Paper presented at P^3MA Workshop at ISC High Performance, Frankfurt, Germany.
|
> Deakin T, Price J, Martineau M, McIntosh-Smith S. GPU-STREAM v2.0: Benchmarking the achievable memory bandwidth of many-core processors across diverse parallel programming models. 2016. Paper presented at P^3MA Workshop at ISC High Performance, Frankfurt, Germany.
|
||||||
|
|
||||||
**Other GPU-STREAM publications:**
|
**Other BabelStream publications:**
|
||||||
|
|
||||||
> Deakin T, McIntosh-Smith S. GPU-STREAM: Benchmarking the achievable memory bandwidth of Graphics Processing Units. 2015. Poster session presented at IEEE/ACM SuperComputing, Austin, United States.
|
> Deakin T, McIntosh-Smith S. GPU-STREAM: Benchmarking the achievable memory bandwidth of Graphics Processing Units. 2015. Poster session presented at IEEE/ACM SuperComputing, Austin, United States.
|
||||||
|
You can view the [Poster and Extended Abstract](http://sc15.supercomputing.org/sites/all/themes/SC15images/tech_poster/tech_poster_pages/post150.html).
|
||||||
|
|
||||||
|
> Deakin T, Price J, Martineau M, McIntosh-Smith S. GPU-STREAM: Now in 2D!. 2016. Poster session presented at IEEE/ACM SuperComputing, Salt Lake City, United States.
|
||||||
|
You can view the [Poster and Extended Abstract](http://sc16.supercomputing.org/sc-archive/tech_poster/tech_poster_pages/post139.html).
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|||||||
12
SYCL.make
Normal file
12
SYCL.make
Normal file
@ -0,0 +1,12 @@
|
|||||||
|
|
||||||
|
COMPUTECPP_FLAGS = $(shell computecpp_info --dump-device-compiler-flags)
|
||||||
|
|
||||||
|
sycl-stream: main.cpp SYCLStream.cpp SYCLStream.sycl
|
||||||
|
$(CXX) -O3 -std=c++11 -DSYCL main.cpp SYCLStream.cpp -include SYCLStream.sycl $(EXTRA_FLAGS) -lComputeCpp -lOpenCL -o $@
|
||||||
|
|
||||||
|
SYCLStream.sycl: SYCLStream.cpp
|
||||||
|
compute++ SYCLStream.cpp $(COMPUTECPP_FLAGS) -c
|
||||||
|
|
||||||
|
.PHONY: clean
|
||||||
|
clean:
|
||||||
|
rm -f sycl-stream SYCLStream.sycl SYCLStream.bc
|
||||||
@ -1,248 +0,0 @@
|
|||||||
#.rst:
|
|
||||||
# FindComputeCpp
|
|
||||||
#---------------
|
|
||||||
#
|
|
||||||
# Copyright 2016 Codeplay Software Ltd.
|
|
||||||
#
|
|
||||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
||||||
# you may not use these files except in compliance with the License.
|
|
||||||
# You may obtain a copy of the License at
|
|
||||||
#
|
|
||||||
# http://www.apache.org/licenses/LICENSE-2.0
|
|
||||||
#
|
|
||||||
#
|
|
||||||
# Unless required by applicable law or agreed to in writing, software
|
|
||||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
||||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
||||||
# See the License for the specific language governing permissions and
|
|
||||||
# limitations under the License.
|
|
||||||
|
|
||||||
#########################
|
|
||||||
# FindComputeCpp.cmake
|
|
||||||
#########################
|
|
||||||
#
|
|
||||||
# Tools for finding and building with ComputeCpp.
|
|
||||||
#
|
|
||||||
# User must define COMPUTECPP_PACKAGE_ROOT_DIR pointing to the ComputeCpp
|
|
||||||
# installation.
|
|
||||||
#
|
|
||||||
# Latest version of this file can be found at:
|
|
||||||
# https://github.com/codeplaysoftware/computecpp-sdk
|
|
||||||
|
|
||||||
# Require CMake version 3.2.2 or higher
|
|
||||||
cmake_minimum_required(VERSION 3.2.2)
|
|
||||||
|
|
||||||
# Check that a supported host compiler can be found
|
|
||||||
if(CMAKE_COMPILER_IS_GNUCXX)
|
|
||||||
# Require at least gcc 4.8
|
|
||||||
if (CMAKE_CXX_COMPILER_VERSION VERSION_LESS 4.8)
|
|
||||||
message(FATAL_ERROR
|
|
||||||
"host compiler - Not found! (gcc version must be at least 4.8)")
|
|
||||||
# Require the GCC dual ABI to be disabled for 5.1 or higher
|
|
||||||
elseif (CMAKE_CXX_COMPILER_VERSION VERSION_GREATER 5.1)
|
|
||||||
set(COMPUTECPP_DISABLE_GCC_DUAL_ABI "True")
|
|
||||||
message(STATUS
|
|
||||||
"host compiler - gcc ${CMAKE_CXX_COMPILER_VERSION} (note pre 5.1 gcc ABI enabled)")
|
|
||||||
else()
|
|
||||||
message(STATUS "host compiler - gcc ${CMAKE_CXX_COMPILER_VERSION}")
|
|
||||||
endif()
|
|
||||||
elseif ("${CMAKE_CXX_COMPILER_ID}" STREQUAL "Clang")
|
|
||||||
# Require at least clang 3.6
|
|
||||||
if (${CMAKE_CXX_COMPILER_VERSION} VERSION_LESS 3.6)
|
|
||||||
message(FATAL_ERROR
|
|
||||||
"host compiler - Not found! (clang version must be at least 3.6)")
|
|
||||||
else()
|
|
||||||
message(STATUS "host compiler - clang ${CMAKE_CXX_COMPILER_VERSION}")
|
|
||||||
endif()
|
|
||||||
else()
|
|
||||||
message(WARNING
|
|
||||||
"host compiler - Not found! (ComputeCpp supports GCC and Clang, see readme)")
|
|
||||||
endif()
|
|
||||||
|
|
||||||
set(COMPUTECPP_64_BIT_DEFAULT ON)
|
|
||||||
option(COMPUTECPP_64_BIT_CODE "Compile device code in 64 bit mode"
|
|
||||||
${COMPUTECPP_64_BIT_DEFAULT})
|
|
||||||
mark_as_advanced(COMPUTECPP_64_BIT_CODE)
|
|
||||||
|
|
||||||
# Find OpenCL package
|
|
||||||
find_package(OpenCL REQUIRED)
|
|
||||||
|
|
||||||
# Find ComputeCpp packagee
|
|
||||||
if(NOT COMPUTECPP_PACKAGE_ROOT_DIR)
|
|
||||||
message(FATAL_ERROR
|
|
||||||
"ComputeCpp package - Not found! (please set COMPUTECPP_PACKAGE_ROOT_DIR")
|
|
||||||
else()
|
|
||||||
message(STATUS "ComputeCpp package - Found")
|
|
||||||
endif()
|
|
||||||
option(COMPUTECPP_PACKAGE_ROOT_DIR "Path to the ComputeCpp Package")
|
|
||||||
|
|
||||||
# Obtain the path to compute++
|
|
||||||
find_program(COMPUTECPP_DEVICE_COMPILER compute++ PATHS
|
|
||||||
${COMPUTECPP_PACKAGE_ROOT_DIR} PATH_SUFFIXES bin)
|
|
||||||
if (EXISTS ${COMPUTECPP_DEVICE_COMPILER})
|
|
||||||
mark_as_advanced(COMPUTECPP_DEVICE_COMPILER)
|
|
||||||
message(STATUS "compute++ - Found")
|
|
||||||
else()
|
|
||||||
message(FATAL_ERROR "compute++ - Not found! (${COMPUTECPP_DEVICE_COMPILER})")
|
|
||||||
endif()
|
|
||||||
|
|
||||||
# Obtain the path to computecpp_info
|
|
||||||
find_program(COMPUTECPP_INFO_TOOL computecpp_info PATHS
|
|
||||||
${COMPUTECPP_PACKAGE_ROOT_DIR} PATH_SUFFIXES bin)
|
|
||||||
if (EXISTS ${COMPUTECPP_INFO_TOOL})
|
|
||||||
mark_as_advanced(${COMPUTECPP_INFO_TOOL})
|
|
||||||
message(STATUS "computecpp_info - Found")
|
|
||||||
else()
|
|
||||||
message(FATAL_ERROR "computecpp_info - Not found! (${COMPUTECPP_INFO_TOOL})")
|
|
||||||
endif()
|
|
||||||
|
|
||||||
# Obtain the path to the ComputeCpp runtime library
|
|
||||||
find_library(COMPUTECPP_RUNTIME_LIBRARY ComputeCpp PATHS ${COMPUTECPP_PACKAGE_ROOT_DIR}
|
|
||||||
HINTS ${COMPUTECPP_PACKAGE_ROOT_DIR}/lib PATH_SUFFIXES lib
|
|
||||||
DOC "ComputeCpp Runtime Library" NO_DEFAULT_PATH)
|
|
||||||
|
|
||||||
if (EXISTS ${COMPUTECPP_RUNTIME_LIBRARY})
|
|
||||||
mark_as_advanced(COMPUTECPP_RUNTIME_LIBRARY)
|
|
||||||
message(STATUS "libComputeCpp.so - Found")
|
|
||||||
else()
|
|
||||||
message(FATAL_ERROR "libComputeCpp.so - Not found!")
|
|
||||||
endif()
|
|
||||||
|
|
||||||
# Obtain the ComputeCpp include directory
|
|
||||||
set(COMPUTECPP_INCLUDE_DIRECTORY ${COMPUTECPP_PACKAGE_ROOT_DIR}/include/)
|
|
||||||
if (NOT EXISTS ${COMPUTECPP_INCLUDE_DIRECTORY})
|
|
||||||
message(FATAL_ERROR "ComputeCpp includes - Not found!")
|
|
||||||
else()
|
|
||||||
message(STATUS "ComputeCpp includes - Found")
|
|
||||||
endif()
|
|
||||||
|
|
||||||
# Obtain the package version
|
|
||||||
execute_process(COMMAND ${COMPUTECPP_INFO_TOOL} "--dump-version"
|
|
||||||
OUTPUT_VARIABLE COMPUTECPP_PACKAGE_VERSION
|
|
||||||
RESULT_VARIABLE COMPUTECPP_INFO_TOOL_RESULT OUTPUT_STRIP_TRAILING_WHITESPACE)
|
|
||||||
if(NOT COMPUTECPP_INFO_TOOL_RESULT EQUAL "0")
|
|
||||||
message(FATAL_ERROR "Package version - Error obtaining version!")
|
|
||||||
else()
|
|
||||||
mark_as_advanced(COMPUTECPP_PACKAGE_VERSION)
|
|
||||||
message(STATUS "Package version - ${COMPUTECPP_PACKAGE_VERSION}")
|
|
||||||
endif()
|
|
||||||
|
|
||||||
# Obtain the device compiler flags
|
|
||||||
execute_process(COMMAND ${COMPUTECPP_INFO_TOOL} "--dump-device-compiler-flags"
|
|
||||||
OUTPUT_VARIABLE COMPUTECPP_DEVICE_COMPILER_FLAGS
|
|
||||||
RESULT_VARIABLE COMPUTECPP_INFO_TOOL_RESULT OUTPUT_STRIP_TRAILING_WHITESPACE)
|
|
||||||
if(NOT COMPUTECPP_INFO_TOOL_RESULT EQUAL "0")
|
|
||||||
message(FATAL_ERROR "compute++ flags - Error obtaining compute++ flags!")
|
|
||||||
else()
|
|
||||||
mark_as_advanced(COMPUTECPP_COMPILER_FLAGS)
|
|
||||||
message(STATUS "compute++ flags - ${COMPUTECPP_DEVICE_COMPILER_FLAGS}")
|
|
||||||
endif()
|
|
||||||
|
|
||||||
# Check if the platform is supported
|
|
||||||
execute_process(COMMAND ${COMPUTECPP_INFO_TOOL} "--dump-is-supported"
|
|
||||||
OUTPUT_VARIABLE COMPUTECPP_PLATFORM_IS_SUPPORTED
|
|
||||||
RESULT_VARIABLE COMPUTECPP_INFO_TOOL_RESULT OUTPUT_STRIP_TRAILING_WHITESPACE)
|
|
||||||
if(NOT COMPUTECPP_INFO_TOOL_RESULT EQUAL "0")
|
|
||||||
message(FATAL_ERROR "platform - Error checking platform support!")
|
|
||||||
else()
|
|
||||||
mark_as_advanced(COMPUTECPP_PLATFORM_IS_SUPPORTED)
|
|
||||||
if (COMPUTECPP_PLATFORM_IS_SUPPORTED)
|
|
||||||
message(STATUS "platform - your system can support ComputeCpp")
|
|
||||||
else()
|
|
||||||
message(STATUS "platform - your system CANNOT support ComputeCpp")
|
|
||||||
endif()
|
|
||||||
endif()
|
|
||||||
|
|
||||||
####################
|
|
||||||
# __build_sycl
|
|
||||||
####################
|
|
||||||
#
|
|
||||||
# Adds a custom target for running compute++ and adding a dependency for the
|
|
||||||
# resulting integration header.
|
|
||||||
#
|
|
||||||
# targetName : Name of the target.
|
|
||||||
# sourceFile : Source file to be compiled.
|
|
||||||
# binaryDir : Intermediate directory to output the integration header.
|
|
||||||
#
|
|
||||||
function(__build_spir targetName sourceFile binaryDir)
|
|
||||||
|
|
||||||
# Retrieve source file name.
|
|
||||||
get_filename_component(sourceFileName ${sourceFile} NAME)
|
|
||||||
|
|
||||||
# Set the path to the Sycl file.
|
|
||||||
set(outputSyclFile ${binaryDir}/${sourceFileName}.sycl)
|
|
||||||
|
|
||||||
# Add any user-defined include to the device compiler
|
|
||||||
get_property(includeDirectories DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR} PROPERTY
|
|
||||||
INCLUDE_DIRECTORIES)
|
|
||||||
set(device_compiler_includes "")
|
|
||||||
foreach(directory ${includeDirectories})
|
|
||||||
set(device_compiler_includes "-I${directory}" ${device_compiler_includes})
|
|
||||||
endforeach()
|
|
||||||
if (CMAKE_INCLUDE_PATH)
|
|
||||||
foreach(directory ${CMAKE_INCLUDE_PATH})
|
|
||||||
set(device_compiler_includes "-I${directory}"
|
|
||||||
${device_compiler_includes})
|
|
||||||
endforeach()
|
|
||||||
endif()
|
|
||||||
|
|
||||||
# Convert argument list format
|
|
||||||
separate_arguments(COMPUTECPP_DEVICE_COMPILER_FLAGS)
|
|
||||||
|
|
||||||
# Add custom command for running compute++
|
|
||||||
add_custom_command(
|
|
||||||
OUTPUT ${outputSyclFile}
|
|
||||||
COMMAND ${COMPUTECPP_DEVICE_COMPILER}
|
|
||||||
${COMPUTECPP_DEVICE_COMPILER_FLAGS}
|
|
||||||
-isystem ${COMPUTECPP_INCLUDE_DIRECTORY}
|
|
||||||
${COMPUTECPP_PLATFORM_SPECIFIC_ARGS}
|
|
||||||
${device_compiler_includes}
|
|
||||||
-o ${outputSyclFile}
|
|
||||||
-c ${sourceFile}
|
|
||||||
DEPENDS ${sourceFile}
|
|
||||||
WORKING_DIRECTORY ${binaryDir}
|
|
||||||
COMMENT "Building ComputeCpp integration header file ${outputSyclFile}")
|
|
||||||
|
|
||||||
# Add a custom target for the generated integration header
|
|
||||||
add_custom_target(${targetName}_integration_header DEPENDS ${outputSyclFile})
|
|
||||||
|
|
||||||
# Add a dependency on the integration header
|
|
||||||
add_dependencies(${targetName} ${targetName}_integration_header)
|
|
||||||
|
|
||||||
# Force inclusion of the integration header for the host compiler
|
|
||||||
set(compileFlags -include ${outputSyclFile} "-Wall")
|
|
||||||
target_compile_options(${targetName} PUBLIC ${compileFlags})
|
|
||||||
|
|
||||||
# Set the host compiler C++ standard to C++11
|
|
||||||
set_property(TARGET ${targetName} PROPERTY CXX_STANDARD 11)
|
|
||||||
|
|
||||||
# Disable GCC dual ABI on GCC 5.1 and higher
|
|
||||||
if(COMPUTECPP_DISABLE_GCC_DUAL_ABI)
|
|
||||||
set_property(TARGET ${targetName} APPEND PROPERTY COMPILE_DEFINITIONS
|
|
||||||
"_GLIBCXX_USE_CXX11_ABI=0")
|
|
||||||
endif()
|
|
||||||
|
|
||||||
endfunction()
|
|
||||||
|
|
||||||
#######################
|
|
||||||
# add_sycl_to_target
|
|
||||||
#######################
|
|
||||||
#
|
|
||||||
# Adds a SYCL compilation custom command associated with an existing
|
|
||||||
# target and sets a dependancy on that new command.
|
|
||||||
#
|
|
||||||
# targetName : Name of the target to add a SYCL to.
|
|
||||||
# sourceFile : Source file to be compiled for SYCL.
|
|
||||||
# binaryDir : Intermediate directory to output the integration header.
|
|
||||||
#
|
|
||||||
function(add_sycl_to_target targetName sourceFile binaryDir)
|
|
||||||
|
|
||||||
# Add custom target to run compute++ and generate the integration header
|
|
||||||
__build_spir(${targetName} ${sourceFile} ${binaryDir})
|
|
||||||
|
|
||||||
# Link with the ComputeCpp runtime library
|
|
||||||
target_link_libraries(${targetName} PUBLIC ${COMPUTECPP_RUNTIME_LIBRARY}
|
|
||||||
PUBLIC ${OpenCL_LIBRARIES})
|
|
||||||
|
|
||||||
endfunction(add_sycl_to_target)
|
|
||||||
|
|
||||||
4
main.cpp
4
main.cpp
@ -15,7 +15,7 @@
|
|||||||
#include <iomanip>
|
#include <iomanip>
|
||||||
#include <cstring>
|
#include <cstring>
|
||||||
|
|
||||||
#define VERSION_STRING "devel"
|
#define VERSION_STRING "3.2"
|
||||||
|
|
||||||
#include "Stream.h"
|
#include "Stream.h"
|
||||||
|
|
||||||
@ -54,7 +54,7 @@ void parseArguments(int argc, char *argv[]);
|
|||||||
int main(int argc, char *argv[])
|
int main(int argc, char *argv[])
|
||||||
{
|
{
|
||||||
std::cout
|
std::cout
|
||||||
<< "GPU-STREAM" << std::endl
|
<< "BabelStream" << std::endl
|
||||||
<< "Version: " << VERSION_STRING << std::endl
|
<< "Version: " << VERSION_STRING << std::endl
|
||||||
<< "Implementation: " << IMPLEMENTATION_STRING << std::endl;
|
<< "Implementation: " << IMPLEMENTATION_STRING << std::endl;
|
||||||
|
|
||||||
|
|||||||
Loading…
Reference in New Issue
Block a user