Merge branch 'master' into cuda-memory

This commit is contained in:
Tom Deakin 2017-05-11 11:28:33 +01:00
commit 3bd65a0716
23 changed files with 485 additions and 562 deletions

19
.gitignore vendored
View File

@ -1,18 +1,19 @@
gpu-stream-cuda
gpu-stream-ocl
gpu-stream-acc
gpu-stream-omp3
gpu-stream-omp45
gpu-stream-sycl
cuda-stream
ocl-stream
omp-stream
acc-stream
raja-stream
kokkos-stream
sycl-stream
hip-stream
*.o
*.bc
*.sycl
*.tar
*.gz
.DS_Store
CMakeCache.txt
CMakeFiles/
cmake_install.cmake
Makefile

View File

@ -1,198 +0,0 @@
cmake_minimum_required(VERSION 3.2)
if(NOT DEFINED HIP_PATH)
if(NOT DEFINED ENV{HIP_PATH})
set(HIP_PATH "/opt/rocm/hip" CACHE PATH "Path to which HIP has been installed")
else()
set(HIP_PATH $ENV{HIP_PATH} CACHE PATH "Path to which HIP has been installed")
endif()
endif()
set(CMAKE_MODULE_PATH "${HIP_PATH}/cmake" ${CMAKE_MODULE_PATH})
set(CMAKE_CXX_STANDARD 11)
set(CMAKE_CXX_STANDARD_REQUIRED ON)
project(gpu-stream)
include(CheckIncludeFileCXX)
include(CheckCXXCompilerFlag)
set(gpu-stream_VERSION_MAJOR 2)
set(gpu-stream_VERSION_MINOR 2)
configure_file(common.h.in common.h)
include_directories(${CMAKE_BINARY_DIR})
# Use 'Release' if no build type specified
if (NOT CMAKE_BUILD_TYPE)
message("No CMAKE_BUILD_TYPE specified, defaulting to 'Release'")
set(CMAKE_BUILD_TYPE "Release")
endif()
# If using the Cray compiler, manually add the C++11 flag because setting the
# standard through CMake as above doesn't set this flag with Cray
if ("${CMAKE_CXX_COMPILER_ID}" MATCHES "Cray")
list(APPEND CMAKE_CXX_FLAGS -hstd=c++11)
endif ()
#-------------------------------------------------------------------------------
# HIP
#-------------------------------------------------------------------------------
find_package(HIP QUIET)
if(${HIP_FOUND})
list(APPEND HIP_HIPCC_FLAGS --std=c++11)
hip_add_executable(gpu-stream-hip main.cpp HIPStream.cu)
target_compile_definitions(gpu-stream-hip PUBLIC HIP)
else()
message("Skipping HIP...")
endif()
#-------------------------------------------------------------------------------
# CUDA
#-------------------------------------------------------------------------------
find_package(CUDA 7.0 QUIET)
set(FLAG True)
if ("${CMAKE_SYSTEM_NAME}" MATCHES "Darwin")
execute_process(COMMAND xcodebuild -version COMMAND head -n 1 OUTPUT_VARIABLE XCODE_VERSION)
if ("${XCODE_VERSION}" MATCHES "Xcode 7.3.1")
message("Xcode version not supported by CUDA")
set(FLAG False)
endif ()
endif ()
if (${FLAG} AND ${CUDA_FOUND})
list(APPEND CUDA_NVCC_FLAGS --std=c++11)
cuda_add_executable(gpu-stream-cuda main.cpp CUDAStream.cu)
target_compile_definitions(gpu-stream-cuda PUBLIC CUDA)
else ()
message("Skipping CUDA...")
endif ()
#-------------------------------------------------------------------------------
# OpenCL
#-------------------------------------------------------------------------------
find_package(OpenCL QUIET)
if (${OpenCL_FOUND})
add_executable(gpu-stream-ocl main.cpp OCLStream.cpp)
target_compile_definitions(gpu-stream-ocl PUBLIC OCL)
target_link_libraries(gpu-stream-ocl ${OpenCL_LIBRARY})
else ()
message("Skipping OpenCL...")
endif ()
#-------------------------------------------------------------------------------
# OpenACC
#-------------------------------------------------------------------------------
# Check compiler supports an OpenACC flag
include(CheckCXXCompilerFlag)
if ("${CMAKE_CXX_COMPILER_ID}" MATCHES "GNU")
CHECK_CXX_COMPILER_FLAG(-fopenacc OPENACC)
if (OPENACC)
list (APPEND CMAKE_EXE_LINKER_FLAGS -fopenacc)
endif ()
elseif ("${CMAKE_CXX_COMPILER_ID}" MATCHES "PGI")
CHECK_CXX_COMPILER_FLAG(-acc OPENACC)
elseif ("${CMAKE_CXX_COMPILER_ID}" MATCHES "Cray")
CHECK_CXX_COMPILER_FLAG(-hacc=openacc OPENACC)
endif ()
if (OPENACC)
add_executable(gpu-stream-acc main.cpp ACCStream.cpp)
target_compile_definitions(gpu-stream-acc PUBLIC ACC)
else ()
message("Skipping OpenACC...")
endif ()
#-------------------------------------------------------------------------------
# OpenMP 3.0
#-------------------------------------------------------------------------------
find_package(OpenMP QUIET)
if (${OpenMP_FOUND})
add_executable(gpu-stream-omp3 main.cpp OMP3Stream.cpp)
target_compile_definitions(gpu-stream-omp3 PUBLIC OMP3)
else ()
message("Skipping OpenMP 3...")
endif ()
#-------------------------------------------------------------------------------
# OpenMP 4.5
#-------------------------------------------------------------------------------
if ("${CMAKE_CXX_COMPILER_ID}" MATCHES "Cray")
if (CMAKE_CXX_COMPILER_VERSION VERSION_GREATER 4.5)
add_executable(gpu-stream-omp45 main.cpp OMP45Stream.cpp)
target_compile_definitions(gpu-stream-omp45 PUBLIC OMP45)
endif ()
endif ()
#-------------------------------------------------------------------------------
# RAJA
#-------------------------------------------------------------------------------
if (RAJA_PATH)
find_package(OpenMP)
find_package(CUDA 7.5)
list(APPEND CUDA_NVCC_FLAGS "-arch compute_35")
list(APPEND CUDA_NVCC_FLAGS --expt-extended-lambda)
list(APPEND CUDA_NVCC_FLAGS -Xcompiler ${OpenMP_CXX_FLAGS})
list(APPEND CUDA_NVCC_FLAGS -DUSE_RAJA)
cuda_include_directories(${RAJA_PATH}/include)
set_source_files_properties(RAJAStream.cpp PROPERTIES CUDA_SOURCE_PROPERTY_FORMAT OBJ)
cuda_add_executable(gpu-stream-raja main.cpp RAJAStream.cpp)
target_compile_definitions(gpu-stream-raja PUBLIC USE_RAJA)
target_link_libraries(gpu-stream-raja "-L${RAJA_PATH}/lib -lRAJA")
else()
message("Skipping RAJA... (use -DRAJA_PATH=/path/to/raja to opt in)")
endif()
#-------------------------------------------------------------------------------
# Kokkos
#-------------------------------------------------------------------------------
if (KOKKOS_PATH)
if ("${CMAKE_SYSTEM_NAME}" MATCHES "Linux")
add_custom_target(gpu-stream-kokkos COMMAND make -f KokkosMakefile KOKKOS_PATH=${KOKKOS_PATH})
else()
message("Skipping Kokkos (requires Linux)")
endif()
else()
message("Skipping Kokkos... (use -DKOKKOS_PATH=/path/to/kokkos to opt in)")
endif()
#-------------------------------------------------------------------------------
# SYCL
#-------------------------------------------------------------------------------
if ("${CMAKE_CXX_COMPILER_ID}" MATCHES "Clang" OR
"${CMAKE_CXX_COMPILER_ID}" MATCHES "GNU")
# Use C++14 if available, otherwise drop back to C++11
check_cxx_compiler_flag("-std=c++14" CXX14)
if (CXX14)
set(CMAKE_REQUIRED_FLAGS "-std=c++14")
else()
set(CMAKE_REQUIRED_FLAGS "-std=c++11")
endif()
endif()
check_include_file_cxx("CL/sycl.hpp" HAS_SYCL)
if (HAS_SYCL)
add_executable(gpu-stream-sycl main.cpp SYCLStream.cpp)
target_compile_definitions(gpu-stream-sycl PUBLIC SYCL)
# The user must define this in order to use FindComputeCpp
if (COMPUTECPP_PACKAGE_ROOT_DIR)
message(STATUS "Using ComputeCpp for SYCL compilation")
include(FindComputeCpp)
include_directories(${COMPUTECPP_INCLUDE_DIRECTORY})
set(SOURCE_NAME "SYCLStream")
target_compile_options(gpu-stream-sycl PUBLIC ${HOST_COMPILER_OPTIONS})
add_sycl_to_target(gpu-stream-sycl ${CMAKE_CURRENT_SOURCE_DIR}/${SOURCE_NAME}.cpp
${CMAKE_CURRENT_BINARY_DIR})
else()
message(STATUS "Using header-only SYCL implementation")
set_property(TARGET gpu-stream-sycl PROPERTY CXX_STANDARD 14)
endif()
else ()
message("Skipping SYCL...")
endif (HAS_SYCL)

10
CUDA.make Normal file
View File

@ -0,0 +1,10 @@
CXXFLAGS=-O3
CUDA_CXX=nvcc
cuda-stream: main.cpp CUDAStream.cu
$(CUDA_CXX) -std=c++11 $(CXXFLAGS) -DCUDA $^ $(EXTRA_FLAGS) -o $@
.PHONY: clean
clean:
rm -f cuda-stream

View File

@ -215,9 +215,7 @@ void CUDAStream<T>::triad()
template <class T>
__global__ void dot_kernel(const T * a, const T * b, T * sum, unsigned int array_size)
{
extern __shared__ __align__(sizeof(T)) unsigned char smem[];
T *tb_sum = reinterpret_cast<T*>(smem);
__shared__ T tb_sum[TBSIZE];
int i = blockDim.x * blockIdx.x + threadIdx.x;
const size_t local_i = threadIdx.x;
@ -242,7 +240,7 @@ __global__ void dot_kernel(const T * a, const T * b, T * sum, unsigned int array
template <class T>
T CUDAStream<T>::dot()
{
dot_kernel<<<DOT_NUM_BLOCKS, TBSIZE, sizeof(T)*TBSIZE>>>(d_a, d_b, d_sum, array_size);
dot_kernel<<<DOT_NUM_BLOCKS, TBSIZE>>>(d_a, d_b, d_sum, array_size);
check_error();
#if defined(MANAGED) || defined(PAGEFAULT)

12
HIP.make Normal file
View File

@ -0,0 +1,12 @@
# TODO: HIP with HCC
HIPCC = hipcc
hip-stream: main.cpp HIPStream.cpp
$(HIPCC) $(CXXFLAGS) -std=c++11 -DHIP $^ $(EXTRA_FLAGS) -o $@
.PHONY: clean
clean:
rm -f hip-stream

View File

@ -9,6 +9,7 @@
#include "hip/hip_runtime.h"
#define TBSIZE 1024
#define DOT_NUM_BLOCKS 256
void check_error(void)
{
@ -47,6 +48,9 @@ HIPStream<T>::HIPStream(const unsigned int ARRAY_SIZE, const int device_index)
array_size = ARRAY_SIZE;
// Allocate the host array for partial sums for dot kernels
sums = (T*)malloc(sizeof(T) * DOT_NUM_BLOCKS);
// Check buffers fit on the device
hipDeviceProp_t props;
hipGetDeviceProperties(&props, 0);
@ -60,20 +64,27 @@ HIPStream<T>::HIPStream(const unsigned int ARRAY_SIZE, const int device_index)
check_error();
hipMalloc(&d_c, ARRAY_SIZE*sizeof(T));
check_error();
hipMalloc(&d_sum, DOT_NUM_BLOCKS*sizeof(T));
check_error();
}
template <class T>
HIPStream<T>::~HIPStream()
{
free(sums);
hipFree(d_a);
check_error();
hipFree(d_b);
check_error();
hipFree(d_c);
check_error();
hipFree(d_sum);
check_error();
}
template <typename T>
__global__ void init_kernel(hipLaunchParm lp, T * a, T * b, T * c, T initA, T initB, T initC)
{
@ -171,6 +182,46 @@ void HIPStream<T>::triad()
check_error();
}
template <class T>
__global__ void dot_kernel(hipLaunchParm lp, const T * a, const T * b, T * sum, unsigned int array_size)
{
__shared__ T tb_sum[TBSIZE];
int i = hipBlockDim_x * hipBlockIdx_x + hipThreadIdx_x;
const size_t local_i = hipThreadIdx_x;
tb_sum[local_i] = 0.0;
for (; i < array_size; i += hipBlockDim_x*hipGridDim_x)
tb_sum[local_i] += a[i] * b[i];
for (int offset = hipBlockDim_x / 2; offset > 0; offset /= 2)
{
__syncthreads();
if (local_i < offset)
{
tb_sum[local_i] += tb_sum[local_i+offset];
}
}
if (local_i == 0)
sum[hipBlockIdx_x] = tb_sum[local_i];
}
template <class T>
T HIPStream<T>::dot()
{
hipLaunchKernel(HIP_KERNEL_NAME(dot_kernel), dim3(DOT_NUM_BLOCKS), dim3(TBSIZE), 0, 0, d_a, d_b, d_sum, array_size);
check_error();
hipMemcpy(sums, d_sum, DOT_NUM_BLOCKS*sizeof(T), hipMemcpyDeviceToHost);
check_error();
T sum = 0.0;
for (int i = 0; i < DOT_NUM_BLOCKS; i++)
sum += sums[i];
return sum;
}
void listDevices(void)
{

View File

@ -21,10 +21,15 @@ class HIPStream : public Stream<T>
protected:
// Size of arrays
unsigned int array_size;
// Host array for partial sums for dot kernel
T *sums;
// Device side pointers to arrays
T *d_a;
T *d_b;
T *d_c;
T *d_sum;
public:
@ -36,6 +41,7 @@ class HIPStream : public Stream<T>
virtual void add() override;
virtual void mul() override;
virtual void triad() override;
virtual T dot() override;
virtual void init_arrays(T initA, T initB, T initC) override;
virtual void read_arrays(std::vector<T>& a, std::vector<T>& b, std::vector<T>& c) override;

View File

@ -39,10 +39,10 @@ void KOKKOSStream<T>::init_arrays(T initA, T initB, T initC)
View<double*, DEVICE> a(*d_a);
View<double*, DEVICE> b(*d_b);
View<double*, DEVICE> c(*d_c);
parallel_for(array_size, KOKKOS_LAMBDA (const int index)
parallel_for(array_size, KOKKOS_LAMBDA (const long index)
{
a[index] = initA;
b[index] - initB;
b[index] = initB;
c[index] = initC;
});
Kokkos::fence();
@ -70,7 +70,7 @@ void KOKKOSStream<T>::copy()
View<double*, DEVICE> b(*d_b);
View<double*, DEVICE> c(*d_c);
parallel_for(array_size, KOKKOS_LAMBDA (const int index)
parallel_for(array_size, KOKKOS_LAMBDA (const long index)
{
c[index] = a[index];
});
@ -85,7 +85,7 @@ void KOKKOSStream<T>::mul()
View<double*, DEVICE> c(*d_c);
const T scalar = startScalar;
parallel_for(array_size, KOKKOS_LAMBDA (const int index)
parallel_for(array_size, KOKKOS_LAMBDA (const long index)
{
b[index] = scalar*c[index];
});
@ -99,7 +99,7 @@ void KOKKOSStream<T>::add()
View<double*, DEVICE> b(*d_b);
View<double*, DEVICE> c(*d_c);
parallel_for(array_size, KOKKOS_LAMBDA (const int index)
parallel_for(array_size, KOKKOS_LAMBDA (const long index)
{
c[index] = a[index] + b[index];
});
@ -114,7 +114,7 @@ void KOKKOSStream<T>::triad()
View<double*, DEVICE> c(*d_c);
const T scalar = startScalar;
parallel_for(array_size, KOKKOS_LAMBDA (const int index)
parallel_for(array_size, KOKKOS_LAMBDA (const long index)
{
a[index] = b[index] + scalar*c[index];
});
@ -129,7 +129,7 @@ T KOKKOSStream<T>::dot()
T sum = 0.0;
parallel_reduce(array_size, KOKKOS_LAMBDA (const int index, double &tmp)
parallel_reduce(array_size, KOKKOS_LAMBDA (const long index, double &tmp)
{
tmp += a[index] * b[index];
}, sum);

45
Kokkos.make Normal file
View File

@ -0,0 +1,45 @@
default: kokkos-stream
include $(KOKKOS_PATH)/Makefile.kokkos
ifndef COMPILER
define compiler_help
Set COMPILER to change flags (defaulting to GNU).
Available compilers are:
GNU INTEL
endef
$(info $(compiler_help))
COMPILER=GNU
endif
COMPILER_GNU = g++
COMPILER_INTEL = icpc -qopt-streaming-stores=always
CXX = $(COMPILER_$(COMPILER))
ifndef TARGET
define target_help
Set TARGET to change to offload device. Defaulting to CPU.
Available targets are:
CPU (default)
GPU
endef
$(info $(target_help))
TARGET=CPU
endif
ifeq ($(TARGET), CPU)
TARGET_DEF = -DKOKKOS_TARGET_CPU
else ifeq ($(TARGET), GPU)
CXX = $(NVCC_WRAPPER)
TARGET_DEF =
endif
kokkos-stream: main.cpp KOKKOSStream.cpp $(KOKKOS_CPP_DEPENDS)
$(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(KOKKOS_LDFLAGS) main.cpp KOKKOSStream.cpp $(KOKKOS_LIBS) -o $@ -DKOKKOS $(TARGET_DEF) -O3 $(EXTRA_FLAGS)
.PHONY: clean
clean:
rm -f kokkos-stream

View File

@ -1,10 +0,0 @@
default: gpu-stream-kokkos
include $(KOKKOS_PATH)/Makefile.kokkos
gpu-stream-kokkos: main.o KOKKOSStream.o
$(CXX) $(KOKKOS_LDFLAGS) $^ $(KOKKOS_LIBS) -o $@ -DKOKKOS -DKOKKOS_TARGET_CPU -O3
%.o:%.cpp $(KOKKOS_CPP_DEPENDS)
$(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) -c $< -DKOKKOS -DKOKKOS_TARGET_CPU -O3

View File

@ -1,11 +0,0 @@
default: gpu-stream-kokkos
include $(KOKKOS_PATH)/Makefile.kokkos
gpu-stream-kokkos: main.o KOKKOSStream.o
$(CXX) $(KOKKOS_LDFLAGS) $^ $(KOKKOS_LIBS) -o $@ -DKOKKOS -O3
%.o:%.cpp $(KOKKOS_CPP_DEPENDS)
$(NVCC_WRAPPER) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) -c $< -DKOKKOS -O3

14
LICENSE
View File

@ -12,22 +12,22 @@
* 3. You are free to publish results obtained from running this
* program, or from works that you derive from this program,
* with the following limitations:
* 3a. In order to be referred to as "GPU-STREAM benchmark results",
* published results must be in conformance to the GPU-STREAM
* 3a. In order to be referred to as "BabelStream benchmark results",
* published results must be in conformance to the BabelStream
* Run Rules published at
* http://github.com/UoB-HPC/GPU-STREAM/wiki/Run-Rules
* http://github.com/UoB-HPC/BabelStream/wiki/Run-Rules
* and incorporated herein by reference.
* The copyright holders retain the
* right to determine conformity with the Run Rules.
* 3b. Results based on modified source code or on runs not in
* accordance with the GPU-STREAM Run Rules must be clearly
* accordance with the BabelStream Run Rules must be clearly
* labelled whenever they are published. Examples of
* proper labelling include:
* "tuned GPU-STREAM benchmark results"
* "based on a variant of the GPU-STREAM benchmark code"
* "tuned BabelStream benchmark results"
* "based on a variant of the BabelStream benchmark code"
* Other comparable, clear and reasonable labelling is
* acceptable.
* 3c. Submission of results to the GPU-STREAM benchmark web site
* 3c. Submission of results to the BabelStream benchmark web site
* is encouraged, but not required.
* 4. Use of this program or creation of derived works based on this
* program constitutes acceptance of these licensing restrictions.

View File

@ -7,6 +7,10 @@
#include "OMPStream.h"
#ifndef ALIGNMENT
#define ALIGNMENT (2*1024*1024) // 2MB
#endif
template <class T>
OMPStream<T>::OMPStream(const unsigned int ARRAY_SIZE, T *a, T *b, T *c, int device)
{
@ -22,9 +26,9 @@ OMPStream<T>::OMPStream(const unsigned int ARRAY_SIZE, T *a, T *b, T *c, int dev
{}
#else
// Allocate on the host
this->a = (T*)malloc(sizeof(T)*array_size);
this->b = (T*)malloc(sizeof(T)*array_size);
this->c = (T*)malloc(sizeof(T)*array_size);
this->a = (T*)aligned_alloc(ALIGNMENT, sizeof(T)*array_size);
this->b = (T*)aligned_alloc(ALIGNMENT, sizeof(T)*array_size);
this->c = (T*)aligned_alloc(ALIGNMENT, sizeof(T)*array_size);
#endif
}

52
OpenACC.make Normal file
View File

@ -0,0 +1,52 @@
ifndef COMPILER
define compiler_help
Set COMPILER to ensure correct flags are set.
Available compilers are:
PGI CRAY
endef
$(info $(compiler_help))
endif
COMPILER_ = $(CXX)
COMPILER_PGI = pgc++
COMPILER_CRAY = CC
FLAGS_ = -O3 -std=c++11
FLAGS_PGI = -std=c++11 -O3 -acc
ifeq ($(COMPILER), PGI)
define target_help
Set a TARGET to ensure PGI targets the correct offload device.
Available targets are:
SNB, IVB, HSW
KEPLER, MAXWELL, PASCAL
HAWAII
endef
ifndef TARGET
$(error $(target_help))
endif
TARGET_FLAGS_SNB = -ta=multicore -tp=sandybridge
TARGET_FLAGS_IVB = -ta=multicore -tp=ivybridge
TARGET_FLAGS_HSW = -ta=multicore -tp=haswell
TARGET_FLAGS_KEPLER = -ta=nvidia:cc35
TARGET_FLAGS_MAXWELL = -ta=nvidia:cc50
TARGET_FLAGS_PASCAL = -ta=nvidia:cc60
TARGET_FLAGS_HAWAII = -ta=radeon:hawaii
ifeq ($(TARGET_FLAGS_$(TARGET)),)
$(error $(target_help))
endif
FLAGS_PGI += $(TARGET_FLAGS_$(TARGET))
endif
FLAGS_CRAY = -hstd=c++11
CXXFLAGS = $(FLAGS_$(COMPILER))
acc-stream: main.cpp ACCStream.cpp
$(COMPILER_$(COMPILER)) $(CXXFLAGS) -DACC $^ $(EXTRA_FLAGS) -o $@
.PHONY: clean
clean:
rm -f acc-stream main.o ACCStream.o

39
OpenCL.make Normal file
View File

@ -0,0 +1,39 @@
ifndef COMPILER
define compiler_help
Set COMPILER to change flags (defaulting to GNU).
Available compilers are:
GNU CLANG INTEL CRAY
endef
$(info $(compiler_help))
COMPILER=GNU
endif
COMPILER_GNU = g++
COMPILER_CLANG = clang++
COMPILER_INTEL = icpc
COMPILER_CRAY = CC
CXX = $(COMPILER_$(COMPILER))
FLAGS_ = -O3 -std=c++11
FLAGS_GNU = -O3 -std=c++11
FLAGS_CLANG = -O3 -std=c++11
FLAGS_INTEL = -O3 -std=c++11
FLAGS_CRAY = -O3 -hstd=c++11
CXXFLAGS=$(FLAGS_$(COMPILER))
PLATFORM = $(shell uname -s)
ifeq ($(PLATFORM), Darwin)
LIBS = -framework OpenCL
else
LIBS = -lOpenCL
endif
ocl-stream: main.cpp OCLStream.cpp
$(CXX) $(CXXFLAGS) -DOCL $^ $(EXTRA_FLAGS) $(LIBS) -o $@
.PHONY: clean
clean:
rm -f ocl-stream

60
OpenMP.make Normal file
View File

@ -0,0 +1,60 @@
ifndef COMPILER
define compiler_help
Set COMPILER to change flags (defaulting to GNU).
Available compilers are:
CLANG CRAY GNU INTEL XL
endef
$(info $(compiler_help))
COMPILER=GNU
endif
ifndef TARGET
define target_help
Set TARGET to change device (defaulting to CPU).
Available targets are:
CPU NVIDIA
endef
$(info $(target_help))
TARGET=CPU
endif
COMPILER_GNU = g++
COMPILER_INTEL = icpc
COMPILER_CRAY = CC
COMPILER_CLANG = clang++
COMPILER_XL = xlc++
CXX = $(COMPILER_$(COMPILER))
FLAGS_GNU = -O3 -std=c++11
FLAGS_INTEL = -O3 -std=c++11 -xHOST -qopt-streaming-stores=always
FLAGS_CRAY = -O3 -hstd=c++11
FLAGS_CLANG = -O3 -std=c++11
FLAGS_XL = -O5 -qarch=pwr8 -qtune=pwr8 -std=c++11
CXXFLAGS = $(FLAGS_$(COMPILER))
# OpenMP flags for CPUs
OMP_GNU_CPU = -fopenmp
OMP_INTEL_CPU = -qopenmp
OMP_CRAY_CPU = -homp
OMP_CLANG_CPU = -fopenmp=libomp
OMP_XL_CPU = -qsmp=omp -qthreaded
# OpenMP flags for NVIDIA
OMP_CRAY_NVIDIA = -DOMP_TARGET_GPU
OMP_CLANG_NVIDIA = -DOMP_TARGET_GPU -fopenmp=libomp -fopenmp-targets=nvptx64-nvidia-cuda
ifndef OMP_$(COMPILER)_$(TARGET)
$(error Targeting $(TARGET) with $(COMPILER) not supported)
endif
OMP = $(OMP_$(COMPILER)_$(TARGET))
omp-stream: main.cpp OMPStream.cpp
$(CXX) $(CXXFLAGS) -DOMP $^ $(OMP) $(EXTRA_FLAGS) -o $@
.PHONY: clean
clean:
rm -f omp-stream

58
RAJA.make Normal file
View File

@ -0,0 +1,58 @@
ifndef TARGET
define target_help
Set TARGET to change to offload device. Defaulting to CPU.
Available targets are:
CPU (default)
GPU
endef
$(info $(target_help))
TARGET=CPU
endif
ifeq ($(TARGET), CPU)
ifndef COMPILER
define compiler_help
Set COMPILER to change flags (defaulting to GNU).
Available compilers are:
INTEL GNU CRAY XL
endef
$(info $(compiler_help))
COMPILER=GNU
endif
CXX_INTEL = icpc
CXX_GNU = g++
CXX_CRAY = CC
CXX_XL = xlc++
CXXFLAGS_INTEL = -O3 -std=c++11 -qopenmp -xHost -qopt-streaming-stores=always
CXXFLAGS_GNU = -O3 -std=c++11 -fopenmp
CXXFLAGS_CRAY = -O3 -hstd=c++11
CXXFLAGS_XL = -O5 -std=c++11 -qarch=pwr8 -qtune=pwr8 -qsmp=omp -qthreaded
CXX = $(CXX_$(COMPILER))
CXXFLAGS = -DRAJA_TARGET_CPU $(CXXFLAGS_$(COMPILER))
else ifeq ($(TARGET), GPU)
CXX = nvcc
ifndef ARCH
define arch_help
Set ARCH to ensure correct GPU architecture.
Example:
ARCH=sm_35
endef
$(error $(arch_help))
endif
CXXFLAGS = --expt-extended-lambda -O3 -std=c++11 -x cu -Xcompiler -fopenmp -arch $(ARCH)
endif
raja-stream: main.cpp RAJAStream.cpp
$(CXX) $(CXXFLAGS) -DUSE_RAJA -I$(RAJA_PATH)/include $^ $(EXTRA_FLAGS) -L$(RAJA_PATH)/lib -lRAJA -o $@
.PHONY: clean
clean:
rm -f raja-stream

View File

@ -10,6 +10,10 @@
using RAJA::forall;
using RAJA::RangeSegment;
#ifndef ALIGNMENT
#define ALIGNMENT (2*1024*1024) // 2MB
#endif
template <class T>
RAJAStream<T>::RAJAStream(const unsigned int ARRAY_SIZE, const int device_index)
: array_size(ARRAY_SIZE)
@ -18,9 +22,9 @@ RAJAStream<T>::RAJAStream(const unsigned int ARRAY_SIZE, const int device_index)
index_set.push_back(seg);
#ifdef RAJA_TARGET_CPU
d_a = new T[ARRAY_SIZE];
d_b = new T[ARRAY_SIZE];
d_c = new T[ARRAY_SIZE];
d_a = (T*)aligned_alloc(ALIGNMENT, sizeof(T)*array_size);
d_b = (T*)aligned_alloc(ALIGNMENT, sizeof(T)*array_size);
d_c = (T*)aligned_alloc(ALIGNMENT, sizeof(T)*array_size);
#else
cudaMallocManaged((void**)&d_a, sizeof(T)*ARRAY_SIZE, cudaMemAttachGlobal);
cudaMallocManaged((void**)&d_b, sizeof(T)*ARRAY_SIZE, cudaMemAttachGlobal);
@ -33,9 +37,9 @@ template <class T>
RAJAStream<T>::~RAJAStream()
{
#ifdef RAJA_TARGET_CPU
delete[] d_a;
delete[] d_b;
delete[] d_c;
free(d_a);
free(d_b);
free(d_c);
#else
cudaFree(d_a);
cudaFree(d_b);
@ -46,10 +50,10 @@ RAJAStream<T>::~RAJAStream()
template <class T>
void RAJAStream<T>::init_arrays(T initA, T initB, T initC)
{
T* a = d_a;
T* b = d_b;
T* c = d_c;
forall<policy>(index_set, [=] RAJA_DEVICE (int index)
T* RAJA_RESTRICT a = d_a;
T* RAJA_RESTRICT b = d_b;
T* RAJA_RESTRICT c = d_c;
forall<policy>(index_set, [=] RAJA_DEVICE (RAJA::Index_type index)
{
a[index] = initA;
b[index] = initB;
@ -69,9 +73,9 @@ void RAJAStream<T>::read_arrays(
template <class T>
void RAJAStream<T>::copy()
{
T* a = d_a;
T* c = d_c;
forall<policy>(index_set, [=] RAJA_DEVICE (int index)
T* RAJA_RESTRICT a = d_a;
T* RAJA_RESTRICT c = d_c;
forall<policy>(index_set, [=] RAJA_DEVICE (RAJA::Index_type index)
{
c[index] = a[index];
});
@ -80,10 +84,10 @@ void RAJAStream<T>::copy()
template <class T>
void RAJAStream<T>::mul()
{
T* b = d_b;
T* c = d_c;
T* RAJA_RESTRICT b = d_b;
T* RAJA_RESTRICT c = d_c;
const T scalar = startScalar;
forall<policy>(index_set, [=] RAJA_DEVICE (int index)
forall<policy>(index_set, [=] RAJA_DEVICE (RAJA::Index_type index)
{
b[index] = scalar*c[index];
});
@ -92,10 +96,10 @@ void RAJAStream<T>::mul()
template <class T>
void RAJAStream<T>::add()
{
T* a = d_a;
T* b = d_b;
T* c = d_c;
forall<policy>(index_set, [=] RAJA_DEVICE (int index)
T* RAJA_RESTRICT a = d_a;
T* RAJA_RESTRICT b = d_b;
T* RAJA_RESTRICT c = d_c;
forall<policy>(index_set, [=] RAJA_DEVICE (RAJA::Index_type index)
{
c[index] = a[index] + b[index];
});
@ -104,11 +108,11 @@ void RAJAStream<T>::add()
template <class T>
void RAJAStream<T>::triad()
{
T* a = d_a;
T* b = d_b;
T* c = d_c;
T* RAJA_RESTRICT a = d_a;
T* RAJA_RESTRICT b = d_b;
T* RAJA_RESTRICT c = d_c;
const T scalar = startScalar;
forall<policy>(index_set, [=] RAJA_DEVICE (int index)
forall<policy>(index_set, [=] RAJA_DEVICE (RAJA::Index_type index)
{
a[index] = b[index] + scalar*c[index];
});
@ -117,12 +121,12 @@ void RAJAStream<T>::triad()
template <class T>
T RAJAStream<T>::dot()
{
T* a = d_a;
T* b = d_b;
T* RAJA_RESTRICT a = d_a;
T* RAJA_RESTRICT b = d_b;
RAJA::ReduceSum<reduce_policy, T> sum(0.0);
forall<policy>(index_set, [=] RAJA_DEVICE (int index)
forall<policy>(index_set, [=] RAJA_DEVICE (RAJA::Index_type index)
{
sum += a[index] * b[index];
});

36
README.android Normal file
View File

@ -0,0 +1,36 @@
Android (outdated instructions)
------------------
Assuming you have a recent Android NDK available, you can use the
toolchain that it provides to build GPU-STREAM. You should first
use the NDK to generate a standalone toolchain:
# Select a directory to install the toolchain to
ANDROID_NATIVE_TOOLCHAIN=/path/to/toolchain
${NDK}/build/tools/make-standalone-toolchain.sh \
--platform=android-14 \
--toolchain=arm-linux-androideabi-4.8 \
--install-dir=${ANDROID_NATIVE_TOOLCHAIN}
Make sure that the OpenCL headers and library (libOpenCL.so) are
available in `${ANDROID_NATIVE_TOOLCHAIN}/sysroot/usr/`.
You should then be able to build GPU-STREAM:
make CXX=${ANDROID_NATIVE_TOOLCHAIN}/bin/arm-linux-androideabi-g++
Copy the executable and OpenCL kernels to the device:
adb push gpu-stream-ocl /data/local/tmp
adb push ocl-stream-kernels.cl /data/local/tmp
Run GPU-STREAM from an adb shell:
adb shell
cd /data/local/tmp
# Use float if device doesn't support double, and reduce array size
./gpu-stream-ocl --float -n 6 -s 10000000

View File

@ -1,4 +1,4 @@
GPU-STREAM
BabelStream
==========
Measure memory transfer rates to/from global device memory on GPUs.
@ -16,53 +16,51 @@ Currently implemented are:
- RAJA
- SYCL
This code was previously called GPU-STREAM.
Website
-------
[uob-hpc.github.io/GPU-STREAM/](uob-hpc.github.io/GPU-STREAM/)
[uob-hpc.github.io/BabelStream/](https://uob-hpc.github.io/BabelStream/)
Usage
-----
CMake 3.2 or above is required.
Drivers, compiler and software applicable to whichever implementation you would like to build against. Our build system is designed to only build implementations in programming models that your system supports.
Drivers, compiler and software applicable to whichever implementation you would like to build against is required.
Generate the Makefile with `cmake .`
We have supplied a series of Makefiles, one for each programming model, to assist with building.
The Makefiles contain common build options, and should be simple to customise for your needs too.
Android (outdated instructions)
------------------
General usage is `make -f <Model>.make`
Common compiler flags and names can be set by passing a `COMPILER` option to Make, e.g. `make COMPILER=GNU`.
Some models allow specifying a CPU or GPU style target, and this can be set by passing a `TARGET` option to Make, e.g. `make TARGET=GPU`.
Assuming you have a recent Android NDK available, you can use the
toolchain that it provides to build GPU-STREAM. You should first
use the NDK to generate a standalone toolchain:
Pass in extra flags via the `EXTRA_FLAGS` option.
# Select a directory to install the toolchain to
ANDROID_NATIVE_TOOLCHAIN=/path/to/toolchain
The binaries are named in the form `<model>-stream`.
${NDK}/build/tools/make-standalone-toolchain.sh \
--platform=android-14 \
--toolchain=arm-linux-androideabi-4.8 \
--install-dir=${ANDROID_NATIVE_TOOLCHAIN}
Building Kokkos
---------------
Make sure that the OpenCL headers and library (libOpenCL.so) are
available in `${ANDROID_NATIVE_TOOLCHAIN}/sysroot/usr/`.
We use the following command to build Kokkos using the Intel Compiler, specifying the `arch` appropriately, e.g. `KNL`.
```
../generate_makefile.bash --prefix=<prefix> --with-openmp --with-pthread --arch=<arch> --compiler=icpc --cxxflags=-DKOKKOS_MEMORY_ALIGNMENT=2097152
```
For building with CUDA support, we use the following command, specifying the `arch` appropriately, e.g. `Kepler35`.
```
../generate_makefile.bash --prefix=<prefix> --with-cuda --with-openmp --with-pthread --arch=<arch> --with-cuda-options=enable_lambda
```
You should then be able to build GPU-STREAM:
make CXX=${ANDROID_NATIVE_TOOLCHAIN}/bin/arm-linux-androideabi-g++
Copy the executable and OpenCL kernels to the device:
adb push gpu-stream-ocl /data/local/tmp
adb push ocl-stream-kernels.cl /data/local/tmp
Run GPU-STREAM from an adb shell:
adb shell
cd /data/local/tmp
# Use float if device doesn't support double, and reduce array size
./gpu-stream-ocl --float -n 6 -s 10000000
Building RAJA
-------------
We use the following command to build RAJA using the Intel Compiler.
```
cmake .. -DCMAKE_INSTALL_PREFIX=<prefix> -DCMAKE_C_COMPILER=icc -DCMAKE_CXX_COMPILER=icpc -DRAJA_PTR="RAJA_USE_RESTRICT_ALIGNED_PTR" -DCMAKE_BUILD_TYPE=ICCBuild -DRAJA_ENABLE_TESTS=Off
```
For building with CUDA support, we use the following command.
```
cmake .. -DCMAKE_INSTALL_PREFIX=<prefix> -DRAJA_PTR="RAJA_USE_RESTRICT_ALIGNED_PTR" -DRAJA_ENABLE_CUDA=1 -DRAJA_ENABLE_TESTS=Off
```
Results
-------
@ -72,13 +70,17 @@ Sample results can be found in the `results` subdirectory. If you would like to
Citing
------
You can view the [Poster and Extended Abstract](http://sc15.supercomputing.org/sites/all/themes/SC15images/tech_poster/tech_poster_pages/post150.html) on GPU-STREAM presented at SC'15. Please cite GPU-STREAM via this reference:
Please cite BabelStream via this reference:
> Deakin T, Price J, Martineau M, McIntosh-Smith S. GPU-STREAM v2.0: Benchmarking the achievable memory bandwidth of many-core processors across diverse parallel programming models. 2016. Paper presented at P^3MA Workshop at ISC High Performance, Frankfurt, Germany.
**Other GPU-STREAM publications:**
**Other BabelStream publications:**
> Deakin T, McIntosh-Smith S. GPU-STREAM: Benchmarking the achievable memory bandwidth of Graphics Processing Units. 2015. Poster session presented at IEEE/ACM SuperComputing, Austin, United States.
You can view the [Poster and Extended Abstract](http://sc15.supercomputing.org/sites/all/themes/SC15images/tech_poster/tech_poster_pages/post150.html).
> Deakin T, Price J, Martineau M, McIntosh-Smith S. GPU-STREAM: Now in 2D!. 2016. Poster session presented at IEEE/ACM SuperComputing, Salt Lake City, United States.
You can view the [Poster and Extended Abstract](http://sc16.supercomputing.org/sc-archive/tech_poster/tech_poster_pages/post139.html).

12
SYCL.make Normal file
View File

@ -0,0 +1,12 @@
COMPUTECPP_FLAGS = $(shell computecpp_info --dump-device-compiler-flags)
sycl-stream: main.cpp SYCLStream.cpp SYCLStream.sycl
$(CXX) -O3 -std=c++11 -DSYCL main.cpp SYCLStream.cpp -include SYCLStream.sycl $(EXTRA_FLAGS) -lComputeCpp -lOpenCL -o $@
SYCLStream.sycl: SYCLStream.cpp
compute++ SYCLStream.cpp $(COMPUTECPP_FLAGS) -c
.PHONY: clean
clean:
rm -f sycl-stream SYCLStream.sycl SYCLStream.bc

View File

@ -1,248 +0,0 @@
#.rst:
# FindComputeCpp
#---------------
#
# Copyright 2016 Codeplay Software Ltd.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use these files except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#########################
# FindComputeCpp.cmake
#########################
#
# Tools for finding and building with ComputeCpp.
#
# User must define COMPUTECPP_PACKAGE_ROOT_DIR pointing to the ComputeCpp
# installation.
#
# Latest version of this file can be found at:
# https://github.com/codeplaysoftware/computecpp-sdk
# Require CMake version 3.2.2 or higher
cmake_minimum_required(VERSION 3.2.2)
# Check that a supported host compiler can be found
if(CMAKE_COMPILER_IS_GNUCXX)
# Require at least gcc 4.8
if (CMAKE_CXX_COMPILER_VERSION VERSION_LESS 4.8)
message(FATAL_ERROR
"host compiler - Not found! (gcc version must be at least 4.8)")
# Require the GCC dual ABI to be disabled for 5.1 or higher
elseif (CMAKE_CXX_COMPILER_VERSION VERSION_GREATER 5.1)
set(COMPUTECPP_DISABLE_GCC_DUAL_ABI "True")
message(STATUS
"host compiler - gcc ${CMAKE_CXX_COMPILER_VERSION} (note pre 5.1 gcc ABI enabled)")
else()
message(STATUS "host compiler - gcc ${CMAKE_CXX_COMPILER_VERSION}")
endif()
elseif ("${CMAKE_CXX_COMPILER_ID}" STREQUAL "Clang")
# Require at least clang 3.6
if (${CMAKE_CXX_COMPILER_VERSION} VERSION_LESS 3.6)
message(FATAL_ERROR
"host compiler - Not found! (clang version must be at least 3.6)")
else()
message(STATUS "host compiler - clang ${CMAKE_CXX_COMPILER_VERSION}")
endif()
else()
message(WARNING
"host compiler - Not found! (ComputeCpp supports GCC and Clang, see readme)")
endif()
set(COMPUTECPP_64_BIT_DEFAULT ON)
option(COMPUTECPP_64_BIT_CODE "Compile device code in 64 bit mode"
${COMPUTECPP_64_BIT_DEFAULT})
mark_as_advanced(COMPUTECPP_64_BIT_CODE)
# Find OpenCL package
find_package(OpenCL REQUIRED)
# Find ComputeCpp packagee
if(NOT COMPUTECPP_PACKAGE_ROOT_DIR)
message(FATAL_ERROR
"ComputeCpp package - Not found! (please set COMPUTECPP_PACKAGE_ROOT_DIR")
else()
message(STATUS "ComputeCpp package - Found")
endif()
option(COMPUTECPP_PACKAGE_ROOT_DIR "Path to the ComputeCpp Package")
# Obtain the path to compute++
find_program(COMPUTECPP_DEVICE_COMPILER compute++ PATHS
${COMPUTECPP_PACKAGE_ROOT_DIR} PATH_SUFFIXES bin)
if (EXISTS ${COMPUTECPP_DEVICE_COMPILER})
mark_as_advanced(COMPUTECPP_DEVICE_COMPILER)
message(STATUS "compute++ - Found")
else()
message(FATAL_ERROR "compute++ - Not found! (${COMPUTECPP_DEVICE_COMPILER})")
endif()
# Obtain the path to computecpp_info
find_program(COMPUTECPP_INFO_TOOL computecpp_info PATHS
${COMPUTECPP_PACKAGE_ROOT_DIR} PATH_SUFFIXES bin)
if (EXISTS ${COMPUTECPP_INFO_TOOL})
mark_as_advanced(${COMPUTECPP_INFO_TOOL})
message(STATUS "computecpp_info - Found")
else()
message(FATAL_ERROR "computecpp_info - Not found! (${COMPUTECPP_INFO_TOOL})")
endif()
# Obtain the path to the ComputeCpp runtime library
find_library(COMPUTECPP_RUNTIME_LIBRARY ComputeCpp PATHS ${COMPUTECPP_PACKAGE_ROOT_DIR}
HINTS ${COMPUTECPP_PACKAGE_ROOT_DIR}/lib PATH_SUFFIXES lib
DOC "ComputeCpp Runtime Library" NO_DEFAULT_PATH)
if (EXISTS ${COMPUTECPP_RUNTIME_LIBRARY})
mark_as_advanced(COMPUTECPP_RUNTIME_LIBRARY)
message(STATUS "libComputeCpp.so - Found")
else()
message(FATAL_ERROR "libComputeCpp.so - Not found!")
endif()
# Obtain the ComputeCpp include directory
set(COMPUTECPP_INCLUDE_DIRECTORY ${COMPUTECPP_PACKAGE_ROOT_DIR}/include/)
if (NOT EXISTS ${COMPUTECPP_INCLUDE_DIRECTORY})
message(FATAL_ERROR "ComputeCpp includes - Not found!")
else()
message(STATUS "ComputeCpp includes - Found")
endif()
# Obtain the package version
execute_process(COMMAND ${COMPUTECPP_INFO_TOOL} "--dump-version"
OUTPUT_VARIABLE COMPUTECPP_PACKAGE_VERSION
RESULT_VARIABLE COMPUTECPP_INFO_TOOL_RESULT OUTPUT_STRIP_TRAILING_WHITESPACE)
if(NOT COMPUTECPP_INFO_TOOL_RESULT EQUAL "0")
message(FATAL_ERROR "Package version - Error obtaining version!")
else()
mark_as_advanced(COMPUTECPP_PACKAGE_VERSION)
message(STATUS "Package version - ${COMPUTECPP_PACKAGE_VERSION}")
endif()
# Obtain the device compiler flags
execute_process(COMMAND ${COMPUTECPP_INFO_TOOL} "--dump-device-compiler-flags"
OUTPUT_VARIABLE COMPUTECPP_DEVICE_COMPILER_FLAGS
RESULT_VARIABLE COMPUTECPP_INFO_TOOL_RESULT OUTPUT_STRIP_TRAILING_WHITESPACE)
if(NOT COMPUTECPP_INFO_TOOL_RESULT EQUAL "0")
message(FATAL_ERROR "compute++ flags - Error obtaining compute++ flags!")
else()
mark_as_advanced(COMPUTECPP_COMPILER_FLAGS)
message(STATUS "compute++ flags - ${COMPUTECPP_DEVICE_COMPILER_FLAGS}")
endif()
# Check if the platform is supported
execute_process(COMMAND ${COMPUTECPP_INFO_TOOL} "--dump-is-supported"
OUTPUT_VARIABLE COMPUTECPP_PLATFORM_IS_SUPPORTED
RESULT_VARIABLE COMPUTECPP_INFO_TOOL_RESULT OUTPUT_STRIP_TRAILING_WHITESPACE)
if(NOT COMPUTECPP_INFO_TOOL_RESULT EQUAL "0")
message(FATAL_ERROR "platform - Error checking platform support!")
else()
mark_as_advanced(COMPUTECPP_PLATFORM_IS_SUPPORTED)
if (COMPUTECPP_PLATFORM_IS_SUPPORTED)
message(STATUS "platform - your system can support ComputeCpp")
else()
message(STATUS "platform - your system CANNOT support ComputeCpp")
endif()
endif()
####################
# __build_sycl
####################
#
# Adds a custom target for running compute++ and adding a dependency for the
# resulting integration header.
#
# targetName : Name of the target.
# sourceFile : Source file to be compiled.
# binaryDir : Intermediate directory to output the integration header.
#
function(__build_spir targetName sourceFile binaryDir)
# Retrieve source file name.
get_filename_component(sourceFileName ${sourceFile} NAME)
# Set the path to the Sycl file.
set(outputSyclFile ${binaryDir}/${sourceFileName}.sycl)
# Add any user-defined include to the device compiler
get_property(includeDirectories DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR} PROPERTY
INCLUDE_DIRECTORIES)
set(device_compiler_includes "")
foreach(directory ${includeDirectories})
set(device_compiler_includes "-I${directory}" ${device_compiler_includes})
endforeach()
if (CMAKE_INCLUDE_PATH)
foreach(directory ${CMAKE_INCLUDE_PATH})
set(device_compiler_includes "-I${directory}"
${device_compiler_includes})
endforeach()
endif()
# Convert argument list format
separate_arguments(COMPUTECPP_DEVICE_COMPILER_FLAGS)
# Add custom command for running compute++
add_custom_command(
OUTPUT ${outputSyclFile}
COMMAND ${COMPUTECPP_DEVICE_COMPILER}
${COMPUTECPP_DEVICE_COMPILER_FLAGS}
-isystem ${COMPUTECPP_INCLUDE_DIRECTORY}
${COMPUTECPP_PLATFORM_SPECIFIC_ARGS}
${device_compiler_includes}
-o ${outputSyclFile}
-c ${sourceFile}
DEPENDS ${sourceFile}
WORKING_DIRECTORY ${binaryDir}
COMMENT "Building ComputeCpp integration header file ${outputSyclFile}")
# Add a custom target for the generated integration header
add_custom_target(${targetName}_integration_header DEPENDS ${outputSyclFile})
# Add a dependency on the integration header
add_dependencies(${targetName} ${targetName}_integration_header)
# Force inclusion of the integration header for the host compiler
set(compileFlags -include ${outputSyclFile} "-Wall")
target_compile_options(${targetName} PUBLIC ${compileFlags})
# Set the host compiler C++ standard to C++11
set_property(TARGET ${targetName} PROPERTY CXX_STANDARD 11)
# Disable GCC dual ABI on GCC 5.1 and higher
if(COMPUTECPP_DISABLE_GCC_DUAL_ABI)
set_property(TARGET ${targetName} APPEND PROPERTY COMPILE_DEFINITIONS
"_GLIBCXX_USE_CXX11_ABI=0")
endif()
endfunction()
#######################
# add_sycl_to_target
#######################
#
# Adds a SYCL compilation custom command associated with an existing
# target and sets a dependancy on that new command.
#
# targetName : Name of the target to add a SYCL to.
# sourceFile : Source file to be compiled for SYCL.
# binaryDir : Intermediate directory to output the integration header.
#
function(add_sycl_to_target targetName sourceFile binaryDir)
# Add custom target to run compute++ and generate the integration header
__build_spir(${targetName} ${sourceFile} ${binaryDir})
# Link with the ComputeCpp runtime library
target_link_libraries(${targetName} PUBLIC ${COMPUTECPP_RUNTIME_LIBRARY}
PUBLIC ${OpenCL_LIBRARIES})
endfunction(add_sycl_to_target)

View File

@ -15,7 +15,7 @@
#include <iomanip>
#include <cstring>
#define VERSION_STRING "devel"
#define VERSION_STRING "3.2"
#include "Stream.h"
@ -54,7 +54,7 @@ void parseArguments(int argc, char *argv[]);
int main(int argc, char *argv[])
{
std::cout
<< "GPU-STREAM" << std::endl
<< "BabelStream" << std::endl
<< "Version: " << VERSION_STRING << std::endl
<< "Implementation: " << IMPLEMENTATION_STRING << std::endl;