Merge branch 'time_init_read' into develop

Tom Lin 2023-10-07 15:09:52 +01:00
commit a27abfe296
28 changed files with 435 additions and 136 deletions

View File

@ -7,6 +7,8 @@ All notable changes to this project will be documented in this file.
- Thrust managed memory.
- HIP managed memory.
- New implementation using SYCL2020 USM (`sycl2020-usm`) and renamed original `sycl2020` to `sycl2020-acc`.
- Data initialisation and read-back timing for all models, including Java, Scala, Julia, and Rust.
- Support for the latest Aparapi (3.0.0) and TornadoVM (0.15.x) for Java.
### Changed
- RAJA CUDA CMake build issues resolved.
@ -17,6 +19,7 @@ All notable changes to this project will be documented in this file.
- Number of thread-blocks in CUDA dot kernel implementation changed to 1024.
- Fix compatibility of `sycl2020` (now `sycl2020-acc`) with hipSYCL.
- Bumped Julia compat to 1.9.
- Bumped Scala to 3.3.1.
- Bumped Rust to 1.74.0-nightly (13e6f24b9 2023-09-23).
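
The headline change here is timing for the two phases that earlier releases never reported: array initialisation and the final device-to-host read-back. Every port in this commit follows the same bracketing pattern; a minimal Java sketch of the idea (names hypothetical, standing in for each model's own init and read implementations):

import java.time.Duration;

final class InitReadTimingSketch {
    // Hypothetical stand-ins for a model's init and read-back phases.
    static void initArrays() { /* fill a, b, c with the starting values */ }
    static void readArrays() { /* copy a, b, c back from the device */ }

    public static void main(String[] args) {
        long start = System.nanoTime();
        initArrays();
        Duration init = Duration.ofNanos(System.nanoTime() - start);

        start = System.nanoTime();
        readArrays();
        Duration read = Duration.ofNanos(System.nanoTime() - start);

        // Both phases move 3 * sizeof(T) * arraysize bytes, so each is also
        // reported as a bandwidth: megaScale * totalBytes / seconds.
        System.out.printf("Init: %.5f s%nRead: %.5f s%n",
                init.toNanos() / 1e9, read.toNanos() / 1e9);
    }
}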

View File

@ -12,7 +12,7 @@
<properties>
<project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
<project.reporting.outputEncoding>UTF-8</project.reporting.outputEncoding>
<junit.version>5.7.2</junit.version>
<junit.version>5.9.2</junit.version>
</properties>
<repositories>
@ -27,19 +27,19 @@
<dependency>
<groupId>com.beust</groupId>
<artifactId>jcommander</artifactId>
<version>1.81</version>
<version>1.82</version>
</dependency>
<dependency>
<groupId>tornado</groupId>
<artifactId>tornado-api</artifactId>
<version>0.9</version>
<version>0.15.1</version>
</dependency>
<dependency>
<groupId>com.aparapi</groupId>
<artifactId>aparapi</artifactId>
<version>2.0.0</version>
<version>3.0.0</version>
<exclusions>
<!-- don't pull in the entire Scala ecosystem! -->
<exclusion>

View File

@ -56,7 +56,7 @@ public abstract class JavaStream<T> {
protected abstract T dot();
protected abstract Data<T> data();
protected abstract Data<T> readArrays();
public static class EnumeratedStream<T> extends JavaStream<T> {
@ -113,8 +113,8 @@ public abstract class JavaStream<T> {
}
@Override
public Data<T> data() {
return actual.data();
public Data<T> readArrays() {
return actual.readArrays();
}
}
@ -140,6 +140,14 @@ public abstract class JavaStream<T> {
return Duration.ofNanos(end - start);
}
final Duration runInitArrays() {
return timed(this::initArrays);
}
final SimpleImmutableEntry<Duration, Data<T>> runReadArrays() {
return timed(this::readArrays);
}
final SimpleImmutableEntry<Timings<Duration>, T> runAll(int times) {
Timings<Duration> timings = new Timings<>();
T lastSum = null;
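
The `runInitArrays`/`runReadArrays` additions above lean on two `timed` overloads whose bodies this hunk only shows the tail of. Reconstructed from the call sites — `initArrays` is void, `readArrays` returns a `Data<T>` — their assumed shape is roughly:

import java.time.Duration;
import java.util.AbstractMap.SimpleImmutableEntry;
import java.util.function.Supplier;

// Sketch only: in the real class these are instance methods of the
// abstract, generic JavaStream<T>.
final class TimedSketch {
    // void action -> elapsed time
    static Duration timed(Runnable action) {
        long start = System.nanoTime();
        action.run();
        long end = System.nanoTime();
        return Duration.ofNanos(end - start);
    }

    // value-returning action -> elapsed time paired with the result
    static <R> SimpleImmutableEntry<Duration, R> timed(Supplier<R> action) {
        long start = System.nanoTime();
        R result = action.get();
        long end = System.nanoTime();
        return new SimpleImmutableEntry<>(Duration.ofNanos(end - start), result);
    }
}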

View File

@ -128,6 +128,40 @@ public class Main {
}
}
@SuppressWarnings("unchecked")
static void showInit(
int totalBytes, double megaScale, Options opt, Duration init, Duration read) {
List<Entry<String, Double>> setup =
Arrays.asList(
new SimpleImmutableEntry<>("Init", durationToSeconds(init)),
new SimpleImmutableEntry<>("Read", durationToSeconds(read)));
if (opt.csv) {
tabulateCsv(
true,
setup.stream()
.map(
x ->
Arrays.asList(
new SimpleImmutableEntry<>("function", x.getKey()),
new SimpleImmutableEntry<>("n_elements", opt.arraysize + ""),
new SimpleImmutableEntry<>("sizeof", totalBytes + ""),
new SimpleImmutableEntry<>(
"max_m" + (opt.mibibytes ? "i" : "") + "bytes_per_sec",
((megaScale * (double) totalBytes / x.getValue())) + ""),
new SimpleImmutableEntry<>("runtime", x.getValue() + "")))
.toArray(List[]::new));
} else {
for (Entry<String, Double> e : setup) {
System.out.printf(
"%s: %.5f s (%.5f M%sBytes/sec)%n",
e.getKey(),
e.getValue(),
megaScale * (double) totalBytes / e.getValue(),
opt.mibibytes ? "i" : "");
}
}
}
static <T extends Number> boolean run(
String name, Config<T> config, Function<Config<T>, JavaStream<T>> mkStream) {
@ -183,35 +217,46 @@ public class Main {
JavaStream<T> stream = mkStream.apply(config);
stream.initArrays();
Duration init = stream.runInitArrays();
final boolean ok;
switch (config.benchmark) {
case ALL:
Entry<Timings<Duration>, T> results = stream.runAll(opt.numtimes);
ok = checkSolutions(stream.data(), config, Optional.of(results.getValue()));
Timings<Duration> timings = results.getKey();
tabulateCsv(
opt.csv,
mkCsvRow(timings.copy, "Copy", 2 * arrayBytes, megaScale, opt),
mkCsvRow(timings.mul, "Mul", 2 * arrayBytes, megaScale, opt),
mkCsvRow(timings.add, "Add", 3 * arrayBytes, megaScale, opt),
mkCsvRow(timings.triad, "Triad", 3 * arrayBytes, megaScale, opt),
mkCsvRow(timings.dot, "Dot", 2 * arrayBytes, megaScale, opt));
break;
{
Entry<Timings<Duration>, T> results = stream.runAll(opt.numtimes);
SimpleImmutableEntry<Duration, Data<T>> read = stream.runReadArrays();
showInit(totalBytes, megaScale, opt, init, read.getKey());
ok = checkSolutions(read.getValue(), config, Optional.of(results.getValue()));
Timings<Duration> timings = results.getKey();
tabulateCsv(
opt.csv,
mkCsvRow(timings.copy, "Copy", 2 * arrayBytes, megaScale, opt),
mkCsvRow(timings.mul, "Mul", 2 * arrayBytes, megaScale, opt),
mkCsvRow(timings.add, "Add", 3 * arrayBytes, megaScale, opt),
mkCsvRow(timings.triad, "Triad", 3 * arrayBytes, megaScale, opt),
mkCsvRow(timings.dot, "Dot", 2 * arrayBytes, megaScale, opt));
break;
}
case NSTREAM:
List<Duration> nstreamResults = stream.runNStream(opt.numtimes);
ok = checkSolutions(stream.data(), config, Optional.empty());
tabulateCsv(opt.csv, mkCsvRow(nstreamResults, "Nstream", 4 * arrayBytes, megaScale, opt));
break;
{
List<Duration> nstreamResults = stream.runNStream(opt.numtimes);
SimpleImmutableEntry<Duration, Data<T>> read = stream.runReadArrays();
showInit(totalBytes, megaScale, opt, init, read.getKey());
ok = checkSolutions(read.getValue(), config, Optional.empty());
tabulateCsv(opt.csv, mkCsvRow(nstreamResults, "Nstream", 4 * arrayBytes, megaScale, opt));
break;
}
case TRIAD:
Duration triadResult = stream.runTriad(opt.numtimes);
ok = checkSolutions(stream.data(), config, Optional.empty());
int triadTotalBytes = 3 * arrayBytes * opt.numtimes;
double bandwidth = megaScale * (triadTotalBytes / durationToSeconds(triadResult));
System.out.printf("Runtime (seconds): %.5f", durationToSeconds(triadResult));
System.out.printf("Bandwidth (%s/s): %.3f ", gigaSuffix, bandwidth);
break;
{
Duration triadResult = stream.runTriad(opt.numtimes);
SimpleImmutableEntry<Duration, Data<T>> read = stream.runReadArrays();
showInit(totalBytes, megaScale, opt, init, read.getKey());
ok = checkSolutions(read.getValue(), config, Optional.empty());
long triadTotalBytes = 3L * arrayBytes * opt.numtimes; // long: 3 * arrayBytes * numtimes overflows int for large sizes
double bandwidth = megaScale * (triadTotalBytes / durationToSeconds(triadResult));
System.out.printf("Runtime (seconds): %.5f%n", durationToSeconds(triadResult));
System.out.printf("Bandwidth (%s/s): %.3f%n", gigaSuffix, bandwidth);
break;
}
default:
throw new AssertionError();
}
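
For reference, in CSV mode the `showInit` method added above emits one row per phase; the column layout follows directly from the entries it builds (placeholders below, not measured values):

function,n_elements,sizeof,max_mbytes_per_sec,runtime
Init,<arraysize>,<totalBytes>,<megaScale * totalBytes / init_seconds>,<init_seconds>
Read,<arraysize>,<totalBytes>,<megaScale * totalBytes / read_seconds>,<read_seconds>

with the rate column named max_mibytes_per_sec instead when --mibibytes is set.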

View File

@ -122,7 +122,7 @@ public final class AparapiStreams {
}
@Override
public Data<T> data() {
public Data<T> readArrays() {
return kernels.syncAndDispose();
}
}

View File

@ -86,7 +86,7 @@ final class GenericPlainStream<T extends Number> extends JavaStream<T> {
}
@Override
public Data<T> data() {
public Data<T> readArrays() {
return new Data<>(a, b, c);
}
}

View File

@ -80,7 +80,7 @@ final class GenericStream<T extends Number> extends JavaStream<T> {
}
@Override
public Data<T> data() {
public Data<T> readArrays() {
return new Data<>(a, b, c);
}
}

View File

@ -78,7 +78,7 @@ final class SpecialisedDoubleStream extends JavaStream<Double> {
}
@Override
public Data<Double> data() {
public Data<Double> readArrays() {
return new Data<>(boxed(a), boxed(b), boxed(c));
}
}

View File

@ -78,7 +78,7 @@ final class SpecialisedFloatStream extends JavaStream<Float> {
}
@Override
public Data<Float> data() {
public Data<Float> readArrays() {
return new Data<>(boxed(a), boxed(b), boxed(c));
}
}

View File

@ -78,7 +78,7 @@ final class SpecialisedPlainDoubleStream extends JavaStream<Double> {
}
@Override
public Data<Double> data() {
public Data<Double> readArrays() {
return new Data<>(boxed(a), boxed(b), boxed(c));
}
}

View File

@ -78,7 +78,7 @@ final class SpecialisedPlainFloatStream extends JavaStream<Float> {
}
@Override
public Data<Float> data() {
public Data<Float> readArrays() {
return new Data<>(boxed(a), boxed(b), boxed(c));
}
}

View File

@ -4,8 +4,8 @@ import java.util.List;
import java.util.stream.Collectors;
import javastream.JavaStream;
import javastream.Main.Config;
import uk.ac.manchester.tornado.api.TaskSchedule;
import uk.ac.manchester.tornado.api.TornadoRuntimeCI;
import uk.ac.manchester.tornado.api.TornadoExecutionPlan;
import uk.ac.manchester.tornado.api.TornadoRuntimeInterface;
import uk.ac.manchester.tornado.api.common.TornadoDevice;
import uk.ac.manchester.tornado.api.runtime.TornadoRuntime;
@ -13,18 +13,18 @@ abstract class GenericTornadoVMStream<T> extends JavaStream<T> {
protected final TornadoDevice device;
protected TaskSchedule copyTask;
protected TaskSchedule mulTask;
protected TaskSchedule addTask;
protected TaskSchedule triadTask;
protected TaskSchedule nstreamTask;
protected TaskSchedule dotTask;
protected TornadoExecutionPlan copyTask;
protected TornadoExecutionPlan mulTask;
protected TornadoExecutionPlan addTask;
protected TornadoExecutionPlan triadTask;
protected TornadoExecutionPlan nstreamTask;
protected TornadoExecutionPlan dotTask;
GenericTornadoVMStream(Config<T> config) {
super(config);
try {
TornadoRuntimeCI runtime = TornadoRuntime.getTornadoRuntime();
TornadoRuntimeInterface runtime = TornadoRuntime.getTornadoRuntime();
List<TornadoDevice> devices = TornadoVMStreams.enumerateDevices(runtime);
device = devices.get(config.options.device);
@ -42,10 +42,6 @@ abstract class GenericTornadoVMStream<T> extends JavaStream<T> {
}
}
protected static TaskSchedule mkSchedule() {
return new TaskSchedule("");
}
@Override
public List<String> listDevices() {
return TornadoVMStreams.enumerateDevices(TornadoRuntime.getTornadoRuntime()).stream()
@ -55,12 +51,12 @@ abstract class GenericTornadoVMStream<T> extends JavaStream<T> {
@Override
public void initArrays() {
this.copyTask.warmup();
this.mulTask.warmup();
this.addTask.warmup();
this.triadTask.warmup();
this.nstreamTask.warmup();
this.dotTask.warmup();
this.copyTask.withWarmUp();
this.mulTask.withWarmUp();
this.addTask.withWarmUp();
this.triadTask.withWarmUp();
this.nstreamTask.withWarmUp();
this.dotTask.withWarmUp();
}
@Override
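
The pattern of this migration: TornadoVM 0.9.x exposed a mutable `TaskSchedule` (built by the now-removed `mkSchedule()` helper), while 0.15.x builds an immutable `TaskGraph`, snapshots it, and hands the snapshot to a `TornadoExecutionPlan`; `warmup()` correspondingly becomes `withWarmUp()`. Side by side, the shape of the change is roughly as follows (`Kernels::copy` is a placeholder for the specialised kernels in the files below; the `execute()` calls are implied by the unchanged parts of the class):

// TornadoVM 0.9.x (removed): one mutable schedule per kernel
TaskSchedule copy = new TaskSchedule("")
    .task("", Kernels::copy, size, a, c)
    .streamOut(c);
copy.warmup();
copy.execute();

// TornadoVM 0.15.x (added): immutable graph snapshot driving an execution plan
TornadoExecutionPlan copyPlan = new TornadoExecutionPlan(
    new TaskGraph("copy")
        .task("copy", Kernels::copy, size, a, c)
        .transferToDevice(DataTransferMode.FIRST_EXECUTION, a, c)
        .snapshot());
copyPlan.withWarmUp();
copyPlan.execute();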

View File

@ -2,8 +2,11 @@ package javastream.tornadovm;
import java.util.Arrays;
import javastream.Main.Config;
import uk.ac.manchester.tornado.api.TaskGraph;
import uk.ac.manchester.tornado.api.TornadoExecutionPlan;
import uk.ac.manchester.tornado.api.annotations.Parallel;
import uk.ac.manchester.tornado.api.annotations.Reduce;
import uk.ac.manchester.tornado.api.enums.DataTransferMode;
final class SpecialisedDouble extends GenericTornadoVMStream<Double> {
@ -49,7 +52,7 @@ final class SpecialisedDouble extends GenericTornadoVMStream<Double> {
private final double[] a, b, c;
private final double[] dotSum;
@SuppressWarnings({"PrimitiveArrayArgumentToVarargsMethod", "DuplicatedCode"})
@SuppressWarnings({"DuplicatedCode"})
SpecialisedDouble(Config<Double> config) {
super(config);
final int size = config.options.arraysize;
@ -58,12 +61,43 @@ final class SpecialisedDouble extends GenericTornadoVMStream<Double> {
b = new double[size];
c = new double[size];
dotSum = new double[1];
this.copyTask = mkSchedule().task("", SpecialisedDouble::copy, size, a, c);
this.mulTask = mkSchedule().task("", SpecialisedDouble::mul, size, b, c, scalar);
this.addTask = mkSchedule().task("", SpecialisedDouble::add, size, a, b, c);
this.triadTask = mkSchedule().task("", SpecialisedDouble::triad, size, a, b, c, scalar);
this.nstreamTask = mkSchedule().task("", SpecialisedDouble::nstream, size, a, b, c, scalar);
this.dotTask = mkSchedule().task("", SpecialisedDouble::dot_, a, b, dotSum).streamOut(dotSum);
this.copyTask =
new TornadoExecutionPlan(
new TaskGraph("copy")
.task("copy", SpecialisedDouble::copy, size, a, c)
.transferToDevice(DataTransferMode.FIRST_EXECUTION, a, c)
.snapshot());
this.mulTask =
new TornadoExecutionPlan(
new TaskGraph("mul")
.task("mul", SpecialisedDouble::mul, size, b, c, scalar)
.transferToDevice(DataTransferMode.FIRST_EXECUTION, b, c)
.snapshot());
this.addTask =
new TornadoExecutionPlan(
new TaskGraph("add")
.task("add", SpecialisedDouble::add, size, a, b, c)
.transferToDevice(DataTransferMode.FIRST_EXECUTION, a, b, c)
.snapshot());
this.triadTask =
new TornadoExecutionPlan(
new TaskGraph("triad")
.task("triad", SpecialisedDouble::triad, size, a, b, c, scalar)
.transferToDevice(DataTransferMode.FIRST_EXECUTION, a, b, c)
.snapshot());
this.nstreamTask =
new TornadoExecutionPlan(
new TaskGraph("nstream")
.task("nstream", SpecialisedDouble::nstream, size, a, b, c, scalar)
.transferToDevice(DataTransferMode.FIRST_EXECUTION, a, b, c)
.snapshot());
this.dotTask =
new TornadoExecutionPlan(
new TaskGraph("dot")
.task("dot", SpecialisedDouble::dot_, a, b, dotSum)
.transferToDevice(DataTransferMode.FIRST_EXECUTION, a, b)
.transferToHost(DataTransferMode.EVERY_EXECUTION, new Object[] {dotSum})
.snapshot());
}
@Override
@ -72,7 +106,7 @@ final class SpecialisedDouble extends GenericTornadoVMStream<Double> {
Arrays.fill(a, config.initA);
Arrays.fill(b, config.initB);
Arrays.fill(c, config.initC);
TornadoVMStreams.xferToDevice(device, a, b, c);
TornadoVMStreams.allocAndXferToDevice(device, a, b, c);
}
@Override
@ -81,7 +115,7 @@ final class SpecialisedDouble extends GenericTornadoVMStream<Double> {
}
@Override
public Data<Double> data() {
public Data<Double> readArrays() {
TornadoVMStreams.xferFromDevice(device, a, b, c);
return new Data<>(boxed(a), boxed(b), boxed(c));
}

View File

@ -2,8 +2,11 @@ package javastream.tornadovm;
import java.util.Arrays;
import javastream.Main.Config;
import uk.ac.manchester.tornado.api.TaskGraph;
import uk.ac.manchester.tornado.api.TornadoExecutionPlan;
import uk.ac.manchester.tornado.api.annotations.Parallel;
import uk.ac.manchester.tornado.api.annotations.Reduce;
import uk.ac.manchester.tornado.api.enums.DataTransferMode;
final class SpecialisedFloat extends GenericTornadoVMStream<Float> {
@ -49,7 +52,7 @@ final class SpecialisedFloat extends GenericTornadoVMStream<Float> {
private final float[] a, b, c;
private final float[] dotSum;
@SuppressWarnings({"PrimitiveArrayArgumentToVarargsMethod", "DuplicatedCode"})
@SuppressWarnings({"DuplicatedCode"})
SpecialisedFloat(Config<Float> config) {
super(config);
final int size = config.options.arraysize;
@ -58,12 +61,43 @@ final class SpecialisedFloat extends GenericTornadoVMStream<Float> {
b = new float[size];
c = new float[size];
dotSum = new float[1];
this.copyTask = mkSchedule().task("", SpecialisedFloat::copy, size, a, c);
this.mulTask = mkSchedule().task("", SpecialisedFloat::mul, size, b, c, scalar);
this.addTask = mkSchedule().task("", SpecialisedFloat::add, size, a, b, c);
this.triadTask = mkSchedule().task("", SpecialisedFloat::triad, size, a, b, c, scalar);
this.nstreamTask = mkSchedule().task("", SpecialisedFloat::nstream, size, a, b, c, scalar);
this.dotTask = mkSchedule().task("", SpecialisedFloat::dot_, a, b, dotSum).streamOut(dotSum);
this.copyTask =
new TornadoExecutionPlan(
new TaskGraph("copy")
.task("copy", SpecialisedFloat::copy, size, a, c)
.transferToDevice(DataTransferMode.FIRST_EXECUTION, a, c)
.snapshot());
this.mulTask =
new TornadoExecutionPlan(
new TaskGraph("mul")
.task("mul", SpecialisedFloat::mul, size, b, c, scalar)
.transferToDevice(DataTransferMode.FIRST_EXECUTION, b, c)
.snapshot());
this.addTask =
new TornadoExecutionPlan(
new TaskGraph("add")
.task("add", SpecialisedFloat::add, size, a, b, c)
.transferToDevice(DataTransferMode.FIRST_EXECUTION, a, b, c)
.snapshot());
this.triadTask =
new TornadoExecutionPlan(
new TaskGraph("triad")
.task("triad", SpecialisedFloat::triad, size, a, b, c, scalar)
.transferToDevice(DataTransferMode.FIRST_EXECUTION, a, b, c)
.snapshot());
this.nstreamTask =
new TornadoExecutionPlan(
new TaskGraph("nstream")
.task("nstream", SpecialisedFloat::nstream, size, a, b, c, scalar)
.transferToDevice(DataTransferMode.FIRST_EXECUTION, a, b, c)
.snapshot());
this.dotTask =
new TornadoExecutionPlan(
new TaskGraph("dot")
.task("dot", SpecialisedFloat::dot_, a, b, dotSum)
.transferToDevice(DataTransferMode.FIRST_EXECUTION, a, b)
.transferToHost(DataTransferMode.EVERY_EXECUTION, new Object[] {dotSum})
.snapshot());
}
@Override
@ -72,7 +106,7 @@ final class SpecialisedFloat extends GenericTornadoVMStream<Float> {
Arrays.fill(a, config.initA);
Arrays.fill(b, config.initB);
Arrays.fill(c, config.initC);
TornadoVMStreams.xferToDevice(device, a, b, c);
TornadoVMStreams.allocAndXferToDevice(device, a, b, c);
}
@Override
@ -81,7 +115,7 @@ final class SpecialisedFloat extends GenericTornadoVMStream<Float> {
}
@Override
public Data<Float> data() {
public Data<Float> readArrays() {
TornadoVMStreams.xferFromDevice(device, a, b, c);
return new Data<>(boxed(a), boxed(b), boxed(c));
}

View File

@ -1,36 +1,46 @@
package javastream.tornadovm;
import java.util.Arrays;
import java.util.List;
import java.util.function.Function;
import java.util.stream.Collectors;
import java.util.stream.IntStream;
import javastream.JavaStream;
import javastream.Main.Config;
import uk.ac.manchester.tornado.api.TornadoRuntimeCI;
import uk.ac.manchester.tornado.api.TornadoRuntimeInterface;
import uk.ac.manchester.tornado.api.common.Event;
import uk.ac.manchester.tornado.api.common.TornadoDevice;
import uk.ac.manchester.tornado.api.mm.TornadoGlobalObjectState;
import uk.ac.manchester.tornado.api.memory.TornadoDeviceObjectState;
import uk.ac.manchester.tornado.api.memory.TornadoGlobalObjectState;
import uk.ac.manchester.tornado.api.runtime.TornadoRuntime;
public final class TornadoVMStreams {
private TornadoVMStreams() {}
static void xferToDevice(TornadoDevice device, Object... xs) {
static void allocAndXferToDevice(TornadoDevice device, Object... xs) {
for (Object x : xs) {
TornadoGlobalObjectState state = TornadoRuntime.getTornadoRuntime().resolveObject(x);
device.allocateObjects(
new Object[] {x}, 0, new TornadoDeviceObjectState[] {state.getDeviceState(device)});
List<Integer> writeEvent = device.ensurePresent(x, state.getDeviceState(device), null, 0, 0);
if (writeEvent != null) writeEvent.forEach(e -> device.resolveEvent(e).waitOn());
}
}
static void xferFromDevice(TornadoDevice device, Object... xs) {
for (Object x : xs) {
TornadoGlobalObjectState state = TornadoRuntime.getTornadoRuntime().resolveObject(x);
device.resolveEvent(device.streamOut(x, 0, state.getDeviceState(device), null)).waitOn();
}
Arrays.stream(xs)
.map(
x -> {
TornadoGlobalObjectState state = TornadoRuntime.getTornadoRuntime().resolveObject(x);
return device.resolveEvent(
device.streamOut(x, 0, state.getDeviceState(device), null));
})
.collect(Collectors.toList())
.forEach(Event::waitOn);
}
static List<TornadoDevice> enumerateDevices(TornadoRuntimeCI runtime) {
static List<TornadoDevice> enumerateDevices(TornadoRuntimeInterface runtime) {
return IntStream.range(0, runtime.getNumDrivers())
.mapToObj(runtime::getDriver)
.flatMap(d -> IntStream.range(0, d.getDeviceCount()).mapToObj(d::getDevice))
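
Note the behavioural change in `xferFromDevice`: the old loop issued each `streamOut` and immediately blocked on its event before starting the next transfer, whereas the rewrite enqueues all stream-outs first (the intermediate `collect` forces every call to be issued) and only then waits on the collected events, letting the copies overlap where the device queue allows.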

View File

@ -20,6 +20,18 @@ end
@enum Benchmark All Triad Nstream
function run_init_arrays!(data::StreamData{T,C}, context, init::Tuple{T,T,T})::Float64 where {T,C}
return @elapsed init_arrays!(data, context, init)
end
function run_read_data(data::StreamData{T,C}, context)::Tuple{Float64,VectorData{T}} where {T,C}
elapsed = @elapsed begin
result = read_data(data, context)
end
return (elapsed, result)
end
function run_all!(data::StreamData{T,C}, context, times::Int)::Tuple{Timings,T} where {T,C}
timings = Timings(times)
lastSum::T = 0
@ -39,11 +51,7 @@ function run_triad!(data::StreamData{T,C}, context, times::Int)::Float64 where {
end
end
function run_nstream!(
data::StreamData{T,C},
context,
times::Int,
)::Vector{Float64} where {T,C}
function run_nstream!(data::StreamData{T,C}, context, times::Int)::Vector{Float64} where {T,C}
timings::Vector{Float64} = zeros(times)
for i = 1:times
@inbounds timings[i] = @elapsed nstream!(data, context)
@ -93,9 +101,7 @@ function check_solutions(
error = abs((dot - gold_sum) / gold_sum)
failed = error > 1.0e-8
if failed
println(
"Validation failed on sum. Error $error \nSum was $dot but should be $gold_sum",
)
println("Validation failed on sum. Error $error \nSum was $dot but should be $gold_sum")
end
!failed
end : true
@ -166,7 +172,7 @@ function main()
parse_options(config)
if config.list
for (i, (_,repr, impl)) in enumerate(devices())
for (i, (_, repr, impl)) in enumerate(devices())
println("[$i] ($impl) $repr")
end
exit(0)
@ -175,9 +181,7 @@ function main()
ds = devices()
# TODO implement substring device match
if config.device < 1 || config.device > length(ds)
error(
"Device $(config.device) out of range (1..$(length(ds))), NOTE: Julia is 1-indexed",
)
error("Device $(config.device) out of range (1..$(length(ds))), NOTE: Julia is 1-indexed")
else
device = ds[config.device]
end
@ -257,16 +261,42 @@ function main()
end
end
function show_init(init::Float64, read::Float64)
setup = [("Init", init, 3 * array_bytes), ("Read", read, 3 * array_bytes)]
if config.csv
tabulate(
map(
x -> [
("phase", x[1]),
("n_elements", config.arraysize),
("sizeof", x[3]),
("max_m$(config.mibibytes ? "i" : "")bytes_per_sec", mega_scale * total_bytes / x[2]),
("runtime", x[2]),
],
setup,
)...,
)
else
for (name, elapsed, total_bytes) in setup
println("$name: $(round(elapsed; digits=5)) s (=$(round((mega_scale * total_bytes) / elapsed; digits=5)) M$(config.mibibytes ? "i" : "")Bytes/sec)")
end
end
end
init::Tuple{type,type,type} = DefaultInit
scalar::type = DefaultScalar
GC.enable(false)
(data, context) = make_stream(config.arraysize, scalar, device, config.csv)
init_arrays!(data, context, init)
tInit = run_init_arrays!(data, context, init)
if benchmark == All
(timings, sum) = run_all!(data, context, config.numtimes)
valid = check_solutions(read_data(data, context), config.numtimes, init, benchmark, sum)
(tRead, result) = run_read_data(data, context)
show_init(tInit, tRead)
valid = check_solutions(result, config.numtimes, init, benchmark, sum)
tabulate(
mk_row(timings.copy, "Copy", 2 * array_bytes),
mk_row(timings.mul, "Mul", 2 * array_bytes),
@ -276,13 +306,15 @@ function main()
)
elseif benchmark == Nstream
timings = run_nstream!(data, context, config.numtimes)
valid =
check_solutions(read_data(data, context), config.numtimes, init, benchmark, nothing)
(tRead, result) = run_read_data(data, context)
show_init(tInit, tRead)
valid = check_solutions(result, config.numtimes, init, benchmark, nothing)
tabulate(mk_row(timings, "Nstream", 4 * array_bytes))
elseif benchmark == Triad
elapsed = run_triad!(data, context, config.numtimes)
valid =
check_solutions(read_data(data, context), config.numtimes, init, benchmark, nothing)
(tRead, result) = run_read_data(data, context)
show_init(tInit, tRead)
valid = check_solutions(result, config.numtimes, init, benchmark, nothing)
total_bytes = 3 * array_bytes * config.numtimes
bandwidth = mega_scale * (total_bytes / elapsed)
println("Runtime (seconds): $(round(elapsed; digits=5))")
@ -290,7 +322,6 @@ function main()
else
error("Bad benchmark $(benchmark)")
end
GC.enable(true)
if !valid
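
(The `GC.enable(false)` / `GC.enable(true)` pair that still brackets the benchmark body keeps Julia's garbage collector from pausing inside the timed regions, so the new init and read figures, like the kernel timings, measure the stream operations alone.)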

View File

@ -306,7 +306,9 @@ void run()
#endif
auto init1 = std::chrono::high_resolution_clock::now();
stream->init_arrays(startA, startB, startC);
auto init2 = std::chrono::high_resolution_clock::now();
// Result of the Dot kernel, if used.
T sum{};
@ -333,7 +335,54 @@ void run()
std::vector<T> c(ARRAY_SIZE);
auto read1 = std::chrono::high_resolution_clock::now();
stream->read_arrays(a, b, c);
auto read2 = std::chrono::high_resolution_clock::now();
auto initElapsedS = std::chrono::duration_cast<std::chrono::duration<double>>(init2 - init1).count();
auto readElapsedS = std::chrono::duration_cast<std::chrono::duration<double>>(read2 - read1).count();
auto initBWps = ((mibibytes ? std::pow(2.0, -20.0) : 1.0E-6) * (3 * sizeof(T) * ARRAY_SIZE)) / initElapsedS;
auto readBWps = ((mibibytes ? std::pow(2.0, -20.0) : 1.0E-6) * (3 * sizeof(T) * ARRAY_SIZE)) / readElapsedS;
if (output_as_csv)
{
std::cout
<< "phase" << csv_separator
<< "n_elements" << csv_separator
<< "sizeof" << csv_separator
<< ((mibibytes) ? "max_mibytes_per_sec" : "max_mbytes_per_sec") << csv_separator
<< "runtime" << std::endl;
std::cout
<< "Init" << csv_separator
<< ARRAY_SIZE << csv_separator
<< sizeof(T) << csv_separator
<< initBWps << csv_separator
<< initElapsedS << std::endl;
std::cout
<< "Read" << csv_separator
<< ARRAY_SIZE << csv_separator
<< sizeof(T) << csv_separator
<< readBWps << csv_separator
<< readElapsedS << std::endl;
}
else
{
std::cout << "Init: "
<< std::setw(7)
<< initElapsedS
<< " s (="
<< initBWps
<< (mibibytes ? " MiBytes/sec" : " MBytes/sec")
<< ")" << std::endl;
std::cout << "Read: "
<< std::setw(7)
<< readElapsedS
<< " s (="
<< readBWps
<< (mibibytes ? " MiBytes/sec" : " MBytes/sec")
<< ")" << std::endl;
}
check_solution<T>(num_times, a, b, c, sum);
// Display timing results
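
As a worked example of the numbers these branches print: both phases touch all three arrays, so at the default ARRAY_SIZE of 33554432 doubles they move 3 * 8 B * 33554432 = 805,306,368 bytes, and a hypothetical 0.4 s init would report 1.0E-6 * 805306368 / 0.4 ≈ 2013 MBytes/sec (or 2^-20 * 805306368 / 0.4 ≈ 1920 MiBytes/sec with --mibibytes).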

View File

@ -54,7 +54,7 @@ use_field_init_shorthand = false
force_explicit_abi = true
condense_wildcard_suffixes = false
color = "Auto"
required_version = "1.4.38"
required_version = "1.6.0"
unstable_features = false
disable_all_formatting = false
skip_children = false

View File

@ -174,7 +174,7 @@ where StreamData<T, D, A>: RustStream<T> {
);
}
stream.init_arrays();
let init = stream.run_init_arrays();
let tabulate = |xs: &Vec<Duration>, name: &str, t_size: usize| -> Vec<(&str, String)> {
let tail = &xs[1..]; // tail only
@ -235,10 +235,47 @@ where StreamData<T, D, A>: RustStream<T> {
};
};
let show_setup = |init: Duration, read: Duration| {
let setup = vec![
("Init", init.as_secs_f64(), 3 * array_bytes),
("Read", read.as_secs_f64(), 3 * array_bytes),
];
if option.csv {
tabulate_all(
setup
.iter()
.map(|(name, elapsed, t_size)| {
vec![
("phase", name.to_string()),
("n_elements", option.arraysize.to_string()),
("sizeof", t_size.to_string()),
(
if option.mibibytes { "max_mibytes_per_sec" } else { "max_mbytes_per_sec" },
(mega_scale * (*t_size as f64) / elapsed).to_string(),
),
("runtime", elapsed.to_string()),
]
})
.collect::<Vec<_>>(),
);
} else {
for (name, elapsed, t_size) in setup {
println!(
"{}: {:.5} s (={:.5} {})",
name,
elapsed,
mega_scale * (t_size as f64) / elapsed,
if option.mibibytes { "MiBytes/sec" } else { "MBytes/sec" }
);
}
}
};
let solutions_correct = match benchmark {
Benchmark::All => {
let (results, sum) = stream.run_all(option.numtimes);
stream.read_arrays();
let read = stream.run_read_arrays();
show_setup(init, read);
let correct = check_solution(benchmark, option.numtimes, &stream, Some(sum));
tabulate_all(vec![
tabulate(&results.copy, "Copy", 2 * array_bytes),
@ -251,14 +288,16 @@ where StreamData<T, D, A>: RustStream<T> {
}
Benchmark::NStream => {
let results = stream.run_nstream(option.numtimes);
stream.read_arrays();
let read = stream.run_read_arrays();
show_setup(init, read);
let correct = check_solution(benchmark, option.numtimes, &stream, None);
tabulate_all(vec![tabulate(&results, "Nstream", 4 * array_bytes)]);
correct
}
Benchmark::Triad => {
let results = stream.run_triad(option.numtimes);
stream.read_arrays();
let read = stream.run_read_arrays();
show_setup(init, read);
let correct = check_solution(benchmark, option.numtimes, &stream, None);
let total_bytes = 3 * array_bytes * option.numtimes;
let bandwidth = giga_scale * (total_bytes as f64 / results.as_secs_f64());

View File

@ -132,6 +132,18 @@ pub trait RustStream<T: Default> {
fn nstream(&mut self);
fn dot(&mut self) -> T;
fn run_init_arrays(&mut self) -> Duration {
timed(|| {
self.init_arrays();
})
}
fn run_read_arrays(&mut self) -> Duration {
timed(|| {
self.read_arrays();
})
}
fn run_all(&mut self, n: usize) -> (AllTiming<Vec<Duration>>, T) {
let mut timings: AllTiming<Vec<Duration>> = AllTiming {
copy: vec![Duration::default(); n],

View File

@ -2,10 +2,10 @@ use rstest::rstest;
#[rstest]
fn test_main(
#[values(0, 1, 2, 3, 4)] device: usize, //
#[values("", "--pin")] pin: &str, //
#[values("", "--malloc")] malloc: &str, //
#[values("", "--init")] init: &str, //
#[values(0, 1, 2, 3, 4)] device: usize, //
#[values("", "--pin")] pin: &str, //
#[values("", "--malloc")] malloc: &str, //
#[values("", "--init")] init: &str, //
#[values("", "--triad-only", "--nstream-only")] option: &str, //
) {
let line = format!(

View File

@ -1 +0,0 @@
{"name":"sbt","version":"1.5.2","bspVersion":"2.0.0-M5","languages":["scala"],"argv":["/usr/lib/jvm/java-11-openjdk-11.0.11.0.9-2.fc33.x86_64/bin/java","-Xms100m","-Xmx100m","-classpath","/home/tom/.local/share/JetBrains/Toolbox/apps/IDEA-U/ch-0/211.7142.45.plugins/Scala/launcher/sbt-launch.jar","xsbt.boot.Boot","-bsp","--sbt-launch-jar=/home/tom/.local/share/JetBrains/Toolbox/apps/IDEA-U/ch-0/211.7142.45.plugins/Scala/launcher/sbt-launch.jar"]}

View File

@ -1 +1,2 @@
target/
.bsp/

View File

@ -1,4 +1,4 @@
version = "3.0.0-RC2"
version = "3.7.14"
runner.dialect = scala3
style = defaultWithAlign

View File

@ -3,7 +3,7 @@ lazy val mainCls = Some("scalastream.App")
lazy val root = (project in file("."))
.enablePlugins(NativeImagePlugin)
.settings(
scalaVersion := "3.0.0",
scalaVersion := "3.3.1",
version := "4.0",
organization := "uk.ac.bristol.uob-hpc",
organizationName := "University of Bristol",
@ -11,6 +11,11 @@ lazy val root = (project in file("."))
assembly / mainClass := mainCls,
scalacOptions ~= filterConsoleScalacOptions,
assembly / assemblyJarName := "scala-stream.jar",
assembly / assemblyMergeStrategy := {
case PathList("module-info.class") => MergeStrategy.discard
case PathList("META-INF", "versions", xs @ _, "module-info.class") => MergeStrategy.discard
case x => (ThisBuild / assemblyMergeStrategy).value(x)
},
nativeImageOptions := Seq(
"--no-fallback",
"-H:ReflectionConfigurationFiles=../../reflect-config.json"
@ -22,8 +27,8 @@ lazy val root = (project in file("."))
// Lazy val implementation in Scala 3 triggers an exception in nativeImage, use 2_13 for arg parsing for now otherwise we can't get to the benchmarking part
("com.github.scopt" %% "scopt" % "4.0.1").cross(CrossVersion.for3Use2_13),
// par also uses lazy val at some point, so it doesn't work in nativeImage
"org.scala-lang.modules" %% "scala-parallel-collections" % "1.0.3",
"net.openhft" % "affinity" % "3.21ea1",
"org.slf4j" % "slf4j-simple" % "1.7.30" // for affinity
"org.scala-lang.modules" %% "scala-parallel-collections" % "1.0.4",
"net.openhft" % "affinity" % "3.23.2",
"org.slf4j" % "slf4j-simple" % "2.0.5" // for affinity
)
)

View File

@ -1 +1 @@
sbt.version=1.5.2
sbt.version=1.9.2

View File

@ -1,6 +1,6 @@
addSbtPlugin("com.timushev.sbt" % "sbt-updates" % "0.5.3")
addSbtPlugin("io.github.davidgregory084" % "sbt-tpolecat" % "0.1.17")
addSbtPlugin("io.github.davidgregory084" % "sbt-tpolecat" % "0.1.20")
addSbtPlugin("org.scalameta" % "sbt-native-image" % "0.3.0")
addSbtPlugin("com.eed3si9n" % "sbt-assembly" % "0.15.0")
addSbtPlugin("com.eed3si9n" % "sbt-assembly" % "2.1.3")
addSbtPlugin("ch.epfl.scala" % "sbt-scalafix" % "0.9.27")
addSbtPlugin("org.scalameta" % "sbt-scalafmt" % "2.4.2")
addSbtPlugin("org.scalameta" % "sbt-scalafmt" % "2.4.3")

View File

@ -14,6 +14,7 @@ transparent trait ScalaStream[@specialized(Float, Double) A]:
def config: Config[A]
def initArrays(): Unit
def readArrays(): Unit = ()
def copy(): Unit
def mul(): Unit
def add(): Unit
@ -27,6 +28,8 @@ transparent trait ScalaStream[@specialized(Float, Double) A]:
val end = System.nanoTime()
FiniteDuration(end - start, TimeUnit.NANOSECONDS) -> r
inline def runInitArrays(): FiniteDuration = timed(initArrays())._1
inline def runReadArrays(): FiniteDuration = timed(readArrays())._1
inline def runAll(times: Int)(using Fractional[A]): (Timings[Vector[FiniteDuration]], A) =
val copy = ArrayBuffer.fill[FiniteDuration](times)(Duration.Zero)
val mul = ArrayBuffer.fill[FiniteDuration](times)(Duration.Zero)
@ -62,7 +65,6 @@ transparent trait ScalaStream[@specialized(Float, Double) A]:
def data(): Data[A]
trait Fractional[@specialized(Double, Float) A]:
def toFractional(f: Float): A
def toFractional(f: Double): A
@ -77,13 +79,13 @@ trait Fractional[@specialized(Double, Float) A]:
extension (x: Int) inline def fractional = toFractional(x.toFloat)
extension (x: Long) inline def fractional = toFractional(x.toDouble)
extension (x: A)
inline def +(y: A) = add(x, y)
inline def -(y: A) = sub(x, y)
inline def *(y: A) = mul(x, y)
inline def /(y: A) = div(x, y)
inline def >(y: A) = compare(x, y) > 0
inline def <(y: A) = compare(x, y) < 0
inline def abs_ = abs(x)
inline def +(y: A) = add(x, y)
inline def -(y: A) = sub(x, y)
inline def *(y: A) = mul(x, y)
inline def /(y: A) = div(x, y)
inline def >(y: A) = compare(x, y) > 0
inline def <(y: A) = compare(x, y) < 0
inline def abs_ = abs(x)
end Fractional
given FloatFractional: Fractional[Float] with
@ -204,7 +206,7 @@ object App:
validateXs("c", vec.c, goldC)
dotSum.foreach { sum =>
val goldSum = (goldA * goldB) * (config.options.arraysize).fractional
val goldSum = (goldA * goldB) * config.options.arraysize.fractional
val error = ((sum - goldSum) / goldSum).abs_
if error > 1.fractional / 100000000.fractional then
Console.err.println(
@ -238,10 +240,10 @@ object App:
)
println(s"Running ${config.benchmark match {
case Benchmark.All => "kernels"
case Benchmark.Triad => "triad"
case Benchmark.NStream => "nstream"
}} ${opt.numtimes} times")
case Benchmark.All => "kernels"
case Benchmark.Triad => "triad"
case Benchmark.NStream => "nstream"
}} ${opt.numtimes} times")
if config.benchmark == Benchmark.Triad then println(s"Number of elements: ${opt.arraysize}")
@ -288,11 +290,38 @@ object App:
println(header.map(_._1.padTo(padding, ' ')).mkString(sep))
println(rows.map(_.map(_._2.padTo(padding, ' ')).mkString(sep)).mkString("\n"))
def showInit(init: FiniteDuration, read: FiniteDuration): Unit = {
val setup =
Vector(("Init", init.seconds, 3 * arrayBytes), ("Read", read.seconds, 3 * arrayBytes))
if opt.csv then
tabulate(
setup.map((name, elapsed, totalBytes) =>
Vector(
"phase" -> name,
"n_elements" -> opt.arraysize.toString,
"sizeof" -> arrayBytes.toString,
s"max_m${if opt.mibibytes then "i" else ""}bytes_per_sec" ->
(megaScale * totalBytes.toDouble / elapsed).toString,
"runtime" -> elapsed.toString
)
): _*
)
else
for (name, elapsed, totalBytes) <- setup do
println(
f"$name: $elapsed%.5f s (=${megaScale * totalBytes.toDouble / elapsed}%.5f M${
if opt.mibibytes then "i" else ""
}Bytes/sec)"
)
}
val stream = mkStream(config)
stream.initArrays()
val init = stream.runInitArrays()
config.benchmark match
case Benchmark.All =>
val (results, sum) = stream.runAll(opt.numtimes)
val read = stream.runReadArrays()
showInit(init, read)
validate(stream.data(), config, Some(sum))
tabulate(
mkRow(results.copy, "Copy", 2 * arrayBytes),
@ -303,10 +332,14 @@ object App:
)
case Benchmark.NStream =>
val result = stream.runNStream(opt.numtimes)
val read = stream.runReadArrays()
showInit(init, read)
validate(stream.data(), config)
tabulate(mkRow(result, "Nstream", 4 * arrayBytes))
case Benchmark.Triad =>
val results = stream.runTriad(opt.numtimes)
val results = stream.runTriad(opt.numtimes)
val read = stream.runReadArrays()
showInit(init, read)
val totalBytes = 3 * arrayBytes * opt.numtimes
val bandwidth = megaScale * (totalBytes / results.seconds)
println(f"Runtime (seconds): ${results.seconds}%.5f")