From ce4d6cfbfb0ecb898acf8218c836b6126dcb3bd9 Mon Sep 17 00:00:00 2001
From: Tom Lin <tom91136@gmail.com>
Date: Wed, 16 Jun 2021 01:11:40 +0100
Subject: [PATCH] Add integration tests and CI; fix wrong nstream in
 plain_stream

---
 .github/workflows/main.yaml           |  15 +
 rust-stream/Cargo.lock                |  56 ++++
 rust-stream/Cargo.toml                |   3 +
 rust-stream/src/lib.rs                | 430 ++++++++++++++++++++++++++
 rust-stream/src/main.rs               | 413 +------------------------
 rust-stream/src/plain_stream.rs       |   2 +-
 rust-stream/tests/integration_test.rs |  17 +
 7 files changed, 524 insertions(+), 412 deletions(-)
 create mode 100644 rust-stream/src/lib.rs
 create mode 100644 rust-stream/tests/integration_test.rs

diff --git a/.github/workflows/main.yaml b/.github/workflows/main.yaml
index 20e1034..274df60 100644
--- a/.github/workflows/main.yaml
+++ b/.github/workflows/main.yaml
@@ -3,6 +3,21 @@ on: [push, pull_request]
 
 
 jobs:
+  test-rust: # builds, tests and then runs the Rust implementation once
+    runs-on: ubuntu-18.04
+    defaults:
+      run:
+        working-directory: ./rust-stream # the run steps below execute inside the crate directory
+    steps:
+      - uses: actions/checkout@v2
+      - name: Setup project
+        run: rustup install nightly # nightly is needed: src/lib.rs enables #![feature(...)] gates
+      - name: Compile project
+        run: cargo +nightly build --release
+      - name: Test project
+        run: cargo +nightly test --release
+      - name: Test run project
+        run: ./target/release/rust-stream --arraysize 2048 # overrides the 33554432-element default
   test:
     runs-on: ubuntu-18.04
     steps:
diff --git a/rust-stream/Cargo.lock b/rust-stream/Cargo.lock
index eec5b71..7c5ec13 100644
--- a/rust-stream/Cargo.lock
+++ b/rust-stream/Cargo.lock
@@ -326,6 +326,15 @@ dependencies = [
  "winapi 0.3.9",
 ]
 
+[[package]]
+name = "pest"
+version = "2.1.3"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "10f4872ae94d7b90ae48754df22fd42ad52ce740b8f370b03da4835417403e53"
+dependencies = [
+ "ucd-trie",
+]
+
 [[package]]
 name = "proc-macro-error"
 version = "1.0.4"
@@ -402,6 +411,19 @@ dependencies = [
  "bitflags",
 ]
 
+[[package]]
+name = "rstest"
+version = "0.10.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "041bb0202c14f6a158bbbf086afb03d0c6e975c2dec7d4912f8061ed44f290af"
+dependencies = [
+ "cfg-if",
+ "proc-macro2",
+ "quote",
+ "rustc_version",
+ "syn",
+]
+
 [[package]]
 name = "rust-stream"
 version = "3.4.0"
@@ -413,11 +435,21 @@ dependencies = [
  "num-traits",
  "num_cpus",
  "rayon",
+ "rstest",
  "rustversion",
  "structopt",
  "tabular",
 ]
 
+[[package]]
+name = "rustc_version"
+version = "0.3.3"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "f0dfe2087c51c460008730de8b57e6a320782fbfb312e1f4d520e6c6fae155ee"
+dependencies = [
+ "semver",
+]
+
 [[package]]
 name = "rustversion"
 version = "1.0.5"
@@ -430,6 +462,24 @@ version = "1.1.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "d29ab0c6d3fc0ee92fe66e2d99f700eab17a8d57d1c1d3b748380fb20baa78cd"
 
+[[package]]
+name = "semver"
+version = "0.11.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "f301af10236f6df4160f7c3f04eec6dbc70ace82d23326abad5edee88801c6b6"
+dependencies = [
+ "semver-parser",
+]
+
+[[package]]
+name = "semver-parser"
+version = "0.10.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "00b0bef5b7f9e0df16536d3961cfb6e84331c065b4066afb39768d0e319411f7"
+dependencies = [
+ "pest",
+]
+
 [[package]]
 name = "signal-hook"
 version = "0.1.17"
@@ -515,6 +565,12 @@ dependencies = [
  "unicode-width",
 ]
 
+[[package]]
+name = "ucd-trie"
+version = "0.1.3"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "56dee185309b50d1f11bfedef0fe6d036842e3fb77413abef29f8f8d1c5d4c1c"
+
 [[package]]
 name = "unicode-segmentation"
 version = "1.7.1"
diff --git a/rust-stream/Cargo.toml b/rust-stream/Cargo.toml
index 55db62b..f0365a6 100644
--- a/rust-stream/Cargo.toml
+++ b/rust-stream/Cargo.toml
@@ -18,6 +18,9 @@ libc = "0.2.97"
 core_affinity = "0.5.10"
 colour = "0.6.0"
 
+[dev-dependencies]
+rstest = "0.10.0"
+
 [build-dependencies]
 rustversion = "1.0"
 
diff --git a/rust-stream/src/lib.rs b/rust-stream/src/lib.rs
new file mode 100644
index 0000000..953858e
--- /dev/null
+++ b/rust-stream/src/lib.rs
@@ -0,0 +1,430 @@
+#![feature(allocator_api)]
+#![feature(vec_into_raw_parts)]
+
+use std::alloc::System;
+use std::fmt::{Debug, Display};
+use std::iter::Sum;
+use std::mem::size_of;
+use std::time::Duration;
+
+use num_traits::abs;
+use structopt::StructOpt;
+use tabular::{Row, Table};
+
+use crate::crossbeam_stream::ThreadedDevice;
+use crate::plain_stream::SerialDevice;
+use crate::rayon_stream::RayonDevice;
+use crate::stream::{AllocatorType, ArrayType, RustStream, StreamData};
+
+mod crossbeam_stream;
+mod plain_stream;
+mod rayon_stream;
+mod stream;
+
+#[derive(Debug, StructOpt)] // NOTE: per the structopt docs, each field's `///` text is its --help entry
+struct Options {
+  /// List available devices
+  #[structopt(long)]
+  list: bool,
+  /// Select device at <device>
+  #[structopt(long, default_value = "0")]
+  device: usize, // index into the `devices` list assembled in `run`
+  /// Run the test <numtimes> times (NUM >= 2)
+  #[structopt(long, short = "n", default_value = "100")]
+  numtimes: usize, // `run` panics when this is below 2
+  /// Use <arraysize> elements in the array
+  #[structopt(long, short = "s", default_value = "33554432")]
+  arraysize: usize,
+  /// Use floats (rather than doubles)
+  #[structopt(long)]
+  float: bool,
+  /// Only run triad
+  #[structopt(long)]
+  triad_only: bool, // `run_cpu` panics when this and `nstream_only` are both set
+  /// Only run nstream
+  #[structopt(long)]
+  nstream_only: bool,
+  /// Output as csv table
+  #[structopt(long)]
+  csv: bool,
+  /// Use MiB=2^20 for bandwidth calculation (default MB=10^6)
+  #[structopt(long)]
+  mibibytes: bool,
+  /// Use libc malloc instead of the Rust's allocator for benchmark array allocation
+  #[structopt(name = "malloc", long)]
+  malloc: bool,
+  /// Initialise each benchmark array at allocation time on the main thread
+  #[structopt(name = "init", long)]
+  init: bool,
+  /// Pin threads to distinct cores, this has NO effect in Rayon devices
+  #[structopt(long)]
+  pin: bool,
+}
+
+#[derive(PartialEq)]
+enum Benchmark {
+  All, // copy, mul, add, triad and dot; chosen when neither *_only flag is set
+  Triad, // triad only, chosen by `triad_only`
+  NStream, // nstream only, chosen by `nstream_only`
+}
+
+/// Recomputes the expected values on scalars and validates the arrays (and dot sum, if given).
+fn check_solution<T: ArrayType + Display + Sum + Into<f64>, D, A: AllocatorType>(
+  benchmark: Benchmark, numtimes: usize, vec: &StreamData<T, D, A>, dot_sum: Option<T>,
+) -> bool {
+  let (mut gold_a, mut gold_b, mut gold_c) = vec.init;
+  for _ in 0..numtimes {
+    match benchmark {
+      Benchmark::All => {
+        gold_c = gold_a;
+        gold_b = vec.scalar * gold_c;
+        gold_c = gold_a + gold_b;
+        gold_a = gold_b + vec.scalar * gold_c;
+      }
+      Benchmark::Triad => {
+        gold_a = gold_b + vec.scalar * gold_c;
+      }
+      Benchmark::NStream => {
+        gold_a += gold_b + vec.scalar * gold_c;
+      }
+    };
+  }
+  let tolerance = T::epsilon().into() * 100.0f64; // 100 machine epsilons of `T`
+  let validate_xs = |name: &str, xs: &Vec<T, A>, from: T| {
+    let error = (xs.iter().map(|x| abs(*x - from)).sum::<T>()).into() / xs.len() as f64;
+    let ok = xs.is_empty() || error <= tolerance; // `<=`, not `>`: a NaN error must fail
+    if !ok {
+      eprintln!("Validation failed on {}[]. Average error {} ", name, error);
+    }
+    ok
+  };
+  let a_ok = validate_xs("a", &vec.a, gold_a);
+  let b_ok = validate_xs("b", &vec.b, gold_b);
+  let c_ok = validate_xs("c", &vec.c, gold_c);
+  let dot_ok = dot_sum.map_or(true, |sum| {
+    let gold_sum = (gold_a * gold_b).into() * vec.size as f64;
+    let error = abs((sum.into() - gold_sum) / gold_sum);
+    let ok = vec.size == 0 || error <= 1.0e-8; // `<=`, not `>`: a NaN error must fail
+    if !ok {
+      eprintln!(
+        "Validation failed on sum. Error {} \nSum was {} but should be {}",
+        error, sum, gold_sum
+      );
+    }
+    ok
+  });
+  a_ok && b_ok && c_ok && dot_ok
+}
+
+/// Runs the benchmark selected by `option` on `stream`, prints the timings and returns whether
+/// the results validated. Panics if both `nstream_only` and `triad_only` are set.
+fn run_cpu<T: ArrayType + Sync + Send + Sum + Into<f64> + Display, D, A: AllocatorType>(
+  option: &Options, mut stream: StreamData<T, D, A>,
+) -> bool
+where
+  StreamData<T, D, A>: RustStream<T>,
+{
+  let benchmark = match (option.nstream_only, option.triad_only) {
+    (true, false) => Benchmark::NStream,
+    (false, true) => Benchmark::Triad,
+    (false, false) => Benchmark::All,
+    (true, true) => {
+      panic!("Both triad and nstream are enabled, pick one or omit both to run all benchmarks")
+    }
+  };
+
+  let array_bytes = option.arraysize * size_of::<T>();
+  let total_bytes = array_bytes * 3; // three arrays: a, b and c
+  let (mega_scale, mega_suffix, giga_scale, giga_suffix) = if !option.mibibytes {
+    (1.0e-6, "MB", 1.0e-9, "GB")
+  } else {
+    (2f64.powi(-20), "MiB", 2f64.powi(-30), "GiB")
+  };
+
+  if !option.csv {
+    println!(
+      "Running {} {} times",
+      match benchmark {
+        Benchmark::All => "kernels",
+        Benchmark::Triad => "triad",
+        Benchmark::NStream => "nstream",
+      },
+      option.numtimes
+    );
+    if benchmark == Benchmark::Triad {
+      println!("Number of elements: {}", option.arraysize);
+    }
+
+    println!("Precision: {}", if option.float { "float" } else { "double" });
+    println!(
+      "Array size: {:.1} {} (={:.1} {})",
+      mega_scale * array_bytes as f64,
+      mega_suffix,
+      giga_scale * array_bytes as f64,
+      giga_suffix
+    );
+    println!(
+      "Total size: {:.1} {} (={:.1} {})",
+      mega_scale * total_bytes as f64,
+      mega_suffix,
+      giga_scale * total_bytes as f64,
+      giga_suffix
+    );
+  }
+
+  stream.init_arrays();
+
+  let tabulate = |xs: &Vec<Duration>, name: &str, t_size: usize| -> Vec<(&str, String)> {
+    let tail = &xs[1..]; // the first run is left out of the statistics
+    let max = tail.iter().max().map(|d| d.as_secs_f64());
+    let min = tail.iter().min().map(|d| d.as_secs_f64());
+    match (min, max) {
+      (Some(min), Some(max)) => {
+        let avg: f64 = tail.iter().map(|d| d.as_secs_f64()).sum::<f64>() / tail.len() as f64;
+        let mbps = mega_scale * (t_size as f64) / min; // bandwidth of the fastest run
+        if option.csv {
+          vec![
+            ("function", name.to_string()),
+            ("num_times", option.numtimes.to_string()),
+            ("n_elements", option.arraysize.to_string()),
+            ("sizeof", t_size.to_string()),
+            (
+              if option.mibibytes { "max_mibytes_per_sec" } else { "max_mbytes_per_sec" },
+              mbps.to_string(),
+            ),
+            ("min_runtime", min.to_string()),
+            ("max_runtime", max.to_string()),
+            ("avg_runtime", avg.to_string()),
+          ]
+        } else {
+          vec![
+            ("Function", name.to_string()),
+            (if option.mibibytes { "MiBytes/sec" } else { "MBytes/sec" }, format!("{:.3}", mbps)),
+            ("Min (sec)", format!("{:.5}", min)),
+            ("Max", format!("{:.5}", max)),
+            ("Average", format!("{:.5}", avg)),
+          ]
+        }
+      }
+      (_, _) => panic!("No min/max element for {}(size={})", name, t_size),
+    }
+  };
+
+  let tabulate_all = |xs: Vec<Vec<(&str, String)>>| {
+    match xs.as_slice() {
+      [head, ..] => {
+        if option.csv {
+          println!("{}", head.iter().map(|(col, _)| *col).collect::<Vec<_>>().join(","));
+          for kvs in xs {
+            println!("{}", kvs.iter().map(|(_, val)| val.clone()).collect::<Vec<_>>().join(","));
+          }
+        } else {
+          let mut table = Table::new(&vec!["{:<}"; head.len()].join("    "));
+          table.add_row(head.iter().fold(Row::new(), |row, (col, _)| row.with_cell(col)));
+          for kvs in xs {
+            table.add_row(kvs.iter().fold(Row::new(), |row, (_, val)| row.with_cell(val)));
+          }
+          print!("{}", table);
+        }
+      }
+      _ => panic!("Empty tabulation"),
+    };
+  };
+
+  let solutions_correct = match benchmark {
+    Benchmark::All => {
+      let (results, sum) = stream.run_all(option.numtimes);
+      let correct = check_solution(benchmark, option.numtimes, &stream, Some(sum));
+      tabulate_all(vec![
+        tabulate(&results.copy, "Copy", 2 * array_bytes),
+        tabulate(&results.mul, "Mul", 2 * array_bytes),
+        tabulate(&results.add, "Add", 3 * array_bytes),
+        tabulate(&results.triad, "Triad", 3 * array_bytes),
+        tabulate(&results.dot, "Dot", 2 * array_bytes),
+      ]);
+      correct
+    }
+    Benchmark::NStream => {
+      let results = stream.run_nstream(option.numtimes);
+      let correct = check_solution(benchmark, option.numtimes, &stream, None);
+      tabulate_all(vec![tabulate(&results, "Nstream", 4 * array_bytes)]);
+      correct
+    }
+    Benchmark::Triad => {
+      let results = stream.run_triad(option.numtimes);
+      let correct = check_solution(benchmark, option.numtimes, &stream, None);
+      let total_bytes = 3 * array_bytes * option.numtimes;
+      let bandwidth = giga_scale * (total_bytes as f64 / results.as_secs_f64());
+      println!("Runtime (seconds): {:.5}", results.as_secs_f64());
+      println!("Bandwidth ({}/s): {:.3} ", giga_suffix, bandwidth);
+      correct
+    }
+  };
+  stream.clean_up();
+  solutions_correct
+}
+
+const VERSION: Option<&str> = option_env!("CARGO_PKG_VERSION"); // `None` if unset at build time
+// Starting values for the a, b and c arrays and the kernel scalar, declared once as f32.
+const START_A: f32 = 0.1;
+const START_B: f32 = 0.2;
+const START_C: f32 = 0.0;
+const START_SCALAR: f32 = 0.4;
+// Values handed to the single-precision run.
+const FLOAT_INIT_SCALAR: f32 = START_SCALAR;
+const FLOAT_INIT: (f32, f32, f32) = (START_A, START_B, START_C);
+// The double-precision run casts the f32 constants rather than using separate f64 literals.
+const DOUBLE_START_SCALAR: f64 = START_SCALAR as f64;
+const DOUBLE_INIT: (f64, f64, f64) = (START_A as f64, START_B as f64, START_C as f64);
+
+/// Parses `args` (program name first) and runs or lists the devices; returns `false` when
+/// validation fails or the device index is unknown. Panics if `numtimes` is below 2.
+pub fn run(args: &[String]) -> bool {
+  let options: Options = Options::from_iter(args);
+
+  if options.numtimes < 2 {
+    panic!("numtimes must be >= 2")
+  }
+
+  let alloc = System;
+  let alloc_name = if options.malloc { "libc-malloc" } else { "rust-system" };
+
+  let rayon_device = &|| {
+    let dev = RayonDevice { pool: rayon::ThreadPoolBuilder::default().build().unwrap() };
+    if !options.csv {
+      println!("Using {} thread(s), alloc={}", dev.pool.current_num_threads(), alloc_name);
+      if options.pin {
+        colour::e_yellow_ln!("Pinning threads have no effect on Rayon!")
+      }
+    }
+    if options.float {
+      run_cpu(
+        &options,
+        StreamData::new_in(
+          options.arraysize,
+          FLOAT_INIT_SCALAR,
+          FLOAT_INIT,
+          dev,
+          alloc,
+          options.malloc,
+          options.init,
+        ),
+      )
+    } else {
+      run_cpu(
+        &options,
+        StreamData::new_in(
+          options.arraysize,
+          DOUBLE_START_SCALAR,
+          DOUBLE_INIT,
+          dev,
+          alloc,
+          options.malloc,
+          options.init,
+        ),
+      )
+    }
+  };
+
+  let crossbeam_device = &|| {
+    let ncores = num_cpus::get();
+    let dev = ThreadedDevice::new(ncores, options.pin);
+    if !options.csv {
+      println!("Using {} thread(s), pin={}, alloc={}", ncores, options.pin, alloc_name)
+    }
+    if options.float {
+      run_cpu(
+        &options,
+        StreamData::new_in(
+          options.arraysize,
+          FLOAT_INIT_SCALAR,
+          FLOAT_INIT,
+          dev,
+          alloc,
+          options.malloc,
+          options.init,
+        ),
+      )
+    } else {
+      run_cpu(
+        &options,
+        StreamData::new_in(
+          options.arraysize,
+          DOUBLE_START_SCALAR,
+          DOUBLE_INIT,
+          dev,
+          alloc,
+          options.malloc,
+          options.init,
+        ),
+      )
+    }
+  };
+  let st_device = &|| {
+    let dev = SerialDevice { pin: options.pin };
+    if !options.csv {
+      println!("Using 1 thread, pin={}, alloc={}", options.pin, alloc_name);
+    }
+    if options.float {
+      run_cpu(
+        &options,
+        StreamData::new_in(
+          options.arraysize,
+          FLOAT_INIT_SCALAR,
+          FLOAT_INIT,
+          dev,
+          alloc,
+          options.malloc,
+          options.init,
+        ),
+      )
+    } else {
+      run_cpu(
+        &options,
+        StreamData::new_in(
+          options.arraysize,
+          DOUBLE_START_SCALAR,
+          DOUBLE_INIT,
+          dev,
+          alloc,
+          options.malloc,
+          options.init,
+        ),
+      )
+    }
+  };
+  let devices: Vec<(String, &'_ dyn Fn() -> bool)> = vec![
+    ("CPU (Rayon)".to_string(), rayon_device),
+    (format!("CPU (Crossbeam, pinning={})", options.pin), crossbeam_device),
+    ("CPU (Single threaded)".to_string(), st_device),
+  ];
+
+  if options.list {
+    devices.iter().enumerate().for_each(|(i, (name, _))| {
+      println!("[{}] {}", i, name);
+    });
+    true
+  } else {
+    match devices.get(options.device) {
+      Some((name, run)) => {
+        if !options.csv {
+          println!(
+            "BabelStream\n\
+                              Version: {}\n\
+                              Implementation: Rust; {}",
+            VERSION.unwrap_or("unknown"),
+            name
+          );
+          if options.init {
+            println!("Initialising arrays on main thread");
+          }
+        }
+        run()
+      }
+      None => {
+        eprintln!("Device index {} not available", options.device);
+        false
+      }
+    }
+  }
+}
diff --git a/rust-stream/src/main.rs b/rust-stream/src/main.rs
index 8f8f43c..8c99087 100644
--- a/rust-stream/src/main.rs
+++ b/rust-stream/src/main.rs
@@ -1,414 +1,5 @@
-#![feature(allocator_api)]
-#![feature(vec_into_raw_parts)]
-
-use std::alloc::System;
-use std::fmt::{Debug, Display};
-use std::iter::Sum;
-use std::mem::size_of;
-use std::time::Duration;
-
-use num_traits::abs;
-use structopt::StructOpt;
-use tabular::{Row, Table};
-
-use crate::crossbeam_stream::ThreadedDevice;
-use crate::plain_stream::SerialDevice;
-use crate::rayon_stream::RayonDevice;
-use crate::stream::{AllocatorType, ArrayType, RustStream, StreamData};
-
-mod crossbeam_stream;
-mod plain_stream;
-mod rayon_stream;
-mod stream;
-
-#[derive(Debug, StructOpt)]
-struct Options {
-  /// List available devices
-  #[structopt(long)]
-  list: bool,
-  /// Select device at <device>
-  #[structopt(long, default_value = "0")]
-  device: usize,
-  /// Run the test <numtimes> times (NUM >= 2)
-  #[structopt(long, short = "n", default_value = "100")]
-  numtimes: usize,
-  /// Use <arraysize> elements in the array
-  #[structopt(long, short = "s", default_value = "33554432")]
-  arraysize: usize,
-  /// Use floats (rather than doubles)
-  #[structopt(long)]
-  float: bool,
-  /// Only run triad
-  #[structopt(long)]
-  triad_only: bool,
-  /// Only run nstream
-  #[structopt(long)]
-  nstream_only: bool,
-  /// Output as csv table
-  #[structopt(long)]
-  csv: bool,
-  /// Use MiB=2^20 for bandwidth calculation (default MB=10^6)
-  #[structopt(long)]
-  mibibytes: bool,
-  /// Use libc malloc instead of the Rust's allocator for benchmark array allocation
-  #[structopt(name = "malloc", long)]
-  malloc: bool,
-  /// Initialise each benchmark array at allocation time on the main thread
-  #[structopt(name = "init", long)]
-  init: bool,
-  /// Pin threads to distinct cores, this has NO effect in Rayon devices
-  #[structopt(long)]
-  pin: bool,
-}
-
-#[derive(PartialEq)]
-enum Benchmark {
-  All,
-  Triad,
-  NStream,
-}
-
-fn check_solution<T: ArrayType + Display + Sum + Into<f64>, D, A: AllocatorType>(
-  benchmark: Benchmark, numtimes: usize, vec: &StreamData<T, D, A>, dot_sum: Option<T>,
-) {
-  let (mut gold_a, mut gold_b, mut gold_c) = vec.init;
-  for _ in 0..numtimes {
-    match benchmark {
-      Benchmark::All => {
-        gold_c = gold_a;
-        gold_b = vec.scalar * gold_c;
-        gold_c = gold_a + gold_b;
-        gold_a = gold_b + vec.scalar * gold_c;
-      }
-      Benchmark::Triad => {
-        gold_a = gold_b + vec.scalar * gold_c;
-      }
-      Benchmark::NStream => {
-        gold_a += gold_b + vec.scalar * gold_c;
-      }
-    };
-  }
-  let tolerance = T::epsilon().into() * 100.0f64;
-  let validate_xs = |name: &str, xs: &Vec<T, A>, from: T| {
-    let error = (xs.iter().map(|x| abs(*x - from)).sum::<T>()).into() / xs.len() as f64;
-    if error > tolerance {
-      eprintln!("Validation failed on {}[]. Average error {} ", name, error)
-    }
-  };
-  validate_xs("a", &vec.a, gold_a);
-  validate_xs("b", &vec.b, gold_b);
-  validate_xs("c", &vec.c, gold_c);
-
-  if let Some(sum) = dot_sum {
-    let gold_sum = (gold_a * gold_b).into() * vec.size as f64;
-    let error = abs((sum.into() - gold_sum) / gold_sum);
-    if error > 1.0e-8 {
-      eprintln!(
-        "Validation failed on sum. Error {} \nSum was {} but should be {}",
-        error, sum, gold_sum
-      );
-    }
-  }
-}
-
-fn run_cpu<T: ArrayType + Sync + Send + Sum + Into<f64> + Display, D, A: AllocatorType>(
-  option: &Options, mut stream: StreamData<T, D, A>,
-) where
-  StreamData<T, D, A>: RustStream<T>,
-{
-  let benchmark = match (option.nstream_only, option.triad_only) {
-    (true, false) => Benchmark::NStream,
-    (false, true) => Benchmark::Triad,
-    (false, false) => Benchmark::All,
-    (true, true) => {
-      panic!("Both triad and nstream are enabled, pick one or omit both to run all benchmarks")
-    }
-  };
-
-  let array_bytes = option.arraysize * size_of::<T>();
-  let total_bytes = array_bytes * 3;
-  let (mega_scale, mega_suffix, giga_scale, giga_suffix) = if !option.mibibytes {
-    (1.0e-6, "MB", 1.0e-9, "GB")
-  } else {
-    (2f64.powi(-20), "MiB", 2f64.powi(-30), "GiB")
-  };
-
-  if !option.csv {
-    println!(
-      "Running {} {} times",
-      match benchmark {
-        Benchmark::All => "kernels",
-        Benchmark::Triad => "triad",
-        Benchmark::NStream => "nstream",
-      },
-      option.numtimes
-    );
-
-    if benchmark == Benchmark::Triad {
-      println!("Number of elements: {}", option.arraysize);
-    }
-
-    println!("Precision: {}", if option.float { "float" } else { "double" });
-    println!(
-      "Array size: {:.1} {} (={:.1} {})",
-      mega_scale * array_bytes as f64,
-      mega_suffix,
-      giga_scale * array_bytes as f64,
-      giga_suffix
-    );
-    println!(
-      "Total size: {:.1} {} (={:.1} {})",
-      mega_scale * total_bytes as f64,
-      mega_suffix,
-      giga_scale * total_bytes as f64,
-      giga_suffix
-    );
-  }
-
-  stream.init_arrays();
-
-  let tabulate = |xs: &Vec<Duration>, name: &str, t_size: usize| -> Vec<(&str, String)> {
-    let tail = &xs[1..]; // tail only
-                         // do stats
-    let max = tail.iter().max().map(|d| d.as_secs_f64());
-    let min = tail.iter().min().map(|d| d.as_secs_f64());
-    match (min, max) {
-      (Some(min), Some(max)) => {
-        let avg: f64 = tail.iter().map(|d| d.as_secs_f64()).sum::<f64>() / tail.len() as f64;
-        let mbps = mega_scale * (t_size as f64) / min;
-        if option.csv {
-          vec![
-            ("function", name.to_string()),
-            ("num_times", option.numtimes.to_string()),
-            ("n_elements", option.arraysize.to_string()),
-            ("sizeof", t_size.to_string()),
-            (
-              if option.mibibytes { "max_mibytes_per_sec" } else { "max_mbytes_per_sec" },
-              mbps.to_string(),
-            ),
-            ("min_runtime", min.to_string()),
-            ("max_runtime", max.to_string()),
-            ("avg_runtime", avg.to_string()),
-          ]
-        } else {
-          vec![
-            ("Function", name.to_string()),
-            (if option.mibibytes { "MiBytes/sec" } else { "MBytes/sec" }, format!("{:.3}", mbps)),
-            ("Min (sec)", format!("{:.5}", min)),
-            ("Max", format!("{:.5}", max)),
-            ("Average", format!("{:.5}", avg)),
-          ]
-        }
-      }
-      (_, _) => panic!("No min/max element for {}(size={})", name, t_size),
-    }
-  };
-
-  let tabulate_all = |xs: Vec<Vec<(&str, String)>>| {
-    match xs.as_slice() {
-      [head, ..] => {
-        if option.csv {
-          println!("{}", head.iter().map(|(col, _)| *col).collect::<Vec<_>>().join(","));
-          for kvs in xs {
-            println!("{}", kvs.iter().map(|(_, val)| val.clone()).collect::<Vec<_>>().join(","));
-          }
-        } else {
-          let mut table = Table::new(&vec!["{:<}"; head.len()].join("    "));
-          table.add_row(head.iter().fold(Row::new(), |row, (col, _)| row.with_cell(col)));
-          for kvs in xs {
-            table.add_row(kvs.iter().fold(Row::new(), |row, (_, val)| row.with_cell(val)));
-          }
-          print!("{}", table);
-        }
-      }
-      _ => panic!("Empty tabulation"),
-    };
-  };
-
-  match benchmark {
-    Benchmark::All => {
-      let (results, sum) = stream.run_all(option.numtimes);
-      check_solution(benchmark, option.numtimes, &stream, Some(sum));
-      tabulate_all(vec![
-        tabulate(&results.copy, "Copy", 2 * array_bytes),
-        tabulate(&results.mul, "Mul", 2 * array_bytes),
-        tabulate(&results.add, "Add", 3 * array_bytes),
-        tabulate(&results.triad, "Triad", 3 * array_bytes),
-        tabulate(&results.dot, "Dot", 2 * array_bytes),
-      ])
-    }
-    Benchmark::NStream => {
-      let results = stream.run_nstream(option.numtimes);
-      check_solution(benchmark, option.numtimes, &stream, None);
-      tabulate_all(vec![tabulate(&results, "Nstream", 4 * array_bytes)]);
-    }
-    Benchmark::Triad => {
-      let results = stream.run_triad(option.numtimes);
-      let total_bytes = 3 * array_bytes * option.numtimes;
-      let bandwidth = giga_scale * (total_bytes as f64 / results.as_secs_f64());
-
-      println!("Runtime (seconds): {:.5}", results.as_secs_f64());
-      println!("Bandwidth ({}/s): {:.3} ", giga_suffix, bandwidth);
-    }
-  };
-  &stream.clean_up();
-}
-
-const VERSION: Option<&'static str> = option_env!("CARGO_PKG_VERSION");
-
-static START_A: f32 = 0.1;
-static START_B: f32 = 0.2;
-static START_C: f32 = 0.0;
-static START_SCALAR: f32 = 0.4;
-
-static FLOAT_INIT_SCALAR: f32 = START_SCALAR;
-static FLOAT_INIT: (f32, f32, f32) = (START_A, START_B, START_C);
-
-static DOUBLE_START_SCALAR: f64 = START_SCALAR as f64;
-static DOUBLE_INIT: (f64, f64, f64) = (START_A as f64, START_B as f64, START_C as f64);
-
 fn main() {
-  let options: Options = Options::from_args();
-
-  if options.numtimes < 2 {
-    panic!("numtimes must be >= 2")
-  }
-
-  let alloc = System;
-  let alloc_name = if options.malloc { "libc-malloc" } else { "rust-system" };
-
-  let rayon_device = &|| {
-    let dev = RayonDevice { pool: rayon::ThreadPoolBuilder::default().build().unwrap() };
-    if !options.csv {
-      println!("Using {} thread(s), alloc={}", dev.pool.current_num_threads(), alloc_name);
-      if options.pin {
-        colour::e_yellow_ln!("Pinning threads have no effect on Rayon!")
-      }
-    }
-    if options.float {
-      run_cpu(
-        &options,
-        StreamData::new_in(
-          options.arraysize,
-          FLOAT_INIT_SCALAR,
-          FLOAT_INIT,
-          dev,
-          alloc,
-          options.malloc,
-          options.init,
-        ),
-      );
-    } else {
-      run_cpu(
-        &options,
-        StreamData::new_in(
-          options.arraysize,
-          DOUBLE_START_SCALAR,
-          DOUBLE_INIT,
-          dev,
-          alloc,
-          options.malloc,
-          options.init,
-        ),
-      );
-    }
-  };
-
-  let crossbeam_device = &|| {
-    let ncores = num_cpus::get();
-    let dev = ThreadedDevice::new(ncores, options.pin);
-    if !options.csv {
-      println!("Using {} thread(s), pin={}, alloc={}", ncores, options.pin, alloc_name)
-    }
-    if options.float {
-      run_cpu(
-        &options,
-        StreamData::new_in(
-          options.arraysize,
-          FLOAT_INIT_SCALAR,
-          FLOAT_INIT,
-          dev,
-          alloc,
-          options.malloc,
-          options.init,
-        ),
-      );
-    } else {
-      run_cpu(
-        &options,
-        StreamData::new_in(
-          options.arraysize,
-          DOUBLE_START_SCALAR,
-          DOUBLE_INIT,
-          dev,
-          alloc,
-          options.malloc,
-          options.init,
-        ),
-      );
-    }
-  };
-  let st_device = &|| {
-    let dev = SerialDevice { pin: options.pin };
-    if !options.csv {
-      println!("Using 1 thread, pin={}, alloc={}", options.pin, alloc_name);
-    }
-    if options.float {
-      run_cpu(
-        &options,
-        StreamData::new_in(
-          options.arraysize,
-          FLOAT_INIT_SCALAR,
-          FLOAT_INIT,
-          dev,
-          alloc,
-          options.malloc,
-          options.init,
-        ),
-      );
-    } else {
-      run_cpu(
-        &options,
-        StreamData::new_in(
-          options.arraysize,
-          DOUBLE_START_SCALAR,
-          DOUBLE_INIT,
-          dev,
-          alloc,
-          options.malloc,
-          options.init,
-        ),
-      );
-    }
-  };
-  let devices: Vec<(String, &'_ dyn Fn())> = vec![
-    ("CPU (Rayon)".to_string(), rayon_device),
-    (format!("CPU (Crossbeam, pinning={})", options.pin), crossbeam_device),
-    ("CPU (Single threaded)".to_string(), st_device),
-  ];
-
-  if options.list {
-    devices.iter().enumerate().for_each(|(i, (name, _))| {
-      println!("[{}] {}", i, name);
-    })
-  } else {
-    match devices.get(options.device) {
-      Some((name, run)) => {
-        if !&options.csv {
-          println!(
-            "BabelStream\n\
-                              Version: {}\n\
-                              Implementation: Rust; {}",
-            VERSION.unwrap_or("unknown"),
-            name
-          );
-          if options.init {
-            println!("Initialising arrays on main thread");
-          }
-        }
-        run();
-      }
-      None => eprintln!("Device index {} not available", options.device),
-    }
+  if !rust_stream::run(&std::env::args().collect::<Vec<_>>()) {
+    std::process::exit(1);
   }
 }
diff --git a/rust-stream/src/plain_stream.rs b/rust-stream/src/plain_stream.rs
index 7a1fb01..135a7bc 100644
--- a/rust-stream/src/plain_stream.rs
+++ b/rust-stream/src/plain_stream.rs
@@ -47,7 +47,7 @@ impl<T: ArrayType, A: AllocatorType> RustStream<T> for StreamData<T, SerialDevic
 
   fn nstream(&mut self) {
     for i in 0..self.size {
-      self.a[i] += self.b[i] * self.scalar * self.c[i];
+      self.a[i] += self.b[i] + self.scalar * self.c[i];
     }
   }
 
diff --git a/rust-stream/tests/integration_test.rs b/rust-stream/tests/integration_test.rs
new file mode 100644
index 0000000..101f8f8
--- /dev/null
+++ b/rust-stream/tests/integration_test.rs
@@ -0,0 +1,17 @@
+use rstest::rstest;
+
+// Runs the benchmark in-process for every device/flag combination and expects
+// `rust_stream::run` to report success.
+#[rstest]
+fn test_main(
+  #[values(0, 1, 2)] device: usize,
+  #[values("", "--pin")] pin: &str,
+  #[values("", "--malloc")] malloc: &str,
+  #[values("", "--init")] init: &str,
+  #[values("", "--triad-only", "--nstream-only")] option: &str,
+) {
+  let flags = [pin, malloc, init, option].join(" ");
+  let line = format!("rust-stream --arraysize 2048 --device {} {}", device, flags);
+  let args: Vec<String> = line.split(' ').filter(|s| !s.is_empty()).map(String::from).collect();
+  assert!(rust_stream::run(&args));
+}