diff --git a/.github/workflows/main.yaml b/.github/workflows/main.yaml index c0c0353..08eed2d 100644 --- a/.github/workflows/main.yaml +++ b/.github/workflows/main.yaml @@ -11,6 +11,22 @@ on: jobs: + test-rust: + runs-on: ubuntu-18.04 + defaults: + run: + working-directory: ./src/rust/rust-stream + steps: + - uses: actions/checkout@v2 + - name: Setup project + run: rustup install nightly + - name: Compile project + run: cargo +nightly build --release + - name: Test project + run: cargo +nightly test --release + - name: Test run project + run: ./target/release/rust-stream --arraysize 2048 + test-java: runs-on: ubuntu-18.04 defaults: diff --git a/README.md b/README.md index 508b358..df95582 100644 --- a/README.md +++ b/README.md @@ -17,8 +17,8 @@ This code was previously called GPU-STREAM. - [Programming Models](#programming-models) - [How is this different to STREAM?](#how-is-this-different-to-stream) - [Building](#building) - - [CMake](#cmake) - - [GNU Make (removed)](#gnu-make) + - [CMake](#cmake) + - [GNU Make (removed)](#gnu-make) - [Results](#results) - [Contributing](#contributing) - [Citing](#citing) @@ -29,23 +29,23 @@ This code was previously called GPU-STREAM. BabelStream is currently implemented in the following parallel programming models, listed in no particular order: - - OpenCL - - CUDA - - HIP - - OpenACC - - OpenMP 3 and 4.5 - - C++ Parallel STL - - Kokkos - - RAJA - - SYCL and SYCL 2020 - - TBB - - Thrust (via CUDA or HIP) - +- OpenCL +- CUDA +- HIP +- OpenACC +- OpenMP 3 and 4.5 +- C++ Parallel STL +- Kokkos +- RAJA +- SYCL and SYCL 2020 +- TBB +- Thrust (via CUDA or HIP) This project also contains implementations in alternative languages with different build systems: * Julia - [JuliaStream.jl](./src/julia/JuliaStream.jl) * Java - [java-stream](./src/java/java-stream) * Scala - [scala-stream](./src/scala/scala-stream) +* Rust - [rust-stream](./src/rust/rust-stream) ## How is this different to STREAM? 
@@ -136,7 +136,7 @@ For example: Alternatively, refer to the [CI script](./src/ci-test-compile.sh), which test-compiles most of the models, and see which flags are used there. -*It is recommended that you delete the `build` directory when you change any of the build flags.* +*It is recommended that you delete the `build` directory when you change any of the build flags.* ### GNU Make @@ -170,15 +170,15 @@ Deakin T, Price J, Martineau M, McIntosh-Smith S. GPU-STREAM v2.0: Benchmarking * Deakin T, Price J, Martineau M, McIntosh-Smith S. Evaluating attainable memory bandwidth of parallel programming models via BabelStream. International Journal of Computational Science and Engineering. Special issue. Vol. 17, No. 3, pp. 247–262. 2018.DOI: 10.1504/IJCSE.2018.095847 * Deakin T, McIntosh-Smith S. GPU-STREAM: Benchmarking the achievable memory bandwidth of Graphics Processing Units. 2015. Poster session presented at IEEE/ACM SuperComputing, Austin, United States. -You can view the [Poster and Extended Abstract](http://sc15.supercomputing.org/sites/all/themes/SC15images/tech_poster/tech_poster_pages/post150.html). + You can view the [Poster and Extended Abstract](http://sc15.supercomputing.org/sites/all/themes/SC15images/tech_poster/tech_poster_pages/post150.html). * Deakin T, Price J, Martineau M, McIntosh-Smith S. GPU-STREAM: Now in 2D!. 2016. Poster session presented at IEEE/ACM SuperComputing, Salt Lake City, United States. -You can view the [Poster and Extended Abstract](http://sc16.supercomputing.org/sc-archive/tech_poster/tech_poster_pages/post139.html). + You can view the [Poster and Extended Abstract](http://sc16.supercomputing.org/sc-archive/tech_poster/tech_poster_pages/post139.html). * Raman K, Deakin T, Price J, McIntosh-Smith S. Improving achieved memory bandwidth from C++ codes on Intel Xeon Phi Processor (Knights Landing). IXPUG Spring Meeting, Cambridge, UK, 2017. * Deakin T, Price J, McIntosh-Smith S. 
Portable methods for measuring cache hierarchy performance. 2017. Poster sessions presented at IEEE/ACM SuperComputing, Denver, United States. -You can view the [Poster and Extended Abstract](http://sc17.supercomputing.org/SC17%20Archive/tech_poster/tech_poster_pages/post155.html) + You can view the [Poster and Extended Abstract](http://sc17.supercomputing.org/SC17%20Archive/tech_poster/tech_poster_pages/post155.html) [1]: McCalpin, John D., 1995: "Memory Bandwidth and Machine Balance in Current High Performance Computers", IEEE Computer Society Technical Committee on Computer Architecture (TCCA) Newsletter, December 1995. diff --git a/src/rust/rust-stream/.cargo/config.toml b/src/rust/rust-stream/.cargo/config.toml new file mode 100644 index 0000000..d5135e9 --- /dev/null +++ b/src/rust/rust-stream/.cargo/config.toml @@ -0,0 +1,2 @@ +[build] +rustflags = ["-C", "target-cpu=native"] \ No newline at end of file diff --git a/src/rust/rust-stream/.gitignore b/src/rust/rust-stream/.gitignore new file mode 100644 index 0000000..3a8cabc --- /dev/null +++ b/src/rust/rust-stream/.gitignore @@ -0,0 +1,2 @@ +/target +.idea diff --git a/src/rust/rust-stream/Cargo.lock b/src/rust/rust-stream/Cargo.lock new file mode 100644 index 0000000..5f225f0 --- /dev/null +++ b/src/rust/rust-stream/Cargo.lock @@ -0,0 +1,636 @@ +# This file is automatically @generated by Cargo. +# It is not intended for manual editing. 
+version = 3 + +[[package]] +name = "ansi_term" +version = "0.11.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ee49baf6cb617b853aa8d93bf420db2383fab46d314482ca2803b40d5fde979b" +dependencies = [ + "winapi 0.3.9", +] + +[[package]] +name = "atty" +version = "0.2.14" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d9b39be18770d11421cdb1b9947a45dd3f37e93092cbf377614828a319d5fee8" +dependencies = [ + "hermit-abi", + "libc", + "winapi 0.3.9", +] + +[[package]] +name = "autocfg" +version = "1.0.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "cdb031dd78e28731d87d56cc8ffef4a8f36ca26c38fe2de700543e627f8a464a" + +[[package]] +name = "bitflags" +version = "1.3.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bef38d45163c2f1dde094a7dfd33ccf595c92905c8f8f4fdc18d06fb1037718a" + +[[package]] +name = "cfg-if" +version = "1.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "baf1de4339761588bc0619e3cbc0120ee582ebb74b53b4efbf79117bd2da40fd" + +[[package]] +name = "clap" +version = "2.33.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "37e58ac78573c40708d45522f0d80fa2f01cc4f9b4e2bf749807255454312002" +dependencies = [ + "ansi_term", + "atty", + "bitflags", + "strsim", + "textwrap", + "unicode-width", + "vec_map", +] + +[[package]] +name = "colour" +version = "0.6.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a27e4532f26f510c24bb8477d963c0c3ef27e293c3b2c507cccb0536d493201a" +dependencies = [ + "crossterm", +] + +[[package]] +name = "core_affinity" +version = "0.5.10" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7f8a03115cc34fb0d7c321dd154a3914b3ca082ccc5c11d91bf7117dbbe7171f" +dependencies = [ + "kernel32-sys", + "libc", + "num_cpus", + "winapi 0.2.8", +] + +[[package]] +name = "crossbeam" +version = "0.8.1" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "4ae5588f6b3c3cb05239e90bd110f257254aecd01e4635400391aeae07497845" +dependencies = [ + "cfg-if", + "crossbeam-channel", + "crossbeam-deque", + "crossbeam-epoch", + "crossbeam-queue", + "crossbeam-utils", +] + +[[package]] +name = "crossbeam-channel" +version = "0.5.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "06ed27e177f16d65f0f0c22a213e17c696ace5dd64b14258b52f9417ccb52db4" +dependencies = [ + "cfg-if", + "crossbeam-utils", +] + +[[package]] +name = "crossbeam-deque" +version = "0.8.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6455c0ca19f0d2fbf751b908d5c55c1f5cbc65e03c4225427254b46890bdde1e" +dependencies = [ + "cfg-if", + "crossbeam-epoch", + "crossbeam-utils", +] + +[[package]] +name = "crossbeam-epoch" +version = "0.9.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4ec02e091aa634e2c3ada4a392989e7c3116673ef0ac5b72232439094d73b7fd" +dependencies = [ + "cfg-if", + "crossbeam-utils", + "lazy_static", + "memoffset", + "scopeguard", +] + +[[package]] +name = "crossbeam-queue" +version = "0.3.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9b10ddc024425c88c2ad148c1b0fd53f4c6d38db9697c9f1588381212fa657c9" +dependencies = [ + "cfg-if", + "crossbeam-utils", +] + +[[package]] +name = "crossbeam-utils" +version = "0.8.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d82cfc11ce7f2c3faef78d8a684447b40d503d9681acebed6cb728d45940c4db" +dependencies = [ + "cfg-if", + "lazy_static", +] + +[[package]] +name = "crossterm" +version = "0.19.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7c36c10130df424b2f3552fcc2ddcd9b28a27b1e54b358b45874f88d1ca6888c" +dependencies = [ + "bitflags", + "crossterm_winapi", + "lazy_static", + "libc", + "mio", + "parking_lot", + "signal-hook", + "winapi 0.3.9", +] + +[[package]] 
+name = "crossterm_winapi" +version = "0.7.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0da8964ace4d3e4a044fd027919b2237000b24315a37c916f61809f1ff2140b9" +dependencies = [ + "winapi 0.3.9", +] + +[[package]] +name = "either" +version = "1.6.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e78d4f1cc4ae33bbfc157ed5d5a5ef3bc29227303d595861deb238fcec4e9457" + +[[package]] +name = "heck" +version = "0.3.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6d621efb26863f0e9924c6ac577e8275e5e6b77455db64ffa6c65c904e9e132c" +dependencies = [ + "unicode-segmentation", +] + +[[package]] +name = "hermit-abi" +version = "0.1.19" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "62b467343b94ba476dcb2500d242dadbb39557df889310ac77c5d99100aaac33" +dependencies = [ + "libc", +] + +[[package]] +name = "instant" +version = "0.1.12" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7a5bbe824c507c5da5956355e86a746d82e0e1464f65d862cc5e71da70e94b2c" +dependencies = [ + "cfg-if", +] + +[[package]] +name = "kernel32-sys" +version = "0.2.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7507624b29483431c0ba2d82aece8ca6cdba9382bff4ddd0f7490560c056098d" +dependencies = [ + "winapi 0.2.8", + "winapi-build", +] + +[[package]] +name = "lazy_static" +version = "1.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e2abad23fbc42b3700f2f279844dc832adb2b2eb069b2df918f455c4e18cc646" + +[[package]] +name = "libc" +version = "0.2.108" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8521a1b57e76b1ec69af7599e75e38e7b7fad6610f037db8c79b127201b5d119" + +[[package]] +name = "lock_api" +version = "0.4.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "712a4d093c9976e24e7dbca41db895dabcbac38eb5f4045393d17a95bdfb1109" +dependencies = 
[ + "scopeguard", +] + +[[package]] +name = "log" +version = "0.4.14" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "51b9bbe6c47d51fc3e1a9b945965946b4c44142ab8792c50835a980d362c2710" +dependencies = [ + "cfg-if", +] + +[[package]] +name = "memoffset" +version = "0.6.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "59accc507f1338036a0477ef61afdae33cde60840f4dfe481319ce3ad116ddf9" +dependencies = [ + "autocfg", +] + +[[package]] +name = "mio" +version = "0.7.14" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8067b404fe97c70829f082dec8bcf4f71225d7eaea1d8645349cb76fa06205cc" +dependencies = [ + "libc", + "log", + "miow", + "ntapi", + "winapi 0.3.9", +] + +[[package]] +name = "miow" +version = "0.3.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b9f1c5b025cda876f66ef43a113f91ebc9f4ccef34843000e0adf6ebbab84e21" +dependencies = [ + "winapi 0.3.9", +] + +[[package]] +name = "ntapi" +version = "0.3.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3f6bb902e437b6d86e03cce10a7e2af662292c5dfef23b65899ea3ac9354ad44" +dependencies = [ + "winapi 0.3.9", +] + +[[package]] +name = "num-traits" +version = "0.2.14" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9a64b1ec5cda2586e284722486d802acf1f7dbdc623e2bfc57e65ca1cd099290" +dependencies = [ + "autocfg", +] + +[[package]] +name = "num_cpus" +version = "1.13.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "05499f3756671c15885fee9034446956fff3f243d6077b91e5767df161f766b3" +dependencies = [ + "hermit-abi", + "libc", +] + +[[package]] +name = "parking_lot" +version = "0.11.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7d17b78036a60663b797adeaee46f5c9dfebb86948d1255007a1d6be0271ff99" +dependencies = [ + "instant", + "lock_api", + "parking_lot_core", +] + +[[package]] +name = 
"parking_lot_core" +version = "0.8.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d76e8e1493bcac0d2766c42737f34458f1c8c50c0d23bcb24ea953affb273216" +dependencies = [ + "cfg-if", + "instant", + "libc", + "redox_syscall", + "smallvec", + "winapi 0.3.9", +] + +[[package]] +name = "pest" +version = "2.1.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "10f4872ae94d7b90ae48754df22fd42ad52ce740b8f370b03da4835417403e53" +dependencies = [ + "ucd-trie", +] + +[[package]] +name = "proc-macro-error" +version = "1.0.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "da25490ff9892aab3fcf7c36f08cfb902dd3e71ca0f9f9517bea02a73a5ce38c" +dependencies = [ + "proc-macro-error-attr", + "proc-macro2", + "quote", + "syn", + "version_check", +] + +[[package]] +name = "proc-macro-error-attr" +version = "1.0.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a1be40180e52ecc98ad80b184934baf3d0d29f979574e439af5a55274b35f869" +dependencies = [ + "proc-macro2", + "quote", + "version_check", +] + +[[package]] +name = "proc-macro2" +version = "1.0.32" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ba508cc11742c0dc5c1659771673afbab7a0efab23aa17e854cbab0837ed0b43" +dependencies = [ + "unicode-xid", +] + +[[package]] +name = "quote" +version = "1.0.10" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "38bc8cc6a5f2e3655e0899c1b848643b2562f853f114bfec7be120678e3ace05" +dependencies = [ + "proc-macro2", +] + +[[package]] +name = "rayon" +version = "1.5.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c06aca804d41dbc8ba42dfd964f0d01334eceb64314b9ecf7c5fad5188a06d90" +dependencies = [ + "autocfg", + "crossbeam-deque", + "either", + "rayon-core", +] + +[[package]] +name = "rayon-core" +version = "1.9.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"d78120e2c850279833f1dd3582f730c4ab53ed95aeaaaa862a2a5c71b1656d8e" +dependencies = [ + "crossbeam-channel", + "crossbeam-deque", + "crossbeam-utils", + "lazy_static", + "num_cpus", +] + +[[package]] +name = "redox_syscall" +version = "0.2.10" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8383f39639269cde97d255a32bdb68c047337295414940c68bdd30c2e13203ff" +dependencies = [ + "bitflags", +] + +[[package]] +name = "rstest" +version = "0.10.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "041bb0202c14f6a158bbbf086afb03d0c6e975c2dec7d4912f8061ed44f290af" +dependencies = [ + "cfg-if", + "proc-macro2", + "quote", + "rustc_version", + "syn", +] + +[[package]] +name = "rust-stream" +version = "3.4.0" +dependencies = [ + "colour", + "core_affinity", + "crossbeam", + "libc", + "num-traits", + "num_cpus", + "rayon", + "rstest", + "rustversion", + "structopt", + "tabular", +] + +[[package]] +name = "rustc_version" +version = "0.3.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f0dfe2087c51c460008730de8b57e6a320782fbfb312e1f4d520e6c6fae155ee" +dependencies = [ + "semver", +] + +[[package]] +name = "rustversion" +version = "1.0.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "61b3909d758bb75c79f23d4736fac9433868679d3ad2ea7a61e3c25cfda9a088" + +[[package]] +name = "scopeguard" +version = "1.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d29ab0c6d3fc0ee92fe66e2d99f700eab17a8d57d1c1d3b748380fb20baa78cd" + +[[package]] +name = "semver" +version = "0.11.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f301af10236f6df4160f7c3f04eec6dbc70ace82d23326abad5edee88801c6b6" +dependencies = [ + "semver-parser", +] + +[[package]] +name = "semver-parser" +version = "0.10.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"00b0bef5b7f9e0df16536d3961cfb6e84331c065b4066afb39768d0e319411f7" +dependencies = [ + "pest", +] + +[[package]] +name = "signal-hook" +version = "0.1.17" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7e31d442c16f047a671b5a71e2161d6e68814012b7f5379d269ebd915fac2729" +dependencies = [ + "libc", + "mio", + "signal-hook-registry", +] + +[[package]] +name = "signal-hook-registry" +version = "1.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e51e73328dc4ac0c7ccbda3a494dfa03df1de2f46018127f60c693f2648455b0" +dependencies = [ + "libc", +] + +[[package]] +name = "smallvec" +version = "1.7.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1ecab6c735a6bb4139c0caafd0cc3635748bbb3acf4550e8138122099251f309" + +[[package]] +name = "strsim" +version = "0.8.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8ea5119cdb4c55b55d432abb513a0429384878c15dde60cc77b1c99de1a95a6a" + +[[package]] +name = "structopt" +version = "0.3.25" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "40b9788f4202aa75c240ecc9c15c65185e6a39ccdeb0fd5d008b98825464c87c" +dependencies = [ + "clap", + "lazy_static", + "structopt-derive", +] + +[[package]] +name = "structopt-derive" +version = "0.4.18" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "dcb5ae327f9cc13b68763b5749770cb9e048a99bd9dfdfa58d0cf05d5f64afe0" +dependencies = [ + "heck", + "proc-macro-error", + "proc-macro2", + "quote", + "syn", +] + +[[package]] +name = "syn" +version = "1.0.82" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8daf5dd0bb60cbd4137b1b587d2fc0ae729bc07cf01cd70b36a1ed5ade3b9d59" +dependencies = [ + "proc-macro2", + "quote", + "unicode-xid", +] + +[[package]] +name = "tabular" +version = "0.1.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"e7e35bee02dcefe64a74065b6b869d241eab1a02fea0d65e6074ce4e51894c3b" +dependencies = [ + "unicode-width", +] + +[[package]] +name = "textwrap" +version = "0.11.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d326610f408c7a4eb6f51c37c330e496b08506c9457c9d34287ecc38809fb060" +dependencies = [ + "unicode-width", +] + +[[package]] +name = "ucd-trie" +version = "0.1.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "56dee185309b50d1f11bfedef0fe6d036842e3fb77413abef29f8f8d1c5d4c1c" + +[[package]] +name = "unicode-segmentation" +version = "1.8.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8895849a949e7845e06bd6dc1aa51731a103c42707010a5b591c0038fb73385b" + +[[package]] +name = "unicode-width" +version = "0.1.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3ed742d4ea2bd1176e236172c8429aaf54486e7ac098db29ffe6529e0ce50973" + +[[package]] +name = "unicode-xid" +version = "0.2.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8ccb82d61f80a663efe1f787a51b16b5a51e3314d6ac365b08639f52387b33f3" + +[[package]] +name = "vec_map" +version = "0.8.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f1bddf1187be692e79c5ffeab891132dfb0f236ed36a43c7ed39f1165ee20191" + +[[package]] +name = "version_check" +version = "0.9.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5fecdca9a5291cc2b8dcf7dc02453fee791a280f3743cb0905f8822ae463b3fe" + +[[package]] +name = "winapi" +version = "0.2.8" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "167dc9d6949a9b857f3451275e911c3f44255842c1f7a76f33c55103a909087a" + +[[package]] +name = "winapi" +version = "0.3.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5c839a674fcd7a98952e593242ea400abe93992746761e38641405d28b00f419" +dependencies = [ + "winapi-i686-pc-windows-gnu", 
+ "winapi-x86_64-pc-windows-gnu", +] + +[[package]] +name = "winapi-build" +version = "0.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2d315eee3b34aca4797b2da6b13ed88266e6d612562a0c46390af8299fc699bc" + +[[package]] +name = "winapi-i686-pc-windows-gnu" +version = "0.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ac3b87c63620426dd9b991e5ce0329eff545bccbbb34f3be09ff6fb6ab51b7b6" + +[[package]] +name = "winapi-x86_64-pc-windows-gnu" +version = "0.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "712e227841d057c1ee1cd2fb22fa7e5a5461ae8e48fa2ca79ec42cfc1931183f" diff --git a/src/rust/rust-stream/Cargo.toml b/src/rust/rust-stream/Cargo.toml new file mode 100644 index 0000000..f0365a6 --- /dev/null +++ b/src/rust/rust-stream/Cargo.toml @@ -0,0 +1,34 @@ +[package] +name = "rust-stream" +version = "3.4.0" +authors = ["Wei-Chen Lin "] +edition = "2018" + +# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html + +[dependencies] +num-traits = "0.2.14" +structopt = "0.3.13" +tabular = "0.1.4" +rayon = "1.5.1" +crossbeam = "0.8.1" +num_cpus = "1.13.0" +rustversion = "1.0" +libc = "0.2.97" +core_affinity = "0.5.10" +colour = "0.6.0" + +[dev-dependencies] +rstest = "0.10.0" + +[build-dependencies] +rustversion = "1.0" + +[profile.dev] +opt-level = 2 +overflow-checks = true + + +[profile.release] +opt-level = 3 +lto = "thin" # fully enabling this (i.e true) negatively affects performance as tested on both AMD and Intel diff --git a/src/rust/rust-stream/README.md b/src/rust/rust-stream/README.md new file mode 100644 index 0000000..6696de5 --- /dev/null +++ b/src/rust/rust-stream/README.md @@ -0,0 +1,78 @@ +rust-stream +=========== + +This is an implementation of BabelStream in Rust. 
+ +Currently, we support five CPU threading APIs as devices: + +* Plain - basic single-threaded `for` version, see [plain_stream.rs](src/plain_stream.rs) +* [Rayon](https://github.com/rayon-rs/rayon) - Parallel with high level API, + see [rayon_stream.rs](src/rayon_stream.rs) +* [Crossbeam](https://github.com/crossbeam-rs/crossbeam) - Parallel with partitions per thread, + see [crossbeam_stream.rs](src/crossbeam_stream.rs) +* Arc - Parallel with `Vec` per thread (static partitions) wrapped in `Mutex` contained in `Arc`s, + see [arc_stream.rs](src/arc_stream.rs) +* Unsafe - Parallel with unsafe pointer per thread (static partitions) to `Vec`, + see [unsafe_stream.rs](src/unsafe_stream.rs) + +In addition, this implementation also supports the following extra flags: +**** +``` +--init Initialise each benchmark array at allocation time on the main thread +--malloc Use libc malloc instead of the Rust's allocator for benchmark array allocation +--pin Pin threads to distinct cores, this has NO effect in Rayon devices +``` + +Max thread count is controlled by the environment variable `BABELSTREAM_NUM_THREADS`, which is compatible with all devices (avoid setting `RAYON_NUM_THREADS`; the implementation will issue a warning if it is set). + +There is an ongoing investigation on potential performance issues under NUMA situations. As part of +the experiment, this implementation makes use of the +provisional [Allocator traits](https://github.com/rust-lang/rust/issues/32838), which require unstable +(nightly) Rust. We hope a NUMA-aware allocator will be available once the allocator API reaches stable +Rust. 
+ +### Build & Run + +Prerequisites: + +* [Rust toolchain](https://www.rust-lang.org/tools/install) + +Once the toolchain is installed, enable the nightly channel: + +```shell +> rustup install nightly +> rustup default nightly # optional, this sets `+nightly` automatically for cargo calls later +``` + +With `cargo` on path, compile and run the benchmark with: + +```shell +> cd rust-stream/ +> cargo +nightly build --release # or simply `cargo build --release` if nightly channel is the default +> ./target/release/rust-stream --help +rust-stream 3.4.0 + +USAGE: + rust-stream [FLAGS] [OPTIONS] + +FLAGS: + --csv Output as csv table + --float Use floats (rather than doubles) + -h, --help Prints help information + --init Initialise each benchmark array at allocation time on the main thread + --list List available devices + --malloc Use libc malloc instead of the Rust's allocator for benchmark array allocation + --mibibytes Use MiB=2^20 for bandwidth calculation (default MB=10^6) + --nstream-only Only run nstream + --pin Pin threads to distinct cores, this has NO effect in Rayon devices + --triad-only Only run triad + -V, --version Prints version information + +OPTIONS: + -s, --arraysize Use elements in the array [default: 33554432] + --device Select device at [default: 0] + -n, --numtimes Run the test times (NUM >= 2) [default: 100] +``` + + + \ No newline at end of file diff --git a/src/rust/rust-stream/rustfmt.toml b/src/rust/rust-stream/rustfmt.toml new file mode 100644 index 0000000..aa2f0e9 --- /dev/null +++ b/src/rust/rust-stream/rustfmt.toml @@ -0,0 +1,68 @@ +max_width = 100 +hard_tabs = false +tab_spaces = 2 +newline_style = "Auto" +use_small_heuristics = "Max" +indent_style = "Block" +wrap_comments = false +format_code_in_doc_comments = false +comment_width = 80 +normalize_comments = false +normalize_doc_attributes = false +license_template_path = "" +format_strings = false +format_macro_matchers = false +format_macro_bodies = true +empty_item_single_line = true 
+struct_lit_single_line = true +fn_single_line = true +where_single_line = true +imports_indent = "Block" +imports_layout = "Mixed" +imports_granularity = "Preserve" +group_imports = "Preserve" +reorder_imports = true +reorder_modules = true +reorder_impl_items = false +type_punctuation_density = "Wide" +space_before_colon = false +space_after_colon = true +spaces_around_ranges = false +binop_separator = "Front" +remove_nested_parens = true +combine_control_expr = true +overflow_delimited_expr = false +struct_field_align_threshold = 0 +enum_discrim_align_threshold = 0 +match_arm_blocks = true +match_arm_leading_pipes = "Never" +force_multiline_blocks = false +fn_args_layout = "Compressed" +brace_style = "PreferSameLine" +control_brace_style = "AlwaysSameLine" +trailing_semicolon = true +trailing_comma = "Vertical" +match_block_trailing_comma = false +blank_lines_upper_bound = 1 +blank_lines_lower_bound = 0 +edition = "2015" +version = "One" +inline_attribute_width = 0 +merge_derives = true +use_try_shorthand = false +use_field_init_shorthand = false +force_explicit_abi = true +condense_wildcard_suffixes = false +color = "Auto" +required_version = "1.4.38" +unstable_features = false +disable_all_formatting = false +skip_children = false +hide_parse_errors = false +error_on_line_overflow = false +error_on_unformatted = false +report_todo = "Never" +report_fixme = "Never" +ignore = [] +emit_mode = "Files" +make_backup = false diff --git a/src/rust/rust-stream/src/arc_stream.rs b/src/rust/rust-stream/src/arc_stream.rs new file mode 100644 index 0000000..006f73a --- /dev/null +++ b/src/rust/rust-stream/src/arc_stream.rs @@ -0,0 +1,254 @@ +use std::iter::Sum; +use std::sync::{Arc, Mutex}; + +use self::core_affinity::CoreId; +use crate::stream::{AllocatorType, ArrayType, RustStream, StreamData}; + +struct ArcHeapData { + a_chunks: Vec>>>, + b_chunks: Vec>>>, + c_chunks: Vec>>>, +} + +pub struct ArcDevice { + pub(crate) ncore: usize, + pub(crate) pin: bool, + pub(crate) 
core_ids: Vec, + data: ArcHeapData, +} + +impl ArcDevice { + pub fn new(ncore: usize, pin: bool, alloc: A) -> Self { + let mut core_ids = match core_affinity::get_core_ids() { + Some(xs) => xs, + None => { + colour::e_red_ln!("Cannot enumerate cores, pinning will not work if enabled"); + (0..ncore).map(|i| CoreId { id: i }).collect() + } + }; + core_ids.resize(ncore, core_ids[0]); + + let lift = + || (0..ncore).map(|_| return Arc::new(Mutex::new(Vec::new_in(alloc)))).collect::>(); + let data = ArcHeapData { a_chunks: lift(), b_chunks: lift(), c_chunks: lift() }; + + ArcDevice { ncore, pin, core_ids, data } + } + + pub fn ref_a(&self, t: usize) -> Arc>> { self.data.a_chunks[t].clone() } + + pub fn ref_b(&self, t: usize) -> Arc>> { self.data.b_chunks[t].clone() } + + pub fn ref_c(&self, t: usize) -> Arc>> { self.data.c_chunks[t].clone() } + + // divide the length by the number of cores, the last core gets less work if it does not divide + fn chunk_size(&self, len: usize, t: usize) -> usize { + assert!(t < self.ncore); + let chunk = (len as f64 / self.ncore as f64).ceil() as usize; + if t == self.ncore - 1 { + len - (t * chunk) + } else { + chunk + } + } +} + +extern crate core_affinity; + +// Arc+Mutex threaded version, it should be semantically equal to the single threaded version +impl + RustStream for StreamData, A> +{ + fn init_arrays(&mut self) { + let init = self.init; + let pin = self.device.pin; + (0..self.device.ncore) + .map(&|t| { + let ref_a = self.device.ref_a(t); + let ref_b = self.device.ref_b(t); + let ref_c = self.device.ref_c(t); + let core = self.device.core_ids[t]; + let n = self.device.chunk_size(self.size, t); + std::thread::spawn(move || { + if pin { + core_affinity::set_for_current(core); + } + ref_a.lock().unwrap().resize(n, init.0); + ref_b.lock().unwrap().resize(n, init.1); + ref_c.lock().unwrap().resize(n, init.2); + }) + }) + .collect::>() + .into_iter() + .for_each(|t| t.join().unwrap()); + } + fn read_arrays(&mut self) { + let range = 
self.size; + let unlift = |drain: &mut Vec, source: &Vec>>>| { + let xs = + source.into_iter().flat_map(|x| x.lock().unwrap().clone().into_iter()).collect::>(); + for i in 0..range { + drain[i] = xs[i]; + } + }; + unlift(&mut self.a, &self.device.data.a_chunks); + unlift(&mut self.b, &self.device.data.b_chunks); + unlift(&mut self.c, &self.device.data.c_chunks); + } + + fn copy(&mut self) { + let pin = self.device.pin; + (0..self.device.ncore) + .map(move |t| { + let ref_a = self.device.ref_a(t); + let ref_c = self.device.ref_c(t); + let core = self.device.core_ids[t]; + let n = self.device.chunk_size(self.size, t); + std::thread::spawn(move || { + if pin { + core_affinity::set_for_current(core); + } + let a = ref_a.lock().unwrap(); + let mut c = ref_c.lock().unwrap(); + for i in 0..n { + c[i] = a[i]; + } + }) + }) + .collect::>() + .into_iter() + .for_each(|t| t.join().unwrap()); + } + + fn mul(&mut self) { + let scalar = self.scalar; + let pin = self.device.pin; + (0..self.device.ncore) + .map(move |t| { + let ref_b = self.device.ref_b(t); + let ref_c = self.device.ref_c(t); + let core = self.device.core_ids[t]; + let n = self.device.chunk_size(self.size, t); + std::thread::spawn(move || { + if pin { + core_affinity::set_for_current(core); + } + let mut b = ref_b.lock().unwrap(); + let c = ref_c.lock().unwrap(); + for i in 0..n { + b[i] = scalar * c[i]; + } + }) + }) + .collect::>() + .into_iter() + .for_each(|t| t.join().unwrap()); + } + + fn add(&mut self) { + let pin = self.device.pin; + (0..self.device.ncore) + .map(&|t| { + let ref_a = self.device.ref_a(t); + let ref_b = self.device.ref_b(t); + let ref_c = self.device.ref_c(t); + let core = self.device.core_ids[t]; + let n = self.device.chunk_size(self.size, t); + std::thread::spawn(move || { + if pin { + core_affinity::set_for_current(core); + } + let a = ref_a.lock().unwrap(); + let b = ref_b.lock().unwrap(); + let mut c = ref_c.lock().unwrap(); + for i in 0..n { + c[i] = a[i] + b[i]; + } + }) + }) + 
.collect::>() + .into_iter() + .for_each(|t| t.join().unwrap()); + } + + fn triad(&mut self) { + let scalar = self.scalar; + let pin = self.device.pin; + (0..self.device.ncore) + .map(&|t| { + let ref_a = self.device.ref_a(t); + let ref_b = self.device.ref_b(t); + let ref_c = self.device.ref_c(t); + let core = self.device.core_ids[t]; + let n = self.device.chunk_size(self.size, t); + std::thread::spawn(move || { + if pin { + core_affinity::set_for_current(core); + } + let mut a = ref_a.lock().unwrap(); + let b = ref_b.lock().unwrap(); + let c = ref_c.lock().unwrap(); + for i in 0..n { + a[i] = b[i] + scalar * c[i] + } + }) + }) + .collect::>() + .into_iter() + .for_each(|t| t.join().unwrap()); + } + + fn nstream(&mut self) { + let scalar = self.scalar; + let pin = self.device.pin; + (0..self.device.ncore) + .map(&|t| { + let ref_a = self.device.ref_a(t); + let ref_b = self.device.ref_b(t); + let ref_c = self.device.ref_c(t); + let core = self.device.core_ids[t]; + let n = self.device.chunk_size(self.size, t); + std::thread::spawn(move || { + if pin { + core_affinity::set_for_current(core); + } + let mut a = ref_a.lock().unwrap(); + let b = ref_b.lock().unwrap(); + let c = ref_c.lock().unwrap(); + for i in 0..n { + a[i] += b[i] + scalar * c[i] + } + }) + }) + .collect::>() + .into_iter() + .for_each(|t| t.join().unwrap()); + } + + fn dot(&mut self) -> T { + let pin = self.device.pin; + (0..self.device.ncore) + .map(&|t| { + let ref_a = self.device.ref_a(t); + let ref_b = self.device.ref_b(t); + let core = self.device.core_ids[t]; + let n = self.device.chunk_size(self.size, t); + std::thread::spawn(move || { + if pin { + core_affinity::set_for_current(core); + } + let a = ref_a.lock().unwrap(); + let b = ref_b.lock().unwrap(); + let mut p = T::default(); + for i in 0..n { + p += a[i] * b[i]; + } + p + }) + }) + .collect::>() + .into_iter() + .map(|t| t.join().unwrap()) + .sum() + } +} diff --git a/src/rust/rust-stream/src/crossbeam_stream.rs 
b/src/rust/rust-stream/src/crossbeam_stream.rs new file mode 100644 index 0000000..44358ae --- /dev/null +++ b/src/rust/rust-stream/src/crossbeam_stream.rs @@ -0,0 +1,221 @@ +use std::iter::Sum; +use std::slice::{Chunks, ChunksMut}; + +use crossbeam::thread; + +use self::core_affinity::CoreId; +use crate::stream::{AllocatorType, ArrayType, RustStream, StreamData}; + +pub struct CrossbeamDevice { + pub(crate) ncore: usize, + pub(crate) pin: bool, + pub(crate) core_ids: Vec, +} + +impl CrossbeamDevice { + pub fn new(ncore: usize, pin: bool) -> Self { + let mut core_ids = match core_affinity::get_core_ids() { + Some(xs) => xs, + None => { + colour::e_red_ln!("Cannot enumerate cores, pinning will not work if enabled"); + (0..ncore).map(|i| CoreId { id: i }).collect() + } + }; + core_ids.resize(ncore, core_ids[0]); + CrossbeamDevice { ncore, pin, core_ids } + } +} + +impl CrossbeamDevice { + // divide the length by the number of cores, the last core gets less work if it does not divide + fn chunk_size(&self, len: usize) -> usize { (len as f64 / self.ncore as f64).ceil() as usize } + + // make a mutable chunk from the vec + fn mk_mut_chunks<'a, T, A: AllocatorType>(&self, xs: &'a mut Vec) -> ChunksMut<'a, T> { + let len = xs.len(); + xs.chunks_mut(self.chunk_size(len)) + } + + // make a immutable chunk from the vec + fn mk_chunks<'a, T, A: AllocatorType>(&self, xs: &'a mut Vec) -> Chunks<'a, T> { + xs.chunks(self.chunk_size(xs.len())) + } +} + +extern crate core_affinity; + +// Crossbeam threaded version, it should be semantically equal to the single threaded version +impl RustStream + for StreamData +{ + fn init_arrays(&mut self) { + thread::scope(|s| { + let init = self.init; + let pin = self.device.pin; + for (t, ((a, b), c)) in self.device.core_ids.iter().zip( + self + .device + .mk_mut_chunks(&mut self.a) + .zip(self.device.mk_mut_chunks(&mut self.b)) + .zip(self.device.mk_mut_chunks(&mut self.c)), + ) { + s.spawn(move |_| { + if pin { + 
core_affinity::set_for_current(*t); + } + for x in a.into_iter() { + *x = init.0; + } + for x in b.into_iter() { + *x = init.1; + } + for x in c.into_iter() { + *x = init.2; + } + }); + } + }) + .unwrap() + } + + fn copy(&mut self) { + thread::scope(|s| { + let pin = self.device.pin; + for (t, (c, a)) in self + .device + .core_ids + .iter() + .zip(self.device.mk_mut_chunks(&mut self.c).zip(self.device.mk_chunks(&mut self.a))) + { + s.spawn(move |_| { + if pin { + core_affinity::set_for_current(*t); + } + for i in 0..c.len() { + c[i] = a[i]; + } + }); + } + }) + .unwrap() + } + + fn mul(&mut self) { + thread::scope(|s| { + let pin = self.device.pin; + let scalar = self.scalar; + for (t, (b, c)) in self + .device + .core_ids + .iter() + .zip(self.device.mk_mut_chunks(&mut self.b).zip(self.device.mk_chunks(&mut self.c))) + { + s.spawn(move |_| { + if pin { + core_affinity::set_for_current(*t); + } + for i in 0..b.len() { + b[i] = scalar * c[i]; + } + }); + } + }) + .unwrap() + } + + fn add(&mut self) { + thread::scope(|s| { + let pin = self.device.pin; + for (t, (c, (a, b))) in (&mut self.device.core_ids.iter()).zip( + self + .device + .mk_mut_chunks(&mut self.c) + .zip(self.device.mk_chunks(&mut self.a).zip(self.device.mk_chunks(&mut self.b))), + ) { + s.spawn(move |_| { + if pin { + core_affinity::set_for_current(*t); + } + for i in 0..c.len() { + c[i] = a[i] + b[i]; + } + }); + } + }) + .unwrap() + } + + fn triad(&mut self) { + thread::scope(|s| { + let pin = self.device.pin; + let scalar = self.scalar; + for (t, (a, (b, c))) in self.device.core_ids.iter().zip( + self + .device + .mk_mut_chunks(&mut self.a) + .zip(self.device.mk_chunks(&mut self.b).zip(self.device.mk_chunks(&mut self.c))), + ) { + s.spawn(move |_| { + if pin { + core_affinity::set_for_current(*t); + } + for i in 0..a.len() { + a[i] = b[i] + scalar * c[i] + } + }); + } + }) + .unwrap() + } + + fn nstream(&mut self) { + thread::scope(|s| { + let pin = self.device.pin; + let scalar = self.scalar; + 
for (t, (a, (b, c))) in self.device.core_ids.iter().zip( + self + .device + .mk_mut_chunks(&mut self.a) + .zip(self.device.mk_chunks(&mut self.b).zip(self.device.mk_chunks(&mut self.c))), + ) { + s.spawn(move |_| { + if pin { + core_affinity::set_for_current(*t); + } + for i in 0..a.len() { + a[i] += b[i] + scalar * c[i] + } + }); + } + }) + .unwrap() + } + + fn dot(&mut self) -> T { + let mut partial_sum = vec![T::zero(); self.device.ncore]; + thread::scope(|s| { + let pin = self.device.pin; + let a = &self.a; + let b = &self.b; + let chunk_indices = |i: usize| { + let chunk_size = self.device.chunk_size(self.size); + let start = i * chunk_size; + start..((start + chunk_size).min(self.size)) + }; + for (t, (n, acc)) in self.device.core_ids.iter().zip(partial_sum.iter_mut().enumerate()) { + s.spawn(move |_| { + if pin { + core_affinity::set_for_current(*t); + } + let mut p = T::zero(); + for i in chunk_indices(n) { + p += a[i] * b[i]; + } + *acc = p; + }); + } + }) + .unwrap(); + partial_sum.into_iter().sum() + } +} diff --git a/src/rust/rust-stream/src/lib.rs b/src/rust/rust-stream/src/lib.rs new file mode 100644 index 0000000..3ac72c3 --- /dev/null +++ b/src/rust/rust-stream/src/lib.rs @@ -0,0 +1,435 @@ +#![feature(allocator_api)] +#![feature(vec_into_raw_parts)] + +use std::alloc::System; +use std::env; +use std::fmt::{Debug, Display}; +use std::iter::Sum; +use std::mem::size_of; +use std::time::Duration; + +use num_traits::abs; +use structopt::StructOpt; +use tabular::{Row, Table}; + +use crate::arc_stream::ArcDevice; +use crate::crossbeam_stream::CrossbeamDevice; +use crate::plain_stream::SerialDevice; +use crate::rayon_stream::RayonDevice; +use crate::stream::{AllocatorType, ArrayType, RustStream, StreamData}; +use crate::unsafe_stream::UnsafeDevice; + +mod arc_stream; +mod crossbeam_stream; +mod plain_stream; +mod rayon_stream; +mod stream; +mod unsafe_stream; + +#[derive(Debug, StructOpt)] +struct Options { + /// List available devices + #[structopt(long)] 
+ list: bool, + /// Select device at + #[structopt(long, default_value = "0")] + device: usize, + /// Run the test times (NUM >= 2) + #[structopt(long, short = "n", default_value = "100")] + numtimes: usize, + /// Use elements in the array + #[structopt(long, short = "s", default_value = "33554432")] + arraysize: usize, + /// Use floats (rather than doubles) + #[structopt(long)] + float: bool, + /// Only run triad + #[structopt(long)] + triad_only: bool, + /// Only run nstream + #[structopt(long)] + nstream_only: bool, + /// Output as csv table + #[structopt(long)] + csv: bool, + /// Use MiB=2^20 for bandwidth calculation (default MB=10^6) + #[structopt(long)] + mibibytes: bool, + /// Use libc malloc instead of the Rust's allocator for benchmark array allocation + #[structopt(name = "malloc", long)] + malloc: bool, + /// Initialise each benchmark array at allocation time on the main thread + #[structopt(name = "init", long)] + init: bool, + /// Pin threads to distinct cores, this has NO effect in Rayon devices + #[structopt(long)] + pin: bool, +} + +#[derive(PartialEq)] +enum Benchmark { + All, + Triad, + NStream, +} + +fn check_solution, D, A: AllocatorType>( + benchmark: Benchmark, numtimes: usize, vec: &StreamData, dot_sum: Option, +) -> bool { + let (mut gold_a, mut gold_b, mut gold_c) = vec.init; + for _ in 0..numtimes { + match benchmark { + Benchmark::All => { + gold_c = gold_a; + gold_b = vec.scalar * gold_c; + gold_c = gold_a + gold_b; + gold_a = gold_b + vec.scalar * gold_c; + } + Benchmark::Triad => { + gold_a = gold_b + vec.scalar * gold_c; + } + Benchmark::NStream => { + gold_a += gold_b + vec.scalar * gold_c; + } + }; + } + let tolerance = T::epsilon().into() * 100.0f64; + let validate_xs = |name: &str, xs: &Vec, from: T| { + let error = (xs.iter().map(|x| abs(*x - from)).sum::()).into() / xs.len() as f64; + let fail = error > tolerance; + if fail { + eprintln!("Validation failed on {}[]. 
Average error {} ", name, error); + } + !fail + }; + let a_ok = validate_xs("a", &vec.a, gold_a); + let b_ok = validate_xs("b", &vec.b, gold_b); + let c_ok = validate_xs("c", &vec.c, gold_c); + let dot_ok = dot_sum.map_or(true, |sum| { + let gold_sum = (gold_a * gold_b).into() * vec.size as f64; + let error = abs((sum.into() - gold_sum) / gold_sum); + let fail = error > 1.0e-8; + if fail { + eprintln!( + "Validation failed on sum. Error {} \nSum was {} but should be {}", + error, sum, gold_sum + ); + } + !fail + }); + + a_ok && b_ok && c_ok && dot_ok +} + +fn run_cpu + Display, D, A: AllocatorType>( + option: &Options, mut stream: StreamData, +) -> bool +where StreamData: RustStream { + let benchmark = match (option.nstream_only, option.triad_only) { + (true, false) => Benchmark::NStream, + (false, true) => Benchmark::Triad, + (false, false) => Benchmark::All, + (true, true) => { + panic!("Both triad and nstream are enabled, pick one or omit both to run all benchmarks") + } + }; + + let array_bytes = option.arraysize * size_of::(); + let total_bytes = array_bytes * 3; + let (mega_scale, mega_suffix, giga_scale, giga_suffix) = if !option.mibibytes { + (1.0e-6, "MB", 1.0e-9, "GB") + } else { + (2f64.powi(-20), "MiB", 2f64.powi(-30), "GiB") + }; + + if !option.csv { + println!( + "Running {} {} times", + match benchmark { + Benchmark::All => "kernels", + Benchmark::Triad => "triad", + Benchmark::NStream => "nstream", + }, + option.numtimes + ); + + if benchmark == Benchmark::Triad { + println!("Number of elements: {}", option.arraysize); + } + + println!("Precision: {}", if option.float { "float" } else { "double" }); + println!( + "Array size: {:.1} {} (={:.1} {})", + mega_scale * array_bytes as f64, + mega_suffix, + giga_scale * array_bytes as f64, + giga_suffix + ); + println!( + "Total size: {:.1} {} (={:.1} {})", + mega_scale * total_bytes as f64, + mega_suffix, + giga_scale * total_bytes as f64, + giga_suffix + ); + } + + stream.init_arrays(); + + let tabulate = 
|xs: &Vec, name: &str, t_size: usize| -> Vec<(&str, String)> { + let tail = &xs[1..]; // tail only + + // do stats + let max = tail.iter().max().map(|d| d.as_secs_f64()); + let min = tail.iter().min().map(|d| d.as_secs_f64()); + match (min, max) { + (Some(min), Some(max)) => { + let avg: f64 = tail.iter().map(|d| d.as_secs_f64()).sum::() / tail.len() as f64; + let mbps = mega_scale * (t_size as f64) / min; + if option.csv { + vec![ + ("function", name.to_string()), + ("num_times", option.numtimes.to_string()), + ("n_elements", option.arraysize.to_string()), + ("sizeof", t_size.to_string()), + ( + if option.mibibytes { "max_mibytes_per_sec" } else { "max_mbytes_per_sec" }, + mbps.to_string(), + ), + ("min_runtime", min.to_string()), + ("max_runtime", max.to_string()), + ("avg_runtime", avg.to_string()), + ] + } else { + vec![ + ("Function", name.to_string()), + (if option.mibibytes { "MiBytes/sec" } else { "MBytes/sec" }, format!("{:.3}", mbps)), + ("Min (sec)", format!("{:.5}", min)), + ("Max", format!("{:.5}", max)), + ("Average", format!("{:.5}", avg)), + ] + } + } + (_, _) => panic!("No min/max element for {}(size={})", name, t_size), + } + }; + + let tabulate_all = |xs: Vec>| { + match xs.as_slice() { + [head, ..] 
=> { + if option.csv { + println!("{}", head.iter().map(|(col, _)| *col).collect::>().join(",")); + for kvs in xs { + println!("{}", kvs.iter().map(|(_, val)| val.clone()).collect::>().join(",")); + } + } else { + let mut table = Table::new(&vec!["{:<}"; head.len()].join(" ")); + table.add_row(head.iter().fold(Row::new(), |row, (col, _)| row.with_cell(col))); + for kvs in xs { + table.add_row(kvs.iter().fold(Row::new(), |row, (_, val)| row.with_cell(val))); + } + print!("{}", table); + } + } + _ => panic!("Empty tabulation"), + }; + }; + + let solutions_correct = match benchmark { + Benchmark::All => { + let (results, sum) = stream.run_all(option.numtimes); + stream.read_arrays(); + let correct = check_solution(benchmark, option.numtimes, &stream, Some(sum)); + tabulate_all(vec![ + tabulate(&results.copy, "Copy", 2 * array_bytes), + tabulate(&results.mul, "Mul", 2 * array_bytes), + tabulate(&results.add, "Add", 3 * array_bytes), + tabulate(&results.triad, "Triad", 3 * array_bytes), + tabulate(&results.dot, "Dot", 2 * array_bytes), + ]); + correct + } + Benchmark::NStream => { + let results = stream.run_nstream(option.numtimes); + stream.read_arrays(); + let correct = check_solution(benchmark, option.numtimes, &stream, None); + tabulate_all(vec![tabulate(&results, "Nstream", 4 * array_bytes)]); + correct + } + Benchmark::Triad => { + let results = stream.run_triad(option.numtimes); + stream.read_arrays(); + let correct = check_solution(benchmark, option.numtimes, &stream, None); + let total_bytes = 3 * array_bytes * option.numtimes; + let bandwidth = giga_scale * (total_bytes as f64 / results.as_secs_f64()); + println!("Runtime (seconds): {:.5}", results.as_secs_f64()); + println!("Bandwidth ({}/s): {:.3} ", giga_suffix, bandwidth); + correct + } + }; + stream.clean_up(); + solutions_correct +} + +const VERSION: Option<&'static str> = option_env!("CARGO_PKG_VERSION"); + +static START_A: f32 = 0.1; +static START_B: f32 = 0.2; +static START_C: f32 = 0.0; +static 
START_SCALAR: f32 = 0.4; + +static FLOAT_INIT_SCALAR: f32 = START_SCALAR; +static FLOAT_INIT: (f32, f32, f32) = (START_A, START_B, START_C); + +static DOUBLE_INIT_SCALAR: f64 = START_SCALAR as f64; +static DOUBLE_INIT: (f64, f64, f64) = (START_A as f64, START_B as f64, START_C as f64); + +pub fn run(args: &Vec) -> bool { + let opt: Options = Options::from_iter(args); + + if opt.numtimes < 2 { + panic!("numtimes must be >= 2") + } + + let alloc = System; + let alloc_name = if opt.malloc { "libc-malloc" } else { "rust-system" }; + + fn mk_data( + opt: &Options, init: (T, T, T), scalar: T, dev: D, alloc: A, + ) -> StreamData { + StreamData::new_in(opt.arraysize, scalar, init, dev, alloc, opt.malloc, opt.init) + } + + let num_thread_key = "BABELSTREAM_NUM_THREADS"; + let max_ncores = num_cpus::get(); + let ncores = match env::var(num_thread_key) { + Ok(v) => match v.parse::() { + Err(bad) => { + colour::e_yellow_ln!( + "Cannot parse {} (reason: {}), defaulting to {}", + bad, + num_thread_key, + max_ncores + ); + max_ncores + } + Ok(n) if n <= 0 || n > max_ncores as i64 => { + println!("{} out of bound ({}), defaulting to {}", num_thread_key, n, max_ncores); + max_ncores + } + Ok(n) => n as usize, + }, + Err(_) => { + println!("{} not set, defaulting to max ({})", num_thread_key, max_ncores); + max_ncores + } + }; + + let rayon_device = &|| { + let rayon_num_thread_key = "RAYON_NUM_THREADS"; + if env::var(rayon_num_thread_key).is_ok() { + colour::e_yellow_ln!("{} is ignored, set {} instead", rayon_num_thread_key, num_thread_key) + } + let dev = RayonDevice { + pool: rayon::ThreadPoolBuilder::default().num_threads(ncores).build().unwrap(), + }; + if !opt.csv { + println!("Using {} thread(s), alloc={}", dev.pool.current_num_threads(), alloc_name); + if opt.pin { + colour::e_yellow_ln!("Pinning threads have no effect on Rayon!") + } + } + if opt.float { + run_cpu(&opt, mk_data(&opt, FLOAT_INIT, FLOAT_INIT_SCALAR, dev, alloc)) + } else { + run_cpu(&opt, mk_data(&opt, 
DOUBLE_INIT, DOUBLE_INIT_SCALAR, dev, alloc)) + } + }; + + let arc_device = &|| { + if !opt.csv { + println!("Using {} thread, pin={}, alloc={}", ncores, opt.pin, alloc_name); + } + if opt.float { + let dev = ArcDevice::::new(ncores, opt.pin, alloc); + run_cpu(&opt, mk_data(&opt, FLOAT_INIT, FLOAT_INIT_SCALAR, dev, alloc)) + } else { + let dev = ArcDevice::::new(ncores, opt.pin, alloc); + run_cpu(&opt, mk_data(&opt, DOUBLE_INIT, DOUBLE_INIT_SCALAR, dev, alloc)) + } + }; + + let unsafe_device = &|| { + if !opt.csv { + println!("Using {} thread, pin={}, alloc={}", ncores, opt.pin, alloc_name); + } + if opt.float { + let dev = UnsafeDevice::::new(ncores, opt.pin); + run_cpu(&opt, mk_data(&opt, FLOAT_INIT, FLOAT_INIT_SCALAR, dev, alloc)) + } else { + let dev = UnsafeDevice::::new(ncores, opt.pin); + run_cpu(&opt, mk_data(&opt, DOUBLE_INIT, DOUBLE_INIT_SCALAR, dev, alloc)) + } + }; + + let crossbeam_device = &|| { + let dev = CrossbeamDevice::new(ncores, opt.pin); + if !opt.csv { + println!("Using {} thread(s), pin={}, alloc={}", ncores, opt.pin, alloc_name) + } + if opt.float { + run_cpu(&opt, mk_data(&opt, FLOAT_INIT, FLOAT_INIT_SCALAR, dev, alloc)) + } else { + run_cpu(&opt, mk_data(&opt, DOUBLE_INIT, DOUBLE_INIT_SCALAR, dev, alloc)) + } + }; + + let st_device = &|| { + let dev = SerialDevice { pin: opt.pin }; + if !opt.csv { + println!("Using 1 thread, pin={}, alloc={}", opt.pin, alloc_name); + } + if opt.float { + run_cpu(&opt, mk_data(&opt, FLOAT_INIT, FLOAT_INIT_SCALAR, dev, alloc)) + } else { + run_cpu(&opt, mk_data(&opt, DOUBLE_INIT, DOUBLE_INIT_SCALAR, dev, alloc)) + } + }; + + let devices: Vec<(String, &'_ dyn Fn() -> bool)> = vec![ + ("CPU (Single threaded)".to_string(), st_device), + ("CPU (Rayon)".to_string(), rayon_device), + (format!("CPU (Arc, pinning={})", opt.pin), arc_device), + (format!("CPU (Unsafe, pinning={})", opt.pin), unsafe_device), + (format!("CPU (Crossbeam, pinning={})", opt.pin), crossbeam_device), + ]; + + if opt.list { + 
devices.iter().enumerate().for_each(|(i, (name, _))| { + println!("[{}] {}", i, name); + }); + true + } else { + match devices.get(opt.device) { + Some((name, run)) => { + if !&opt.csv { + println!( + "BabelStream\n\ + Version: {}\n\ + Implementation: Rust; {}", + VERSION.unwrap_or("unknown"), + name + ); + if opt.init { + println!("Initialising arrays on main thread"); + } + } + run() + } + None => { + eprintln!("Device index {} not available", opt.device); + false + } + } + } +} diff --git a/src/rust/rust-stream/src/main.rs b/src/rust/rust-stream/src/main.rs new file mode 100644 index 0000000..8c99087 --- /dev/null +++ b/src/rust/rust-stream/src/main.rs @@ -0,0 +1,5 @@ +fn main() { + if !rust_stream::run(&std::env::args().collect::>()) { + std::process::exit(1); + } +} diff --git a/src/rust/rust-stream/src/plain_stream.rs b/src/rust/rust-stream/src/plain_stream.rs new file mode 100644 index 0000000..135a7bc --- /dev/null +++ b/src/rust/rust-stream/src/plain_stream.rs @@ -0,0 +1,61 @@ +use crate::stream::{AllocatorType, ArrayType, RustStream, StreamData}; +use core_affinity::CoreId; + +pub struct SerialDevice { + pub(crate) pin: bool, +} + +// single threaded version +impl RustStream for StreamData { + fn init_arrays(&mut self) { + if self.device.pin { + core_affinity::set_for_current( + match core_affinity::get_core_ids().as_ref().map(|x| x.first()) { + Some(Some(x)) => *x, + _ => CoreId { id: 0 }, + }, + ); + } + self.a.fill(self.init.0); + self.b.fill(self.init.1); + self.c.fill(self.init.2); + } + + fn copy(&mut self) { + for i in 0..self.size { + self.c[i] = self.a[i]; + } + } + + fn mul(&mut self) { + for i in 0..self.size { + self.b[i] = self.scalar * self.c[i]; + } + } + + fn add(&mut self) { + for i in 0..self.size { + self.c[i] = self.a[i] + self.b[i]; + } + } + + fn triad(&mut self) { + for i in 0..self.size { + self.a[i] = self.b[i] + self.scalar * self.c[i]; + } + } + + fn nstream(&mut self) { + for i in 0..self.size { + self.a[i] += self.b[i] + 
self.scalar * self.c[i]; + } + } + + fn dot(&mut self) -> T { + let mut sum = T::default(); + for i in 0..self.size { + sum += self.a[i] * self.b[i]; + } + sum + } +} diff --git a/src/rust/rust-stream/src/rayon_stream.rs b/src/rust/rust-stream/src/rayon_stream.rs new file mode 100644 index 0000000..d25d115 --- /dev/null +++ b/src/rust/rust-stream/src/rayon_stream.rs @@ -0,0 +1,77 @@ +use std::iter::Sum; + +use rayon::prelude::*; +use rayon::ThreadPool; + +use crate::stream::{AllocatorType, ArrayType, RustStream, StreamData}; + +pub struct RayonDevice { + pub(crate) pool: ThreadPool, +} + +// Rayon version, it should be semantically equal to the single threaded version +impl RustStream + for StreamData +{ + fn init_arrays(&mut self) { + let init = self.init; + self.a.par_iter_mut().for_each(|v| *v = init.0); + self.b.par_iter_mut().for_each(|v| *v = init.1); + self.c.par_iter_mut().for_each(|v| *v = init.2); + } + + fn copy(&mut self) { + let a = &self.a; + let c = &mut self.c; + self.device.pool.install(|| { + (*c).par_iter_mut().enumerate().for_each(|(i, c)| *c = a[i]); + }); + } + + fn mul(&mut self) { + let scalar = self.scalar; + let c = &self.c; + let b = &mut self.b; + self + .device + .pool + .install(|| (*b).par_iter_mut().enumerate().for_each(|(i, b)| *b = scalar * c[i])); + } + + fn add(&mut self) { + let a = &self.a; + let b = &self.b; + let c = &mut self.c; + self.device.pool.install(|| (*c).par_iter_mut().enumerate().for_each(|(i, c)| *c = a[i] + b[i])) + } + + fn triad(&mut self) { + let scalar = self.scalar; + let a = &mut self.a; + let b = &self.b; + let c = &self.c; + self + .device + .pool + .install(|| (*a).par_iter_mut().enumerate().for_each(|(i, a)| *a = b[i] + scalar * c[i])) + } + + fn nstream(&mut self) { + let scalar = self.scalar; + let a = &mut self.a; + let b = &self.b; + let c = &self.c; + self + .device + .pool + .install(|| (*a).par_iter_mut().enumerate().for_each(|(i, a)| *a += b[i] + scalar * c[i])) + } + + fn dot(&mut self) -> T { 
+ let a = &self.a; + let b = &self.b; + self.device.pool.install(|| { + (0..self.size).into_par_iter().fold(|| T::default(), |acc, i| acc + a[i] * b[i]).sum::() + }) + } +} diff --git a/src/rust/rust-stream/src/stream.rs b/src/rust/rust-stream/src/stream.rs new file mode 100644 index 0000000..560c6f1 --- /dev/null +++ b/src/rust/rust-stream/src/stream.rs @@ -0,0 +1,167 @@ +use num_traits::real::Real; +use num_traits::{NumAssign, Signed}; +use std::alloc::Allocator; +use std::fmt::Debug; +use std::time::{Duration, Instant}; + +pub trait AllocatorType: Allocator + Copy + Clone + Default + Debug {} +impl AllocatorType for T {} + +pub struct StreamData { + pub device: D, + pub size: usize, + pub scalar: T, + pub init: (T, T, T), + pub a: Vec, + pub b: Vec, + pub c: Vec, + pub needs_dealloc: bool, +} + +#[inline(always)] +fn timed(f: F) -> Duration { + let start = Instant::now(); + f(); + start.elapsed() +} + +#[inline(always)] +fn timed_mut T>(f: &mut F) -> (Duration, T) { + let start = Instant::now(); + let x = f(); + (start.elapsed(), x) +} + +pub struct AllTiming { + pub copy: T, + pub mul: T, + pub add: T, + pub triad: T, + pub dot: T, +} + +pub trait ArrayType: Real + NumAssign + Signed + Default + Debug {} +impl ArrayType for T {} + +impl StreamData { + pub fn new_in( + size: usize, + scalar: T, + init: (T, T, T), + device: D, + allocator: A, + malloc: bool, // + initialise: bool, // + ) -> StreamData { + let mk_vec = || { + if malloc { + extern crate libc; + use std::mem; + unsafe { + // we do the typical C malloc with a NULL check here + let bytes = mem::size_of::() * size; + let ptr = libc::malloc(bytes as libc::size_t) as *mut T; + if ptr.is_null() { + panic!( + "Cannot allocate {} bytes in `sizeof(T) * size` (T = {}, size = {})", + bytes, + mem::size_of::(), + size + ); + } + let mut xs = Vec::from_raw_parts_in(ptr, size, size, allocator); + if initialise { + xs.fill(T::default()); + } + xs + } + } else { + if initialise { + let mut xs = 
Vec::new_in(allocator); + xs.resize(size, T::default()); + xs + } else { + // try not to touch the vec after allocation + let mut xs = Vec::with_capacity_in(size, allocator); + unsafe { + xs.set_len(size); + } + xs + } + } + }; + + StreamData { + device, + size, + scalar, + init, + a: mk_vec(), + b: mk_vec(), + c: mk_vec(), + needs_dealloc: malloc, + } + } + pub fn clean_up(self) { + if self.needs_dealloc { + unsafe { + extern crate libc; + let free_ts = move |xs: Vec| { + // make sure we don't call dealloc for vec anymore + // XXX it's important we don't free xs.as_mut_ptr() here and use xs.into_raw_parts_with_alloc() + // as that function handles drops semantic for us + // if we free the the raw ptr directly, the compiler will still drop the vec and then segfault + let (ptr, _, _, _) = xs.into_raw_parts_with_alloc(); + libc::free(ptr as *mut libc::c_void); + }; + free_ts(self.a); + free_ts(self.b); + free_ts(self.c); + } + } + } +} + +pub trait RustStream { + fn init_arrays(&mut self); + fn read_arrays(&mut self) {} // default to no-op as most impl. 
doesn't need this + fn copy(&mut self); + fn mul(&mut self); + fn add(&mut self); + fn triad(&mut self); + fn nstream(&mut self); + fn dot(&mut self) -> T; + + fn run_all(&mut self, n: usize) -> (AllTiming>, T) { + let mut timings: AllTiming> = AllTiming { + copy: vec![Duration::default(); n], + mul: vec![Duration::default(); n], + add: vec![Duration::default(); n], + triad: vec![Duration::default(); n], + dot: vec![Duration::default(); n], + }; + let mut last_sum = T::default(); + for i in 0..n { + timings.copy[i] = timed(|| self.copy()); + timings.mul[i] = timed(|| self.mul()); + timings.add[i] = timed(|| self.add()); + timings.triad[i] = timed(|| self.triad()); + let (dot, sum) = timed_mut(&mut || self.dot()); + timings.dot[i] = dot; + last_sum = sum; + } + (timings, last_sum) + } + + fn run_triad(&mut self, n: usize) -> Duration { + timed(|| { + for _ in 0..n { + self.triad(); + } + }) + } + + fn run_nstream(&mut self, n: usize) -> Vec { + (0..n).map(|_| timed(|| self.nstream())).collect::>() + } +} diff --git a/src/rust/rust-stream/src/unsafe_stream.rs b/src/rust/rust-stream/src/unsafe_stream.rs new file mode 100644 index 0000000..968cc4e --- /dev/null +++ b/src/rust/rust-stream/src/unsafe_stream.rs @@ -0,0 +1,266 @@ +extern crate core_affinity; + +use std::alloc::Allocator; +use std::iter::Sum; +use std::ops::Range; + +use crate::stream::{AllocatorType, ArrayType, RustStream, StreamData}; + +use self::core_affinity::CoreId; + +#[derive(Debug, Copy, Clone)] +struct UnsafeData(*mut T, usize); + +impl UnsafeData { + fn empty() -> UnsafeData { UnsafeData(([] as [T; 0]).as_mut_ptr(), 0) } + fn new(xs: &mut Vec) -> UnsafeData { + UnsafeData(xs.as_mut_ptr(), xs.len()) + } + + fn get_slice(&self) -> &mut [T] { unsafe { std::slice::from_raw_parts_mut(self.0, self.1) } } +} + +unsafe impl Send for UnsafeData {} +unsafe impl Sync for UnsafeData {} + +#[derive(Debug, Copy, Clone)] +struct UnsafeRefs { + a: UnsafeData, + b: UnsafeData, + c: UnsafeData, +} + +unsafe impl 
Send for UnsafeRefs {} +unsafe impl Sync for UnsafeRefs {} + +pub struct UnsafeDevice { + pub(crate) ncore: usize, + pub(crate) pin: bool, + pub(crate) core_ids: Vec, + data: UnsafeRefs, +} + +impl UnsafeDevice { + pub fn new(ncore: usize, pin: bool) -> Self { + let mut core_ids = match core_affinity::get_core_ids() { + Some(xs) => xs, + None => { + colour::e_red_ln!("Cannot enumerate cores, pinning will not work if enabled"); + (0..ncore).map(|i| CoreId { id: i }).collect() + } + }; + core_ids.resize(ncore, core_ids[0]); + + UnsafeDevice { + ncore, + pin, + core_ids, + data: UnsafeRefs { a: UnsafeData::empty(), b: UnsafeData::empty(), c: UnsafeData::empty() }, + } + } + + fn thread_ranges(&self, len: usize) -> Vec<(usize, Range)> { + let chunk = (len as f64 / self.ncore as f64).ceil() as usize; + (0..self.ncore) + .map(|t| { + (t, if t == self.ncore - 1 { (t * chunk)..len } else { (t * chunk)..((t + 1) * chunk) }) + }) + .collect::>() + } +} + +// Unsafe threaded version, it should be semantically equal to the single threaded version +impl RustStream + for StreamData, A> +{ + fn init_arrays(&mut self) { + self.device.data.a = UnsafeData::new(&mut self.a); + self.device.data.b = UnsafeData::new(&mut self.b); + self.device.data.c = UnsafeData::new(&mut self.c); + let init = self.init; + let pin = self.device.pin; + let data = self.device.data; + self + .device + .thread_ranges(self.size) + .into_iter() + .map(|(t, r)| { + let core = self.device.core_ids[t]; + std::thread::spawn(move || { + if pin { + core_affinity::set_for_current(core); + } + let a = data.a.get_slice(); + let b = data.b.get_slice(); + let c = data.c.get_slice(); + for i in r { + a[i] = init.0; + b[i] = init.1; + c[i] = init.2; + } + }) + }) + .collect::>() + .into_iter() + .for_each(|t| t.join().unwrap()); + } + + fn copy(&mut self) { + let pin = self.device.pin; + let data = self.device.data; + self + .device + .thread_ranges(self.size) + .into_iter() + .map(|(t, r)| { + let core = 
self.device.core_ids[t]; + std::thread::spawn(move || { + if pin { + core_affinity::set_for_current(core); + } + let a = data.a.get_slice(); + let c = data.c.get_slice(); + for i in r { + c[i] = a[i]; + } + }) + }) + .collect::>() + .into_iter() + .for_each(|t| t.join().unwrap()); + } + + fn mul(&mut self) { + let scalar = self.scalar; + let pin = self.device.pin; + let data = self.device.data; + self + .device + .thread_ranges(self.size) + .into_iter() + .map(|(t, r)| { + let core = self.device.core_ids[t]; + std::thread::spawn(move || { + if pin { + core_affinity::set_for_current(core); + } + let b = data.b.get_slice(); + let c = data.c.get_slice(); + for i in r { + b[i] = scalar * c[i]; + } + }) + }) + .collect::>() + .into_iter() + .for_each(|t| t.join().unwrap()); + } + + fn add(&mut self) { + let pin = self.device.pin; + let data = self.device.data; + self + .device + .thread_ranges(self.size) + .into_iter() + .map(|(t, r)| { + let core = self.device.core_ids[t]; + std::thread::spawn(move || { + if pin { + core_affinity::set_for_current(core); + } + let a = data.a.get_slice(); + let b = data.b.get_slice(); + let c = data.c.get_slice(); + for i in r { + c[i] = a[i] + b[i]; + } + }) + }) + .collect::>() + .into_iter() + .for_each(|t| t.join().unwrap()); + } + + fn triad(&mut self) { + let scalar = self.scalar; + let pin = self.device.pin; + let data = self.device.data; + self + .device + .thread_ranges(self.size) + .into_iter() + .map(|(t, r)| { + let core = self.device.core_ids[t]; + std::thread::spawn(move || { + if pin { + core_affinity::set_for_current(core); + } + let a = data.a.get_slice(); + let b = data.b.get_slice(); + let c = data.c.get_slice(); + for i in r { + a[i] = b[i] + scalar * c[i] + } + }) + }) + .collect::>() + .into_iter() + .for_each(|t| t.join().unwrap()); + } + + fn nstream(&mut self) { + let scalar = self.scalar; + let pin = self.device.pin; + let data = self.device.data; + self + .device + .thread_ranges(self.size) + .into_iter() + 
.map(|(t, r)| { + let core = self.device.core_ids[t]; + std::thread::spawn(move || { + if pin { + core_affinity::set_for_current(core); + } + let a = data.a.get_slice(); + let b = data.b.get_slice(); + let c = data.c.get_slice(); + for i in r { + a[i] += b[i] + scalar * c[i] + } + }) + }) + .collect::>() + .into_iter() + .for_each(|t| t.join().unwrap()); + } + + fn dot(&mut self) -> T { + let pin = self.device.pin; + let data = self.device.data; + self + .device + .thread_ranges(self.size) + .into_iter() + .map(|(t, r)| { + let core = self.device.core_ids[t]; + std::thread::spawn(move || { + if pin { + core_affinity::set_for_current(core); + } + let a = data.a.get_slice(); + let b = data.b.get_slice(); + let mut p = T::default(); + for i in r { + p += a[i] * b[i]; + } + p + }) + }) + .collect::>() + .into_iter() + .map(|t| t.join().unwrap()) + .sum() + } +} diff --git a/src/rust/rust-stream/tests/integration_test.rs b/src/rust/rust-stream/tests/integration_test.rs new file mode 100644 index 0000000..8031a79 --- /dev/null +++ b/src/rust/rust-stream/tests/integration_test.rs @@ -0,0 +1,17 @@ +use rstest::rstest; + +#[rstest] +fn test_main( + #[values(0, 1, 2, 3, 4)] device: usize, // + #[values("", "--pin")] pin: &str, // + #[values("", "--malloc")] malloc: &str, // + #[values("", "--init")] init: &str, // + #[values("", "--triad-only", "--nstream-only")] option: &str, // +) { + let line = format!( + "rust-stream --arraysize 2048 --device {} {} {} {} {}", + device, pin, malloc, init, option + ); + let args = line.split_whitespace().map(|s| s.to_string()).collect::>(); + assert!(rust_stream::run(&args)); +}