From f7b54703d07ab89e92157706440e75104c4790aa Mon Sep 17 00:00:00 2001 From: Alex Crichton Date: Sun, 12 Oct 2014 17:25:51 -0700 Subject: [PATCH] bench: Improve the spectralnorm shootout benchmark This improves the spectralnorm shootout benchmark through a few vectors after looking at the leading C implementation: * The simd-based f64x2 is now used to parallelize a few computations * RWLock usage has been removed. A custom `parallel` function was added as a form of stack-based fork-join parallelism. I found that the contention on the locks was high as well as hindering other optimizations. This does, however, introduce one `unsafe` block into the benchmarks, which previously had none. In terms of timings, the before and after numbers are: ``` $ time ./shootout-spectralnorm-before ./shootout-spectralnorm-before 2.07s user 0.71s system 324% cpu 0.857 total $ time ./shootout-spectralnorm-before 5500 ./shootout-spectralnorm-before 5500 11.88s user 1.13s system 459% cpu 2.830 total $ time ./shootout-spectralnorm-after ./shootout-spectralnorm-after 0.58s user 0.01s system 280% cpu 0.210 tota $ time ./shootout-spectralnorm-after 5500 ./shootout-spectralnorm-after 5500 3.55s user 0.01s system 455% cpu 0.783 total ``` --- src/test/bench/shootout-spectralnorm.rs | 147 +++++++++++------------- 1 file changed, 68 insertions(+), 79 deletions(-) diff --git a/src/test/bench/shootout-spectralnorm.rs b/src/test/bench/shootout-spectralnorm.rs index 685e900b37eee..627742852b263 100644 --- a/src/test/bench/shootout-spectralnorm.rs +++ b/src/test/bench/shootout-spectralnorm.rs @@ -41,105 +41,94 @@ // no-pretty-expanded FIXME #15189 #![allow(non_snake_case)] +#![feature(unboxed_closures, overloaded_calls)] -use std::from_str::FromStr; -use std::iter::count; -use std::cmp::min; +use std::iter::AdditiveIterator; +use std::mem; use std::os; -use std::sync::{Arc, RWLock}; +use std::raw::Repr; +use std::simd::f64x2; -fn A(i: uint, j: uint) -> f64 { - ((i + j) * (i + j + 1) / 2 + i + 1) as f64 +fn main() { + let args = os::args(); + let answer = spectralnorm(if os::getenv("RUST_BENCH").is_some() { + 5500 + } else if args.len() < 2 { + 2000 + } else { + from_str(args[1].as_slice()).unwrap() + }); + println!("{:.9f}", answer); } -fn dot(v: &[f64], u: &[f64]) -> f64 { - let mut sum = 0.0; - for (&v_i, &u_i) in v.iter().zip(u.iter()) { - sum += v_i * u_i; +fn spectralnorm(n: uint) -> f64 { + assert!(n % 2 == 0, "only even lengths are accepted"); + let mut u = Vec::from_elem(n, 1.0); + let mut v = Vec::from_elem(n, 1.0); + let mut tmp = Vec::from_elem(n, 1.0); + for _ in range(0u, 10) { + mult_AtAv(u.as_slice(), v.as_mut_slice(), tmp.as_mut_slice()); + mult_AtAv(v.as_slice(), u.as_mut_slice(), tmp.as_mut_slice()); } - sum + (dot(u.as_slice(), v.as_slice()) / dot(v.as_slice(), v.as_slice())).sqrt() } -fn mult(v: Arc>>, out: Arc>>, - f: fn(&Vec, uint) -> f64) { - // We launch in different tasks the work to be done. To finish - // this function, we need to wait for the completion of every - // tasks. To do that, we give to each tasks a wait_chan that we - // drop at the end of the work. At the end of this function, we - // wait until the channel hang up. - let (tx, rx) = channel(); - - let len = out.read().len(); - let chunk = len / 20 + 1; - for chk in count(0, chunk) { - if chk >= len {break;} - let tx = tx.clone(); - let v = v.clone(); - let out = out.clone(); - spawn(proc() { - for i in range(chk, min(len, chk + chunk)) { - let val = f(&*v.read(), i); - *out.write().get_mut(i) = val; - } - drop(tx) - }); - } - - // wait until the channel hang up (every task finished) - drop(tx); - for () in rx.iter() {} +fn mult_AtAv(v: &[f64], out: &mut [f64], tmp: &mut [f64]) { + mult_Av(v, tmp); + mult_Atv(tmp, out); } -fn mult_Av_impl(v: &Vec , i: uint) -> f64 { - let mut sum = 0.; - for (j, &v_j) in v.iter().enumerate() { - sum += v_j / A(i, j); - } - sum +fn mult_Av(v: &[f64], out: &mut [f64]) { + parallel(out, |&: start, out| mult(v, out, start, |i, j| A(i, j))); } -fn mult_Av(v: Arc>>, out: Arc>>) { - mult(v, out, mult_Av_impl); +fn mult_Atv(v: &[f64], out: &mut [f64]) { + parallel(out, |&: start, out| mult(v, out, start, |i, j| A(j, i))); } -fn mult_Atv_impl(v: &Vec , i: uint) -> f64 { - let mut sum = 0.; - for (j, &v_j) in v.iter().enumerate() { - sum += v_j / A(j, i); +fn mult(v: &[f64], out: &mut [f64], start: uint, a: |uint, uint| -> f64) { + for (i, slot) in out.iter_mut().enumerate().map(|(i, s)| (i + start, s)) { + let mut sum = f64x2(0.0, 0.0); + for (j, chunk) in v.chunks(2).enumerate().map(|(j, s)| (2 * j, s)) { + let top = f64x2(chunk[0], chunk[1]); + let bot = f64x2(a(i, j), a(i, j + 1)); + sum += top / bot; + } + let f64x2(a, b) = sum; + *slot = a + b; } - sum } -fn mult_Atv(v: Arc>>, out: Arc>>) { - mult(v, out, mult_Atv_impl); +fn A(i: uint, j: uint) -> f64 { + ((i + j) * (i + j + 1) / 2 + i + 1) as f64 } -fn mult_AtAv(v: Arc>>, out: Arc>>, - tmp: Arc>>) { - mult_Av(v, tmp.clone()); - mult_Atv(tmp, out); +fn dot(v: &[f64], u: &[f64]) -> f64 { + v.iter().zip(u.iter()).map(|(a, b)| *a * *b).sum() } -fn main() { - let args = os::args(); - let args = args.as_slice(); - let n = if os::getenv("RUST_BENCH").is_some() { - 5500 - } else if args.len() < 2 { - 2000 - } else { - FromStr::from_str(args[1].as_slice()).unwrap() - }; - let u = Arc::new(RWLock::new(Vec::from_elem(n, 1f64))); - let v = Arc::new(RWLock::new(Vec::from_elem(n, 1f64))); - let tmp = Arc::new(RWLock::new(Vec::from_elem(n, 1f64))); - for _ in range(0u8, 10) { - mult_AtAv(u.clone(), v.clone(), tmp.clone()); - mult_AtAv(v.clone(), u.clone(), tmp.clone()); - } +// Executes a closure in parallel over the given mutable slice. The closure `f` +// is run in parallel and yielded the starting index within `v` as well as a +// sub-slice of `v`. +fn parallel<'a, T, F>(v: &'a mut [T], f: F) + where T: Send + Sync, + F: Fn(uint, &'a mut [T]) + Sync { + let (tx, rx) = channel(); + let size = v.len() / os::num_cpus() + 1; - let u = u.read(); - let v = v.read(); - println!("{:.9f}", (dot(u.as_slice(), v.as_slice()) / - dot(v.as_slice(), v.as_slice())).sqrt()); + for (i, chunk) in v.chunks_mut(size).enumerate() { + let tx = tx.clone(); + + // Need to convert `f` and `chunk` to something that can cross the task + // boundary. + let f = &f as *const _ as *const uint; + let raw = chunk.repr(); + spawn(proc() { + let f = f as *const F; + unsafe { (*f)(i * size, mem::transmute(raw)) } + drop(tx) + }); + } + drop(tx); + for () in rx.iter() {} }