From f7b54703d07ab89e92157706440e75104c4790aa Mon Sep 17 00:00:00 2001
From: Alex Crichton <alex@alexcrichton.com>
Date: Sun, 12 Oct 2014 17:25:51 -0700
Subject: [PATCH] bench: Improve the spectralnorm shootout benchmark

This improves the spectralnorm shootout benchmark through a few vectors after
looking at the leading C implementation:

* The simd-based f64x2 is now used to parallelize a few computations
* RWLock usage has been removed. A custom `parallel` function was added as a
  form of stack-based fork-join parallelism. I found that the contention on the
  locks was high as well as hindering other optimizations.

This does, however, introduce one `unsafe` block into the benchmarks, which
previously had none.

In terms of timings, the before and after numbers are:

```
$ time ./shootout-spectralnorm-before
./shootout-spectralnorm-before  2.07s user 0.71s system 324% cpu 0.857 total
$ time ./shootout-spectralnorm-before 5500
./shootout-spectralnorm-before 5500  11.88s user 1.13s system 459% cpu 2.830 total
$ time ./shootout-spectralnorm-after
./shootout-spectralnorm-after  0.58s user 0.01s system 280% cpu 0.210 tota
$ time ./shootout-spectralnorm-after 5500
./shootout-spectralnorm-after 5500  3.55s user 0.01s system 455% cpu 0.783 total
```
---
 src/test/bench/shootout-spectralnorm.rs | 147 +++++++++++-------------
 1 file changed, 68 insertions(+), 79 deletions(-)
diff --git a/src/test/bench/shootout-spectralnorm.rs b/src/test/bench/shootout-spectralnorm.rs
index 685e900b37eee..627742852b263 100644
--- a/src/test/bench/shootout-spectralnorm.rs
+++ b/src/test/bench/shootout-spectralnorm.rs
@@ -41,105 +41,94 @@
 // no-pretty-expanded FIXME #15189
 
 #![allow(non_snake_case)]
+#![feature(unboxed_closures, overloaded_calls)]
 
-use std::from_str::FromStr;
-use std::iter::count;
-use std::cmp::min;
+use std::iter::AdditiveIterator;
+use std::mem;
 use std::os;
-use std::sync::{Arc, RWLock};
+use std::raw::Repr;
+use std::simd::f64x2;
 
-fn A(i: uint, j: uint) -> f64 {
-    ((i + j) * (i + j + 1) / 2 + i + 1) as f64
+fn main() {
+    let args = os::args();
+    let answer = spectralnorm(if os::getenv("RUST_BENCH").is_some() {
+        5500
+    } else if args.len() < 2 {
+        2000
+    } else {
+        from_str(args[1].as_slice()).unwrap()
+    });
+    println!("{:.9f}", answer);
 }
 
-fn dot(v: &[f64], u: &[f64]) -> f64 {
-    let mut sum = 0.0;
-    for (&v_i, &u_i) in v.iter().zip(u.iter()) {
-        sum += v_i * u_i;
+fn spectralnorm(n: uint) -> f64 {
+    assert!(n % 2 == 0, "only even lengths are accepted");
+    let mut u = Vec::from_elem(n, 1.0);
+    let mut v = Vec::from_elem(n, 1.0);
+    let mut tmp = Vec::from_elem(n, 1.0);
+    for _ in range(0u, 10) {
+        mult_AtAv(u.as_slice(), v.as_mut_slice(), tmp.as_mut_slice());
+        mult_AtAv(v.as_slice(), u.as_mut_slice(), tmp.as_mut_slice());
     }
-    sum
+    (dot(u.as_slice(), v.as_slice()) / dot(v.as_slice(), v.as_slice())).sqrt()
 }
 
-fn mult(v: Arc<RWLock<Vec<f64>>>, out: Arc<RWLock<Vec<f64>>>,
-        f: fn(&Vec<f64>, uint) -> f64) {
-    // We launch in different tasks the work to be done.  To finish
-    // this function, we need to wait for the completion of every
-    // tasks.  To do that, we give to each tasks a wait_chan that we
-    // drop at the end of the work.  At the end of this function, we
-    // wait until the channel hang up.
-    let (tx, rx) = channel();
-
-    let len = out.read().len();
-    let chunk = len / 20 + 1;
-    for chk in count(0, chunk) {
-        if chk >= len {break;}
-        let tx = tx.clone();
-        let v = v.clone();
-        let out = out.clone();
-        spawn(proc() {
-            for i in range(chk, min(len, chk + chunk)) {
-                let val = f(&*v.read(), i);
-                *out.write().get_mut(i) = val;
-            }
-            drop(tx)
-        });
-    }
-
-    // wait until the channel hang up (every task finished)
-    drop(tx);
-    for () in rx.iter() {}
+fn mult_AtAv(v: &[f64], out: &mut [f64], tmp: &mut [f64]) {
+    mult_Av(v, tmp);
+    mult_Atv(tmp, out);
 }
 
-fn mult_Av_impl(v: &Vec<f64> , i: uint) -> f64 {
-    let mut sum = 0.;
-    for (j, &v_j) in v.iter().enumerate() {
-        sum += v_j / A(i, j);
-    }
-    sum
+fn mult_Av(v: &[f64], out: &mut [f64]) {
+    parallel(out, |&: start, out| mult(v, out, start, |i, j| A(i, j)));
 }
 
-fn mult_Av(v: Arc<RWLock<Vec<f64>>>, out: Arc<RWLock<Vec<f64>>>) {
-    mult(v, out, mult_Av_impl);
+fn mult_Atv(v: &[f64], out: &mut [f64]) {
+    parallel(out, |&: start, out| mult(v, out, start, |i, j| A(j, i)));
 }
 
-fn mult_Atv_impl(v: &Vec<f64> , i: uint) -> f64 {
-    let mut sum = 0.;
-    for (j, &v_j) in v.iter().enumerate() {
-        sum += v_j / A(j, i);
+fn mult(v: &[f64], out: &mut [f64], start: uint, a: |uint, uint| -> f64) {
+    for (i, slot) in out.iter_mut().enumerate().map(|(i, s)| (i + start, s)) {
+        let mut sum = f64x2(0.0, 0.0);
+        for (j, chunk) in v.chunks(2).enumerate().map(|(j, s)| (2 * j, s)) {
+            let top = f64x2(chunk[0], chunk[1]);
+            let bot = f64x2(a(i, j), a(i, j + 1));
+            sum += top / bot;
+        }
+        let f64x2(a, b) = sum;
+        *slot = a + b;
     }
-    sum
 }
 
-fn mult_Atv(v: Arc<RWLock<Vec<f64>>>, out: Arc<RWLock<Vec<f64>>>) {
-    mult(v, out, mult_Atv_impl);
+fn A(i: uint, j: uint) -> f64 {
+    ((i + j) * (i + j + 1) / 2 + i + 1) as f64
 }
 
-fn mult_AtAv(v: Arc<RWLock<Vec<f64>>>, out: Arc<RWLock<Vec<f64>>>,
-             tmp: Arc<RWLock<Vec<f64>>>) {
-    mult_Av(v, tmp.clone());
-    mult_Atv(tmp, out);
+fn dot(v: &[f64], u: &[f64]) -> f64 {
+    v.iter().zip(u.iter()).map(|(a, b)| *a * *b).sum()
 }
 
-fn main() {
-    let args = os::args();
-    let args = args.as_slice();
-    let n = if os::getenv("RUST_BENCH").is_some() {
-        5500
-    } else if args.len() < 2 {
-        2000
-    } else {
-        FromStr::from_str(args[1].as_slice()).unwrap()
-    };
-    let u = Arc::new(RWLock::new(Vec::from_elem(n, 1f64)));
-    let v = Arc::new(RWLock::new(Vec::from_elem(n, 1f64)));
-    let tmp = Arc::new(RWLock::new(Vec::from_elem(n, 1f64)));
-    for _ in range(0u8, 10) {
-        mult_AtAv(u.clone(), v.clone(), tmp.clone());
-        mult_AtAv(v.clone(), u.clone(), tmp.clone());
-    }
+// Executes a closure in parallel over the given mutable slice. The closure `f`
+// is run in parallel and yielded the starting index within `v` as well as a
+// sub-slice of `v`.
+fn parallel<'a, T, F>(v: &'a mut [T], f: F)
+                      where T: Send + Sync,
+                            F: Fn(uint, &'a mut [T]) + Sync {
+    let (tx, rx) = channel();
+    let size = v.len() / os::num_cpus() + 1;
 
-    let u = u.read();
-    let v = v.read();
-    println!("{:.9f}", (dot(u.as_slice(), v.as_slice()) /
-                        dot(v.as_slice(), v.as_slice())).sqrt());
+    for (i, chunk) in v.chunks_mut(size).enumerate() {
+        let tx = tx.clone();
+
+        // Need to convert `f` and `chunk` to something that can cross the task
+        // boundary.
+        let f = &f as *const _ as *const uint;
+        let raw = chunk.repr();
+        spawn(proc() {
+            let f = f as *const F;
+            unsafe { (*f)(i * size, mem::transmute(raw)) }
+            drop(tx)
+        });
+    }
+    drop(tx);
+    for () in rx.iter() {}
 }