Skip to content

Improve shootout-reverse-complement #18143

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Closed
wants to merge 2 commits into from
Closed
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
339 changes: 256 additions & 83 deletions src/test/bench/shootout-reverse-complement.rs
Original file line number Diff line number Diff line change
Expand Up @@ -38,113 +38,286 @@
// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
// OF THE POSSIBILITY OF SUCH DAMAGE.

// ignore-pretty very bad with line comments
// ignore-android doesn't terminate?

#![feature(slicing_syntax)]
#![feature(slicing_syntax, asm, if_let, tuple_indexing)]

use std::iter::range_step;
use std::io::{stdin, stdout, File};
extern crate libc;

static LINE_LEN: uint = 60;
use std::io::stdio::{stdin_raw, stdout_raw};
use std::sync::{Future};
use std::num::{div_rem};
use std::ptr::{copy_memory};
use std::io::{IoResult, EndOfFile};
use std::slice::raw::{mut_buf_as_slice};

fn make_complements() -> [u8, ..256] {
let transforms = [
('A', 'T'), ('C', 'G'), ('G', 'C'), ('T', 'A'),
('U', 'A'), ('M', 'K'), ('R', 'Y'), ('W', 'W'),
('S', 'S'), ('Y', 'R'), ('K', 'M'), ('V', 'B'),
('H', 'D'), ('D', 'H'), ('B', 'V'), ('N', 'N'),
('\n', '\n')];
let mut complements: [u8, ..256] = [0, ..256];
for (i, c) in complements.iter_mut().enumerate() {
*c = i as u8;
use shared_memory::{SharedMemory};

mod tables {
use std::sync::{Once, ONCE_INIT};

/// Lookup tables.
static mut CPL16: [u16, ..1 << 16] = [0, ..1 << 16];
static mut CPL8: [u8, ..1 << 8] = [0, ..1 << 8];

/// Generates the tables.
pub fn get() -> Tables {
/// To make sure we initialize the tables only once.
static INIT: Once = ONCE_INIT;
INIT.doit(|| {
unsafe {
for i in range(0, 1 << 8) {
CPL8[i] = match i as u8 {
b'A' | b'a' => b'T',
b'C' | b'c' => b'G',
b'G' | b'g' => b'C',
b'T' | b't' => b'A',
b'U' | b'u' => b'A',
b'M' | b'm' => b'K',
b'R' | b'r' => b'Y',
b'W' | b'w' => b'W',
b'S' | b's' => b'S',
b'Y' | b'y' => b'R',
b'K' | b'k' => b'M',
b'V' | b'v' => b'B',
b'H' | b'h' => b'D',
b'D' | b'd' => b'H',
b'B' | b'b' => b'V',
b'N' | b'n' => b'N',
i => i,
};
}

for (i, v) in CPL16.iter_mut().enumerate() {
*v = *CPL8.unsafe_get(i & 255) as u16 << 8 |
*CPL8.unsafe_get(i >> 8) as u16;
}
}
});
Tables { _dummy: () }
}
let lower = 'A' as u8 - 'a' as u8;
for &(from, to) in transforms.iter() {
complements[from as uint] = to as u8;
complements[(from as u8 - lower) as uint] = to as u8;

/// Accessor for the static arrays.
///
/// To make sure that the tables can't be accessed without having been initialized.
pub struct Tables {
_dummy: ()
}

impl Tables {
/// Retreives the complement for `i`.
pub fn cpl8(self, i: u8) -> u8 {
// Not really unsafe.
unsafe { CPL8[i as uint] }
}

/// Retreives the complement for `i`.
pub fn cpl16(self, i: u16) -> u16 {
unsafe { CPL16[i as uint] }
}
}
complements
}

fn main() {
let complements = make_complements();
let data = if std::os::getenv("RUST_BENCH").is_some() {
File::open(&Path::new("shootout-k-nucleotide.data")).read_to_end()
} else {
stdin().read_to_end()
};
let mut data = data.unwrap();
mod shared_memory {
use std::sync::{Arc};
use std::mem::{transmute};
use std::raw::{Slice};

for seq in data.as_mut_slice().split_mut(|c| *c == '>' as u8) {
// skip header and last \n
let begin = match seq.iter().position(|c| *c == '\n' as u8) {
None => continue,
Some(c) => c
};
let len = seq.len();
let seq = seq[mut begin+1..len-1];

// arrange line breaks
let len = seq.len();
let off = LINE_LEN - len % (LINE_LEN + 1);
for i in range_step(LINE_LEN, len, LINE_LEN + 1) {
for j in std::iter::count(i, -1).take(off) {
seq[j] = seq[j - 1];
/// Structure for sharing disjoint parts of a vector mutably across tasks.
pub struct SharedMemory {
ptr: Arc<Vec<u8>>,
start: uint,
len: uint,
}

impl SharedMemory {
pub fn new(ptr: Vec<u8>) -> SharedMemory {
let len = ptr.len();
SharedMemory {
ptr: Arc::new(ptr),
start: 0,
len: len,
}
seq[i - off] = '\n' as u8;
}

// reverse complement, as
// seq.reverse(); for c in seq.iter_mut() { *c = complements[*c] }
// but faster:
for (front, back) in two_side_iter(seq) {
let tmp = complements[*front as uint];
*front = complements[*back as uint];
*back = tmp;
pub fn as_mut_slice(&mut self) -> &mut [u8] {
unsafe {
transmute(Slice {
data: self.ptr.as_ptr().offset(self.start as int) as *const u8,
len: self.len,
})
}
}
if seq.len() % 2 == 1 {
let middle = &mut seq[seq.len() / 2];
*middle = complements[*middle as uint];

pub fn len(&self) -> uint {
self.len
}
}

stdout().write(data.as_slice()).unwrap();
pub fn split_at(self, mid: uint) -> (SharedMemory, SharedMemory) {
assert!(mid <= self.len);
let left = SharedMemory {
ptr: self.ptr.clone(),
start: self.start,
len: mid,
};
let right = SharedMemory {
ptr: self.ptr,
start: self.start + mid,
len: self.len - mid,
};
(left, right)
}

/// Resets the object so that it covers the whole range of the contained vector.
///
/// You must not call this method if `self` is not the only reference to the
/// shared memory.
///
/// FIXME: If `Arc` had a method to check if the reference is unique, then we
/// wouldn't need the `unsafe` here.
///
/// FIXME: If `Arc` had a method to unwrap the contained value, then we could
/// simply unwrap here.
pub unsafe fn reset(self) -> SharedMemory {
let len = self.ptr.len();
SharedMemory {
ptr: self.ptr,
start: 0,
len: len,
}
}
}
}

pub struct TwoSideIter<'a, T: 'a> {
first: *mut T,
last: *mut T,
marker: std::kinds::marker::ContravariantLifetime<'a>,
marker2: std::kinds::marker::NoCopy

/// Reads all remaining bytes from the stream.
fn read_to_end<R: Reader>(r: &mut R) -> IoResult<Vec<u8>> {
const CHUNK: uint = 64 * 1024;

let mut vec = Vec::with_capacity(1024 * 1024);
loop {
if vec.capacity() - vec.len() < CHUNK {
let cap = vec.capacity();
let mult = if cap < 256 * 1024 * 1024 {
// FIXME (mahkoh): Temporary workaround for jemalloc on linux. Replace
// this by 2x once the jemalloc preformance issue has been fixed.
16
} else {
2
};
vec.reserve_exact(mult * cap);
}
unsafe {
let ptr = vec.as_mut_ptr().offset(vec.len() as int);
match mut_buf_as_slice(ptr, CHUNK, |s| r.read(s)) {
Ok(n) => {
let len = vec.len();
vec.set_len(len + n);
},
Err(ref e) if e.kind == EndOfFile => break,
Err(e) => return Err(e),
}
}
}
Ok(vec)
}

pub fn two_side_iter<'a, T>(slice: &'a mut [T]) -> TwoSideIter<'a, T> {
let len = slice.len();
let first = slice.as_mut_ptr();
let last = if len == 0 {
first
} else {
unsafe { first.offset(len as int - 1) }
/// Finds the first position at which `b` occurs in `s`.
fn memchr(h: &[u8], n: u8) -> Option<uint> {
use libc::{c_void, c_int, size_t};
extern {
fn memchr(h: *const c_void, n: c_int, s: size_t) -> *mut c_void;
}
let res = unsafe {
memchr(h.as_ptr() as *const c_void, n as c_int, h.len() as size_t)
};

TwoSideIter {
first: first,
last: last,
marker: std::kinds::marker::ContravariantLifetime,
marker2: std::kinds::marker::NoCopy
if res.is_null() {
None
} else {
Some(res as uint - h.as_ptr() as uint)
}
}

impl<'a, T> Iterator<(&'a mut T, &'a mut T)> for TwoSideIter<'a, T> {
fn next(&mut self) -> Option<(&'a mut T, &'a mut T)> {
if self.first < self.last {
let result = unsafe { (&mut *self.first, &mut *self.last) };
self.first = unsafe { self.first.offset(1) };
self.last = unsafe { self.last.offset(-1) };
Some(result)
} else {
None
/// Length of a normal line without the terminating \n.
const LINE_LEN: uint = 60;

/// Compute the reverse complement.
fn reverse_complement(mut view: SharedMemory, tables: tables::Tables) {
// Drop the last newline
let seq = view.as_mut_slice().init_mut();
let len = seq.len();
let off = LINE_LEN - len % (LINE_LEN + 1);
let mut i = LINE_LEN;
while i < len {
unsafe {
copy_memory(seq.as_mut_ptr().offset((i - off + 1) as int),
seq.as_ptr().offset((i - off) as int), off);
*seq.unsafe_mut(i - off) = b'\n';
}
i += LINE_LEN + 1;
}

let (div, rem) = div_rem(len, 4);
unsafe {
let mut left = seq.as_mut_ptr() as *mut u16;
// This is slow if len % 2 != 0 but still faster than bytewise operations.
let mut right = seq.as_mut_ptr().offset(len as int - 2) as *mut u16;
let end = left.offset(div as int);
while left != end {
let tmp = tables.cpl16(*left);
*left = tables.cpl16(*right);
*right = tmp;
left = left.offset(1);
right = right.offset(-1);
}

let end = end as *mut u8;
match rem {
1 => *end = tables.cpl8(*end),
2 => {
let tmp = tables.cpl8(*end);
*end = tables.cpl8(*end.offset(1));
*end.offset(1) = tmp;
},
3 => {
*end.offset(1) = tables.cpl8(*end.offset(1));
let tmp = tables.cpl8(*end);
*end = tables.cpl8(*end.offset(2));
*end.offset(2) = tmp;
},
_ => { },
}
}
}

fn main() {
let mut data = SharedMemory::new(read_to_end(&mut stdin_raw()).unwrap());
let tables = tables::get();

let mut futures = vec!();
loop {
let (_, mut tmp_data) = match memchr(data.as_mut_slice(), b'\n') {
Some(i) => data.split_at(i + 1),
_ => break,
};
let (view, tmp_data) = match memchr(tmp_data.as_mut_slice(), b'>') {
Some(i) => tmp_data.split_at(i),
None => {
let len = tmp_data.len();
tmp_data.split_at(len)
},
};
futures.push(Future::spawn(proc() reverse_complement(view, tables)));
data = tmp_data;
}

for f in futures.iter_mut() {
f.get();
}

// Not actually unsafe. If Arc had a way to check uniqueness then we could do that in
// `reset` and it would tell us that, yes, it is unique at this point.
data = unsafe { data.reset() };

stdout_raw().write(data.as_mut_slice()).unwrap();
}