From ea6c1b721a88482354d8236f6ad25aac41e60e57 Mon Sep 17 00:00:00 2001 From: Ayose Date: Thu, 18 Dec 2014 19:45:30 +0000 Subject: [PATCH 1/5] Benchmark for preprocess function --- src/tokenizer.rs | 17 +++++++++++++++-- 1 file changed, 15 insertions(+), 2 deletions(-) diff --git a/src/tokenizer.rs b/src/tokenizer.rs index a03c925d..4a5e0450 100644 --- a/src/tokenizer.rs +++ b/src/tokenizer.rs @@ -42,8 +42,21 @@ fn preprocess(input: &str) -> String { #[test] fn test_preprocess() { assert!("" == preprocess("").as_slice()); - assert!("Lorem\n\t\u{FFFD}ipusm\ndoror\u{FFFD}\n" == - preprocess("Lorem\r\n\t\x00ipusm\ndoror\u{FFFD}\r").as_slice()); + assert!("Lorem\n\n\t\u{FFFD}ipusm\ndoror\u{FFFD}á\n" == + preprocess("Lorem\r\n\n\t\x00ipusm\ndoror\u{FFFD}á\r").as_slice()); +} + +#[cfg(test)] +mod bench_preprocess { + extern crate test; + + #[bench] + fn bench_preprocess(b: &mut test::Bencher) { + let source = "Lorem\n\t\u{FFFD}ipusm\ndoror\u{FFFD}á\n"; + b.iter(|| { + let _ = super::preprocess(source); + }); + } } From 9b4ec492f252c8075e5b2afb63a54ad0f9dfdc42 Mon Sep 17 00:00:00 2001 From: Ayose Date: Thu, 18 Dec 2014 19:45:58 +0000 Subject: [PATCH 2/5] preprocess function in one pass --- src/tokenizer.rs | 40 ++++++++++++++++++++++++++++++++++++++-- 1 file changed, 38 insertions(+), 2 deletions(-) diff --git a/src/tokenizer.rs b/src/tokenizer.rs index 4a5e0450..f2f7f9a7 100644 --- a/src/tokenizer.rs +++ b/src/tokenizer.rs @@ -34,8 +34,44 @@ impl Iterator for Tokenizer { #[inline] fn preprocess(input: &str) -> String { - // TODO: Is this faster if done in one pass? - input.replace("\r\n", "\n").replace("\r", "\n").replace("\x0C", "\n").replace("\x00", "\u{FFFD}") + let bytes = input.as_bytes(); + let mut result: Vec = Vec::with_capacity(bytes.len()); + let mut last: u8 = 0; + let mut offset: uint = 0; + while offset < bytes.len() { + let byte = bytes[offset]; + match byte { + b'\r' => result.push(b'\n'), + b'\n' if last == 13 => (), + b'\n' => result.push(b'\n'), + b'\x0C' => result.push(b'\n'), + b'\0' => result.push_all("\u{FFFD}".as_bytes()), + _ if byte < 128 => result.push(byte), + _ => { + // Multi-byte character + result.push(byte); + let remaining = bytes.len() - offset; + if remaining >= 3 && byte >= 0xF0 { + result.push(bytes[offset + 1]); + result.push(bytes[offset + 2]); + result.push(bytes[offset + 3]); + offset += 3; + } else if remaining >= 2 && byte >= 0xE0 { + result.push(bytes[offset + 1]); + result.push(bytes[offset + 2]); + offset += 2; + } else if remaining >= 1 && byte >= 0xC0 { + result.push(bytes[offset + 1]); + offset += 1; + } + } + } + + last = byte; + offset += 1; + } + + String::from_utf8(result).unwrap() } From 5f461804876b91eafa8b9cbe946dfd1e08374cd1 Mon Sep 17 00:00:00 2001 From: Ayose Date: Thu, 18 Dec 2014 20:40:19 +0000 Subject: [PATCH 3/5] Reduce match branches --- src/tokenizer.rs | 18 +++++++++++------- 1 file changed, 11 insertions(+), 7 deletions(-) diff --git a/src/tokenizer.rs b/src/tokenizer.rs index f2f7f9a7..7f8bda31 100644 --- a/src/tokenizer.rs +++ b/src/tokenizer.rs @@ -34,6 +34,12 @@ impl Iterator for Tokenizer { #[inline] fn preprocess(input: &str) -> String { + // Replace: + // "\r\n" => "\n" + // "\r" => "\n" + // "\x0C" => "\n" + // "\x00" => "\u{FFFD}" + let bytes = input.as_bytes(); let mut result: Vec = Vec::with_capacity(bytes.len()); let mut last: u8 = 0; @@ -41,13 +47,11 @@ fn preprocess(input: &str) -> String { while offset < bytes.len() { let byte = bytes[offset]; match byte { - b'\r' => result.push(b'\n'), - b'\n' if last == 13 => (), - b'\n' => result.push(b'\n'), - b'\x0C' => result.push(b'\n'), - b'\0' => result.push_all("\u{FFFD}".as_bytes()), - _ if byte < 128 => result.push(byte), - _ => { + b'\n' if last == b'\r' => (), + b'\r' | b'\n' | b'\x0C' => result.push(b'\n'), + b'\0' => result.push_all("\u{FFFD}".as_bytes()), + _ if byte < 128 => result.push(byte), + _ => { // Multi-byte character result.push(byte); let remaining = bytes.len() - offset; From 0656606194de7f22f6af86118e27ab2b11ff1f75 Mon Sep 17 00:00:00 2001 From: Ayose Date: Thu, 18 Dec 2014 22:36:22 +0000 Subject: [PATCH 4/5] Remove redundant UTF-8 check --- src/tokenizer.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/tokenizer.rs b/src/tokenizer.rs index 7f8bda31..ba730c63 100644 --- a/src/tokenizer.rs +++ b/src/tokenizer.rs @@ -75,7 +75,7 @@ fn preprocess(input: &str) -> String { offset += 1; } - String::from_utf8(result).unwrap() + unsafe { String::from_utf8_unchecked(result) } } From 2f08b15c621e77e6446b14b2a3134c07b012a408 Mon Sep 17 00:00:00 2001 From: Ayose Date: Thu, 18 Dec 2014 22:43:03 +0000 Subject: [PATCH 5/5] Simplify match block removing code for multi-byte chars --- src/tokenizer.rs | 35 +++++++---------------------------- 1 file changed, 7 insertions(+), 28 deletions(-) diff --git a/src/tokenizer.rs b/src/tokenizer.rs index ba730c63..519d8603 100644 --- a/src/tokenizer.rs +++ b/src/tokenizer.rs @@ -43,36 +43,15 @@ fn preprocess(input: &str) -> String { let bytes = input.as_bytes(); let mut result: Vec = Vec::with_capacity(bytes.len()); let mut last: u8 = 0; - let mut offset: uint = 0; - while offset < bytes.len() { - let byte = bytes[offset]; - match byte { - b'\n' if last == b'\r' => (), - b'\r' | b'\n' | b'\x0C' => result.push(b'\n'), - b'\0' => result.push_all("\u{FFFD}".as_bytes()), - _ if byte < 128 => result.push(byte), - _ => { - // Multi-byte character - result.push(byte); - let remaining = bytes.len() - offset; - if remaining >= 3 && byte >= 0xF0 { - result.push(bytes[offset + 1]); - result.push(bytes[offset + 2]); - result.push(bytes[offset + 3]); - offset += 3; - } else if remaining >= 2 && byte >= 0xE0 { - result.push(bytes[offset + 1]); - result.push(bytes[offset + 2]); - offset += 2; - } else if remaining >= 1 && byte >= 0xC0 { - result.push(bytes[offset + 1]); - offset += 1; - } - } + for byte in bytes.iter() { + match *byte { + b'\n' if last == b'\r' => (), + b'\r' | b'\x0C' => result.push(b'\n'), + b'\0' => result.push_all("\u{FFFD}".as_bytes()), + _ => result.push(*byte), } - last = byte; - offset += 1; + last = *byte; } unsafe { String::from_utf8_unchecked(result) }