diff --git a/src/Cargo.lock b/src/Cargo.lock index b74587e566210..a93390552641c 100644 --- a/src/Cargo.lock +++ b/src/Cargo.lock @@ -2779,6 +2779,7 @@ name = "syntax_pos" version = "0.0.0" dependencies = [ "arena 0.0.0", + "cfg-if 0.1.2 (registry+https://github.com/rust-lang/crates.io-index)", "rustc_data_structures 0.0.0", "scoped-tls 0.1.1 (registry+https://github.com/rust-lang/crates.io-index)", "serialize 0.0.0", diff --git a/src/librustc/ich/impls_syntax.rs b/src/librustc/ich/impls_syntax.rs index 935bc4c8c6d8c..a7a6a71474f00 100644 --- a/src/librustc/ich/impls_syntax.rs +++ b/src/librustc/ich/impls_syntax.rs @@ -455,27 +455,21 @@ impl<'a> HashStable> for FileMap { src_hash.hash_stable(hcx, hasher); // We only hash the relative position within this filemap - lines.with_lock(|lines| { - lines.len().hash_stable(hcx, hasher); - for &line in lines.iter() { - stable_byte_pos(line, start_pos).hash_stable(hcx, hasher); - } - }); + lines.len().hash_stable(hcx, hasher); + for &line in lines.iter() { + stable_byte_pos(line, start_pos).hash_stable(hcx, hasher); + } // We only hash the relative position within this filemap - multibyte_chars.with_lock(|multibyte_chars| { - multibyte_chars.len().hash_stable(hcx, hasher); - for &char_pos in multibyte_chars.iter() { - stable_multibyte_char(char_pos, start_pos).hash_stable(hcx, hasher); - } - }); + multibyte_chars.len().hash_stable(hcx, hasher); + for &char_pos in multibyte_chars.iter() { + stable_multibyte_char(char_pos, start_pos).hash_stable(hcx, hasher); + } - non_narrow_chars.with_lock(|non_narrow_chars| { - non_narrow_chars.len().hash_stable(hcx, hasher); - for &char_pos in non_narrow_chars.iter() { - stable_non_narrow_char(char_pos, start_pos).hash_stable(hcx, hasher); - } - }); + non_narrow_chars.len().hash_stable(hcx, hasher); + for &char_pos in non_narrow_chars.iter() { + stable_non_narrow_char(char_pos, start_pos).hash_stable(hcx, hasher); + } } } diff --git a/src/librustc/ty/query/on_disk_cache.rs b/src/librustc/ty/query/on_disk_cache.rs index 3285380c82392..7aa6f3a55ad74 100644 --- a/src/librustc/ty/query/on_disk_cache.rs +++ b/src/librustc/ty/query/on_disk_cache.rs @@ -623,7 +623,7 @@ impl<'a, 'tcx, 'x> SpecializedDecoder for CacheDecoder<'a, 'tcx, 'x> { let len = BytePos::decode(self)?; let file_lo = self.file_index_to_file(file_lo_index); - let lo = file_lo.lines.borrow()[line_lo - 1] + col_lo; + let lo = file_lo.lines[line_lo - 1] + col_lo; let hi = lo + len; let expn_info_tag = u8::decode(self)?; diff --git a/src/librustc_metadata/decoder.rs b/src/librustc_metadata/decoder.rs index 9e4f695d28fd4..a01e0b608646d 100644 --- a/src/librustc_metadata/decoder.rs +++ b/src/librustc_metadata/decoder.rs @@ -1138,9 +1138,9 @@ impl<'a, 'tcx> CrateMetadata { src_hash, start_pos, end_pos, - lines, - multibyte_chars, - non_narrow_chars, + mut lines, + mut multibyte_chars, + mut non_narrow_chars, name_hash, .. } = filemap_to_import; @@ -1151,15 +1151,12 @@ impl<'a, 'tcx> CrateMetadata { // `CodeMap::new_imported_filemap()` will then translate those // coordinates to their new global frame of reference when the // offset of the FileMap is known. 
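Reviewer note: the loops that follow subtract `start_pos` from every recorded position; since `lines`, `multibyte_chars`, and `non_narrow_chars` are now plain `Vec`s (destructured as `mut` above), the `into_inner()` calls can be dropped. A minimal, self-contained sketch of that rebasing, using simplified stand-in types rather than rustc's actual items:

```rust
// Sketch (not rustc's types): rebase absolute positions so they become
// relative to the FileMap's start_pos, as the loops in the hunk below do.
#[derive(Copy, Clone, Debug, PartialEq)]
struct BytePos(u32);

#[derive(Copy, Clone, Debug, PartialEq)]
struct MultiByteChar {
    pos: BytePos,
    bytes: u8, // UTF-8 length of the character
}

fn rebase(lines: &mut [BytePos], multibyte_chars: &mut [MultiByteChar], start_pos: BytePos) {
    // Every recorded position is >= start_pos, so the subtraction cannot underflow.
    for pos in lines.iter_mut() {
        pos.0 -= start_pos.0;
    }
    for mbc in multibyte_chars.iter_mut() {
        mbc.pos.0 -= start_pos.0;
    }
}

fn main() {
    let start_pos = BytePos(100);
    let mut lines = [BytePos(100), BytePos(112)];
    let mut mbcs = [MultiByteChar { pos: BytePos(103), bytes: 3 }];
    rebase(&mut lines, &mut mbcs, start_pos);
    assert_eq!(lines, [BytePos(0), BytePos(12)]);
    assert_eq!(mbcs[0].pos, BytePos(3));
}
```

As the comment above notes, `CodeMap::new_imported_filemap()` later translates these relative offsets back into the importing crate's frame of reference.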
- let mut lines = lines.into_inner(); for pos in &mut lines { *pos = *pos - start_pos; } - let mut multibyte_chars = multibyte_chars.into_inner(); for mbc in &mut multibyte_chars { mbc.pos = mbc.pos - start_pos; } - let mut non_narrow_chars = non_narrow_chars.into_inner(); for swc in &mut non_narrow_chars { *swc = *swc - start_pos; } diff --git a/src/libsyntax/codemap.rs b/src/libsyntax/codemap.rs index 8e4b7660a1cce..1d5429bdf8f7d 100644 --- a/src/libsyntax/codemap.rs +++ b/src/libsyntax/codemap.rs @@ -211,8 +211,7 @@ impl CodeMap { } } - /// Creates a new filemap without setting its line information. If you don't - /// intend to set the line information yourself, you should use new_filemap_and_lines. + /// Creates a new filemap. /// This does not ensure that only one FileMap exists per file name. pub fn new_filemap(&self, filename: FileName, src: String) -> Lrc { let start_pos = self.next_start_pos(); @@ -247,22 +246,6 @@ impl CodeMap { filemap } - /// Creates a new filemap and sets its line information. - /// This does not ensure that only one FileMap exists per file name. - pub fn new_filemap_and_lines(&self, filename: &Path, src: &str) -> Lrc { - let fm = self.new_filemap(filename.to_owned().into(), src.to_owned()); - let mut byte_pos: u32 = fm.start_pos.0; - for line in src.lines() { - // register the start of this line - fm.next_line(BytePos(byte_pos)); - - // update byte_pos to include this line and the \n at the end - byte_pos += line.len() as u32 + 1; - } - fm - } - - /// Allocates a new FileMap representing a source file from an external /// crate. The source code of such an "imported filemap" is not available, /// but we still know enough to generate accurate debuginfo location @@ -305,9 +288,9 @@ impl CodeMap { external_src: Lock::new(ExternalSource::AbsentOk), start_pos, end_pos, - lines: Lock::new(file_local_lines), - multibyte_chars: Lock::new(file_local_multibyte_chars), - non_narrow_chars: Lock::new(file_local_non_narrow_chars), + lines: file_local_lines, + multibyte_chars: file_local_multibyte_chars, + non_narrow_chars: file_local_non_narrow_chars, name_hash, }); @@ -345,21 +328,22 @@ impl CodeMap { match self.lookup_line(pos) { Ok(FileMapAndLine { fm: f, line: a }) => { let line = a + 1; // Line numbers start at 1 - let linebpos = (*f.lines.borrow())[a]; + let linebpos = f.lines[a]; let linechpos = self.bytepos_to_file_charpos(linebpos); let col = chpos - linechpos; let col_display = { - let non_narrow_chars = f.non_narrow_chars.borrow(); - let start_width_idx = non_narrow_chars + let start_width_idx = f + .non_narrow_chars .binary_search_by_key(&linebpos, |x| x.pos()) .unwrap_or_else(|x| x); - let end_width_idx = non_narrow_chars + let end_width_idx = f + .non_narrow_chars .binary_search_by_key(&pos, |x| x.pos()) .unwrap_or_else(|x| x); let special_chars = end_width_idx - start_width_idx; - let non_narrow: usize = - non_narrow_chars[start_width_idx..end_width_idx] + let non_narrow: usize = f + .non_narrow_chars[start_width_idx..end_width_idx] .into_iter() .map(|x| x.width()) .sum(); @@ -380,12 +364,12 @@ impl CodeMap { } Err(f) => { let col_display = { - let non_narrow_chars = f.non_narrow_chars.borrow(); - let end_width_idx = non_narrow_chars + let end_width_idx = f + .non_narrow_chars .binary_search_by_key(&pos, |x| x.pos()) .unwrap_or_else(|x| x); - let non_narrow: usize = - non_narrow_chars[0..end_width_idx] + let non_narrow: usize = f + .non_narrow_chars[0..end_width_idx] .into_iter() .map(|x| x.width()) .sum(); @@ -830,22 +814,22 @@ impl CodeMap { // The number 
of extra bytes due to multibyte chars in the FileMap let mut total_extra_bytes = 0; - for mbc in map.multibyte_chars.borrow().iter() { + for mbc in map.multibyte_chars.iter() { debug!("{}-byte char at {:?}", mbc.bytes, mbc.pos); if mbc.pos < bpos { // every character is at least one byte, so we only // count the actual extra bytes. - total_extra_bytes += mbc.bytes - 1; + total_extra_bytes += mbc.bytes as u32 - 1; // We should never see a byte position in the middle of a // character - assert!(bpos.to_usize() >= mbc.pos.to_usize() + mbc.bytes); + assert!(bpos.to_u32() >= mbc.pos.to_u32() + mbc.bytes as u32); } else { break; } } - assert!(map.start_pos.to_usize() + total_extra_bytes <= bpos.to_usize()); - CharPos(bpos.to_usize() - map.start_pos.to_usize() - total_extra_bytes) + assert!(map.start_pos.to_u32() + total_extra_bytes <= bpos.to_u32()); + CharPos(bpos.to_usize() - map.start_pos.to_usize() - total_extra_bytes as usize) } // Return the index of the filemap (in self.files) which contains pos. @@ -1028,51 +1012,16 @@ impl FilePathMapping { #[cfg(test)] mod tests { use super::*; - use std::borrow::Cow; use rustc_data_structures::sync::Lrc; - #[test] - fn t1 () { - let cm = CodeMap::new(FilePathMapping::empty()); - let fm = cm.new_filemap(PathBuf::from("blork.rs").into(), - "first line.\nsecond line".to_string()); - fm.next_line(BytePos(0)); - // Test we can get lines with partial line info. - assert_eq!(fm.get_line(0), Some(Cow::from("first line."))); - // TESTING BROKEN BEHAVIOR: line break declared before actual line break. - fm.next_line(BytePos(10)); - assert_eq!(fm.get_line(1), Some(Cow::from("."))); - fm.next_line(BytePos(12)); - assert_eq!(fm.get_line(2), Some(Cow::from("second line"))); - } - - #[test] - #[should_panic] - fn t2 () { - let cm = CodeMap::new(FilePathMapping::empty()); - let fm = cm.new_filemap(PathBuf::from("blork.rs").into(), - "first line.\nsecond line".to_string()); - // TESTING *REALLY* BROKEN BEHAVIOR: - fm.next_line(BytePos(0)); - fm.next_line(BytePos(10)); - fm.next_line(BytePos(2)); - } - fn init_code_map() -> CodeMap { let cm = CodeMap::new(FilePathMapping::empty()); - let fm1 = cm.new_filemap(PathBuf::from("blork.rs").into(), - "first line.\nsecond line".to_string()); - let fm2 = cm.new_filemap(PathBuf::from("empty.rs").into(), - "".to_string()); - let fm3 = cm.new_filemap(PathBuf::from("blork2.rs").into(), - "first line.\nsecond line".to_string()); - - fm1.next_line(BytePos(0)); - fm1.next_line(BytePos(12)); - fm2.next_line(fm2.start_pos); - fm3.next_line(fm3.start_pos); - fm3.next_line(fm3.start_pos + BytePos(12)); - + cm.new_filemap(PathBuf::from("blork.rs").into(), + "first line.\nsecond line".to_string()); + cm.new_filemap(PathBuf::from("empty.rs").into(), + "".to_string()); + cm.new_filemap(PathBuf::from("blork2.rs").into(), + "first line.\nsecond line".to_string()); cm } @@ -1125,26 +1074,10 @@ mod tests { fn init_code_map_mbc() -> CodeMap { let cm = CodeMap::new(FilePathMapping::empty()); // € is a three byte utf8 char. 
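Reviewer note on the `bytepos_to_file_charpos` hunk above: the arithmetic is unchanged; only the integer widths move to `u32` now that `MultiByteChar::bytes` is a `u8`. A self-contained sketch of that conversion, with simplified stand-ins for the rustc types:

```rust
// Sketch of byte-position -> char-position conversion: every char is at least
// one byte, so only the *extra* bytes of multi-byte chars before `bpos` are
// subtracted, in addition to the FileMap's start offset.
struct MultiByteChar {
    pos: u32,  // absolute byte offset of the character
    bytes: u8, // its UTF-8 length, 2..=4
}

fn byte_to_char_pos(bpos: u32, start_pos: u32, multibyte_chars: &[MultiByteChar]) -> u32 {
    let mut total_extra_bytes = 0u32;
    for mbc in multibyte_chars {
        if mbc.pos < bpos {
            total_extra_bytes += mbc.bytes as u32 - 1;
            // A byte position should never land in the middle of a character.
            assert!(bpos >= mbc.pos + mbc.bytes as u32);
        } else {
            break; // the list is sorted by position
        }
    }
    assert!(start_pos + total_extra_bytes <= bpos);
    bpos - start_pos - total_extra_bytes
}

fn main() {
    // "a€b": '€' is 3 bytes starting at offset 1, so byte 4 ('b') is char 2.
    let mbcs = [MultiByteChar { pos: 1, bytes: 3 }];
    assert_eq!(byte_to_char_pos(4, 0, &mbcs), 2);
}
```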
- let fm1 = - cm.new_filemap(PathBuf::from("blork.rs").into(), - "fir€st €€€€ line.\nsecond line".to_string()); - let fm2 = cm.new_filemap(PathBuf::from("blork2.rs").into(), - "first line€€.\n€ second line".to_string()); - - fm1.next_line(BytePos(0)); - fm1.next_line(BytePos(28)); - fm2.next_line(fm2.start_pos); - fm2.next_line(fm2.start_pos + BytePos(20)); - - fm1.record_multibyte_char(BytePos(3), 3); - fm1.record_multibyte_char(BytePos(9), 3); - fm1.record_multibyte_char(BytePos(12), 3); - fm1.record_multibyte_char(BytePos(15), 3); - fm1.record_multibyte_char(BytePos(18), 3); - fm2.record_multibyte_char(fm2.start_pos + BytePos(10), 3); - fm2.record_multibyte_char(fm2.start_pos + BytePos(13), 3); - fm2.record_multibyte_char(fm2.start_pos + BytePos(18), 3); - + cm.new_filemap(PathBuf::from("blork.rs").into(), + "fir€st €€€€ line.\nsecond line".to_string()); + cm.new_filemap(PathBuf::from("blork2.rs").into(), + "first line€€.\n€ second line".to_string()); cm } @@ -1196,7 +1129,7 @@ mod tests { let cm = CodeMap::new(FilePathMapping::empty()); let inputtext = "aaaaa\nbbbbBB\nCCC\nDDDDDddddd\neee\n"; let selection = " \n ~~\n~~~\n~~~~~ \n \n"; - cm.new_filemap_and_lines(Path::new("blork.rs"), inputtext); + cm.new_filemap(Path::new("blork.rs").to_owned().into(), inputtext.to_string()); let span = span_from_selection(inputtext, selection); // check that we are extracting the text we thought we were extracting @@ -1239,7 +1172,7 @@ mod tests { let inputtext = "bbbb BB\ncc CCC\n"; let selection1 = " ~~\n \n"; let selection2 = " \n ~~~\n"; - cm.new_filemap_and_lines(Path::new("blork.rs"), inputtext); + cm.new_filemap(Path::new("blork.rs").to_owned().into(), inputtext.to_owned()); let span1 = span_from_selection(inputtext, selection1); let span2 = span_from_selection(inputtext, selection2); diff --git a/src/libsyntax/ext/expand.rs b/src/libsyntax/ext/expand.rs index 69c99c63aafe3..094e572693cca 100644 --- a/src/libsyntax/ext/expand.rs +++ b/src/libsyntax/ext/expand.rs @@ -1487,9 +1487,11 @@ impl<'a, 'b> Folder for InvocationCollector<'a, 'b> { match String::from_utf8(buf) { Ok(src) => { + let src_interned = Symbol::intern(&src); + // Add this input file to the code map to make it available as // dependency information - self.cx.codemap().new_filemap_and_lines(&filename, &src); + self.cx.codemap().new_filemap(filename.into(), src); let include_info = vec![ dummy_spanned(ast::NestedMetaItemKind::MetaItem( @@ -1497,7 +1499,7 @@ impl<'a, 'b> Folder for InvocationCollector<'a, 'b> { dummy_spanned(file)))), dummy_spanned(ast::NestedMetaItemKind::MetaItem( attr::mk_name_value_item_str(Ident::from_str("contents"), - dummy_spanned(Symbol::intern(&src))))), + dummy_spanned(src_interned)))), ]; let include_ident = Ident::from_str("include"); diff --git a/src/libsyntax/ext/source_util.rs b/src/libsyntax/ext/source_util.rs index d6dce63ea5e4b..669536f519ce3 100644 --- a/src/libsyntax/ext/source_util.rs +++ b/src/libsyntax/ext/source_util.rs @@ -150,11 +150,13 @@ pub fn expand_include_str(cx: &mut ExtCtxt, sp: Span, tts: &[tokenstream::TokenT }; match String::from_utf8(bytes) { Ok(src) => { + let interned_src = Symbol::intern(&src); + // Add this input file to the code map to make it available as // dependency information - cx.codemap().new_filemap_and_lines(&file, &src); + cx.codemap().new_filemap(file.into(), src); - base::MacEager::expr(cx.expr_str(sp, Symbol::intern(&src))) + base::MacEager::expr(cx.expr_str(sp, interned_src)) } Err(_) => { cx.span_err(sp, @@ -182,7 +184,7 @@ pub fn expand_include_bytes(cx: 
&mut ExtCtxt, sp: Span, tts: &[tokenstream::Toke Ok(..) => { // Add this input file to the code map to make it available as // dependency information, but don't enter it's contents - cx.codemap().new_filemap_and_lines(&file, ""); + cx.codemap().new_filemap(file.into(), "".to_string()); base::MacEager::expr(cx.expr_lit(sp, ast::LitKind::ByteStr(Lrc::new(bytes)))) } diff --git a/src/libsyntax/parse/lexer/comments.rs b/src/libsyntax/parse/lexer/comments.rs index 7da0d816d0f7a..3995a9b8689e7 100644 --- a/src/libsyntax/parse/lexer/comments.rs +++ b/src/libsyntax/parse/lexer/comments.rs @@ -240,9 +240,11 @@ fn read_block_comment(rdr: &mut StringReader, let mut lines: Vec = Vec::new(); // Count the number of chars since the start of the line by rescanning. - let mut src_index = rdr.src_index(rdr.filemap.line_begin_pos()); + let mut src_index = rdr.src_index(rdr.filemap.line_begin_pos(rdr.pos)); let end_src_index = rdr.src_index(rdr.pos); - assert!(src_index <= end_src_index); + assert!(src_index <= end_src_index, + "src_index={}, end_src_index={}, line_begin_pos={}", + src_index, end_src_index, rdr.filemap.line_begin_pos(rdr.pos).to_u32()); let mut n = 0; while src_index < end_src_index { let c = char_at(&rdr.src, src_index); diff --git a/src/libsyntax/parse/lexer/mod.rs b/src/libsyntax/parse/lexer/mod.rs index c09cfd910d207..dcc71e7877852 100644 --- a/src/libsyntax/parse/lexer/mod.rs +++ b/src/libsyntax/parse/lexer/mod.rs @@ -51,11 +51,7 @@ pub struct StringReader<'a> { pub ch: Option, pub filemap: Lrc, /// Stop reading src at this index. - end_src_index: usize, - /// Whether to record new-lines and multibyte chars in filemap. - /// This is only necessary the first time a filemap is lexed. - /// If part of a filemap is being re-lexed, this should be set to false. - save_new_lines_and_multibyte: bool, + pub end_src_index: usize, // cached: peek_tok: token::Token, peek_span: Span, @@ -188,7 +184,6 @@ impl<'a> StringReader<'a> { ch: Some('\n'), filemap, end_src_index: src.len(), - save_new_lines_and_multibyte: true, // dummy values; not read peek_tok: token::Eof, peek_span: syntax_pos::DUMMY_SP, @@ -225,7 +220,6 @@ impl<'a> StringReader<'a> { let mut sr = StringReader::new_raw_internal(sess, begin.fm, None); // Seek the lexer to the right byte range. 
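Reviewer note: with line starts computed eagerly, `line_begin_pos` can no longer mean "the most recently pushed line start"; it now takes a position and finds the containing line via `lookup_line` (the new implementation appears in the libsyntax_pos changes further down). A minimal sketch of that lookup, assuming a sorted slice of line-start offsets and using a simplified stand-in for the `lookup_line` helper:

```rust
// Sketch: with the full, sorted table of line-start offsets available up
// front, the beginning of the line containing `pos` is a binary search away.
fn lookup_line(lines: &[u32], pos: u32) -> Option<usize> {
    if lines.is_empty() || pos < lines[0] {
        return None;
    }
    // Index of the last line start <= pos.
    match lines.binary_search(&pos) {
        Ok(i) => Some(i),
        Err(i) => Some(i - 1),
    }
}

fn line_begin_pos(lines: &[u32], pos: u32) -> u32 {
    lines[lookup_line(lines, pos).expect("pos is inside the file")]
}

fn main() {
    // Line starts for "ab\ncd\nef" are at 0, 3 and 6.
    let lines = [0, 3, 6];
    assert_eq!(line_begin_pos(&lines, 4), 3); // 'd' is on the second line
    assert_eq!(lookup_line(&lines, 0), Some(0));
    assert_eq!(lookup_line(&lines, 7), Some(2));
}
```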
- sr.save_new_lines_and_multibyte = false; sr.next_pos = span.lo(); sr.end_src_index = sr.src_index(span.hi()); @@ -458,18 +452,6 @@ impl<'a> StringReader<'a> { let next_ch = char_at(&self.src, next_src_index); let next_ch_len = next_ch.len_utf8(); - if self.ch.unwrap() == '\n' { - if self.save_new_lines_and_multibyte { - self.filemap.next_line(self.next_pos); - } - } - if next_ch_len > 1 { - if self.save_new_lines_and_multibyte { - self.filemap.record_multibyte_char(self.next_pos, next_ch_len); - } - } - self.filemap.record_width(self.next_pos, next_ch); - self.ch = Some(next_ch); self.pos = self.next_pos; self.next_pos = self.next_pos + Pos::from_usize(next_ch_len); diff --git a/src/libsyntax/test_snippet.rs b/src/libsyntax/test_snippet.rs index 81dcc1998edd1..c7e4fbd1073d7 100644 --- a/src/libsyntax/test_snippet.rs +++ b/src/libsyntax/test_snippet.rs @@ -51,7 +51,7 @@ fn test_harness(file_text: &str, span_labels: Vec, expected_output: & let output = Arc::new(Mutex::new(Vec::new())); let code_map = Lrc::new(CodeMap::new(FilePathMapping::empty())); - code_map.new_filemap_and_lines(Path::new("test.rs"), &file_text); + code_map.new_filemap(Path::new("test.rs").to_owned().into(), file_text.to_owned()); let primary_span = make_span(&file_text, &span_labels[0].start, &span_labels[0].end); let mut msp = MultiSpan::from_span(primary_span); diff --git a/src/libsyntax_pos/Cargo.toml b/src/libsyntax_pos/Cargo.toml index a9147b394f7a4..08ee2e0f37626 100644 --- a/src/libsyntax_pos/Cargo.toml +++ b/src/libsyntax_pos/Cargo.toml @@ -14,3 +14,4 @@ rustc_data_structures = { path = "../librustc_data_structures" } arena = { path = "../libarena" } scoped-tls = { version = "0.1.1", features = ["nightly"] } unicode-width = "0.1.4" +cfg-if = "0.1.2" diff --git a/src/libsyntax_pos/analyze_filemap.rs b/src/libsyntax_pos/analyze_filemap.rs new file mode 100644 index 0000000000000..c7c0263e45932 --- /dev/null +++ b/src/libsyntax_pos/analyze_filemap.rs @@ -0,0 +1,436 @@ +// Copyright 2018 The Rust Project Developers. See the COPYRIGHT +// file at the top-level directory of this distribution and at +// http://rust-lang.org/COPYRIGHT. +// +// Licensed under the Apache License, Version 2.0 or the MIT license +// , at your +// option. This file may not be copied, modified, or distributed +// except according to those terms. + +use unicode_width::UnicodeWidthChar; +use super::*; + +/// Find all newlines, multi-byte characters, and non-narrow characters in a +/// FileMap. +/// +/// This function will use an SSE2 enhanced implementation if hardware support +/// is detected at runtime. +pub fn analyze_filemap( + src: &str, + filemap_start_pos: BytePos) + -> (Vec, Vec, Vec) +{ + let mut lines = vec![filemap_start_pos]; + let mut multi_byte_chars = vec![]; + let mut non_narrow_chars = vec![]; + + // Calls the right implementation, depending on hardware support available. + analyze_filemap_dispatch(src, + filemap_start_pos, + &mut lines, + &mut multi_byte_chars, + &mut non_narrow_chars); + + // The code above optimistically registers a new line *after* each \n + // it encounters. If that point is already outside the filemap, remove + // it again. + if let Some(&last_line_start) = lines.last() { + let file_map_end = filemap_start_pos + BytePos::from_usize(src.len()); + assert!(file_map_end >= last_line_start); + if last_line_start == file_map_end { + lines.pop(); + } + } + + (lines, multi_byte_chars, non_narrow_chars) +} + +cfg_if! 
{ + if #[cfg(all(any(target_arch = "x86", target_arch = "x86_64"), + not(stage0)))] { + fn analyze_filemap_dispatch(src: &str, + filemap_start_pos: BytePos, + lines: &mut Vec, + multi_byte_chars: &mut Vec, + non_narrow_chars: &mut Vec) { + if is_x86_feature_detected!("sse2") { + unsafe { + analyze_filemap_sse2(src, + filemap_start_pos, + lines, + multi_byte_chars, + non_narrow_chars); + } + } else { + analyze_filemap_generic(src, + src.len(), + filemap_start_pos, + lines, + multi_byte_chars, + non_narrow_chars); + + } + } + + /// Check 16 byte chunks of text at a time. If the chunk contains + /// something other than printable ASCII characters and newlines, the + /// function falls back to the generic implementation. Otherwise it uses + /// SSE2 intrinsics to quickly find all newlines. + #[target_feature(enable = "sse2")] + unsafe fn analyze_filemap_sse2(src: &str, + output_offset: BytePos, + lines: &mut Vec, + multi_byte_chars: &mut Vec, + non_narrow_chars: &mut Vec) { + #[cfg(target_arch = "x86")] + use std::arch::x86::*; + #[cfg(target_arch = "x86_64")] + use std::arch::x86_64::*; + + const CHUNK_SIZE: usize = 16; + + let src_bytes = src.as_bytes(); + + let chunk_count = src.len() / CHUNK_SIZE; + + // This variable keeps track of where we should start decoding a + // chunk. If a multi-byte character spans across chunk boundaries, + // we need to skip that part in the next chunk because we already + // handled it. + let mut intra_chunk_offset = 0; + + for chunk_index in 0 .. chunk_count { + let ptr = src_bytes.as_ptr() as *const __m128i; + // We don't know if the pointer is aligned to 16 bytes, so we + // use `loadu`, which supports unaligned loading. + let chunk = _mm_loadu_si128(ptr.offset(chunk_index as isize)); + + // For character in the chunk, see if its byte value is < 0, which + // indicates that it's part of a UTF-8 char. + let multibyte_test = _mm_cmplt_epi8(chunk, _mm_set1_epi8(0)); + // Create a bit mask from the comparison results. + let multibyte_mask = _mm_movemask_epi8(multibyte_test); + + // If the bit mask is all zero, we only have ASCII chars here: + if multibyte_mask == 0 { + assert!(intra_chunk_offset == 0); + + // Check if there are any control characters in the chunk. All + // control characters that we can encounter at this point have a + // byte value less than 32 or ... + let control_char_test0 = _mm_cmplt_epi8(chunk, _mm_set1_epi8(32)); + let control_char_mask0 = _mm_movemask_epi8(control_char_test0); + + // ... it's the ASCII 'DEL' character with a value of 127. + let control_char_test1 = _mm_cmpeq_epi8(chunk, _mm_set1_epi8(127)); + let control_char_mask1 = _mm_movemask_epi8(control_char_test1); + + let control_char_mask = control_char_mask0 | control_char_mask1; + + if control_char_mask != 0 { + // Check for newlines in the chunk + let newlines_test = _mm_cmpeq_epi8(chunk, _mm_set1_epi8(b'\n' as i8)); + let newlines_mask = _mm_movemask_epi8(newlines_test); + + if control_char_mask == newlines_mask { + // All control characters are newlines, record them + let mut newlines_mask = 0xFFFF0000 | newlines_mask as u32; + let output_offset = output_offset + + BytePos::from_usize(chunk_index * CHUNK_SIZE + 1); + + loop { + let index = newlines_mask.trailing_zeros(); + + if index >= CHUNK_SIZE as u32 { + // We have arrived at the end of the chunk. + break + } + + lines.push(BytePos(index) + output_offset); + + // Clear the bit, so we can find the next one. + newlines_mask &= (!1) << index; + } + + // We are done for this chunk. 
All control characters were + // newlines and we took care of those. + continue + } else { + // Some of the control characters are not newlines, + // fall through to the slow path below. + } + } else { + // No control characters, nothing to record for this chunk + continue + } + } + + // The slow path. + // There are control chars in here, fallback to generic decoding. + let scan_start = chunk_index * CHUNK_SIZE + intra_chunk_offset; + intra_chunk_offset = analyze_filemap_generic( + &src[scan_start .. ], + CHUNK_SIZE - intra_chunk_offset, + BytePos::from_usize(scan_start) + output_offset, + lines, + multi_byte_chars, + non_narrow_chars + ); + } + + // There might still be a tail left to analyze + let tail_start = chunk_count * CHUNK_SIZE + intra_chunk_offset; + if tail_start < src.len() { + analyze_filemap_generic(&src[tail_start as usize ..], + src.len() - tail_start, + output_offset + BytePos::from_usize(tail_start), + lines, + multi_byte_chars, + non_narrow_chars); + } + } + } else { + + // The target (or compiler version) does not support SSE2 ... + fn analyze_filemap_dispatch(src: &str, + filemap_start_pos: BytePos, + lines: &mut Vec, + multi_byte_chars: &mut Vec, + non_narrow_chars: &mut Vec) { + analyze_filemap_generic(src, + src.len(), + filemap_start_pos, + lines, + multi_byte_chars, + non_narrow_chars); + } + } +} + +// `scan_len` determines the number of bytes in `src` to scan. Note that the +// function can read past `scan_len` if a multi-byte character start within the +// range but extends past it. The overflow is returned by the function. +fn analyze_filemap_generic(src: &str, + scan_len: usize, + output_offset: BytePos, + lines: &mut Vec, + multi_byte_chars: &mut Vec, + non_narrow_chars: &mut Vec) + -> usize +{ + assert!(src.len() >= scan_len); + let mut i = 0; + let src_bytes = src.as_bytes(); + + while i < scan_len { + let byte = unsafe { + // We verified that i < scan_len <= src.len() + *src_bytes.get_unchecked(i as usize) + }; + + // How much to advance in order to get to the next UTF-8 char in the + // string. + let mut char_len = 1; + + if byte < 32 { + // This is an ASCII control character, it could be one of the cases + // that are interesting to us. + + let pos = BytePos::from_usize(i) + output_offset; + + match byte { + b'\n' => { + lines.push(pos + BytePos(1)); + } + b'\t' => { + non_narrow_chars.push(NonNarrowChar::Tab(pos)); + } + _ => { + non_narrow_chars.push(NonNarrowChar::ZeroWidth(pos)); + } + } + } else if byte >= 127 { + // The slow path: + // This is either ASCII control character "DEL" or the beginning of + // a multibyte char. Just decode to `char`. + let c = (&src[i..]).chars().next().unwrap(); + char_len = c.len_utf8(); + + let pos = BytePos::from_usize(i) + output_offset; + + if char_len > 1 { + assert!(char_len >=2 && char_len <= 4); + let mbc = MultiByteChar { + pos, + bytes: char_len as u8, + }; + multi_byte_chars.push(mbc); + } + + // Assume control characters are zero width. + // FIXME: How can we decide between `width` and `width_cjk`? + let char_width = UnicodeWidthChar::width(c).unwrap_or(0); + + if char_width != 1 { + non_narrow_chars.push(NonNarrowChar::new(pos, char_width)); + } + } + + i += char_len; + } + + i - scan_len +} + + + +macro_rules! 
test { + (case: $test_name:ident, + text: $text:expr, + filemap_start_pos: $filemap_start_pos:expr, + lines: $lines:expr, + multi_byte_chars: $multi_byte_chars:expr, + non_narrow_chars: $non_narrow_chars:expr,) => ( + + #[test] + fn $test_name() { + + let (lines, multi_byte_chars, non_narrow_chars) = + analyze_filemap($text, BytePos($filemap_start_pos)); + + let expected_lines: Vec = $lines + .into_iter() + .map(|pos| BytePos(pos)) + .collect(); + + assert_eq!(lines, expected_lines); + + let expected_mbcs: Vec = $multi_byte_chars + .into_iter() + .map(|(pos, bytes)| MultiByteChar { + pos: BytePos(pos), + bytes, + }) + .collect(); + + assert_eq!(multi_byte_chars, expected_mbcs); + + let expected_nncs: Vec = $non_narrow_chars + .into_iter() + .map(|(pos, width)| { + NonNarrowChar::new(BytePos(pos), width) + }) + .collect(); + + assert_eq!(non_narrow_chars, expected_nncs); + }) +} + +test!( + case: empty_text, + text: "", + filemap_start_pos: 0, + lines: vec![], + multi_byte_chars: vec![], + non_narrow_chars: vec![], +); + +test!( + case: newlines_short, + text: "a\nc", + filemap_start_pos: 0, + lines: vec![0, 2], + multi_byte_chars: vec![], + non_narrow_chars: vec![], +); + +test!( + case: newlines_long, + text: "012345678\nabcdef012345678\na", + filemap_start_pos: 0, + lines: vec![0, 10, 26], + multi_byte_chars: vec![], + non_narrow_chars: vec![], +); + +test!( + case: newline_and_multi_byte_char_in_same_chunk, + text: "01234β789\nbcdef0123456789abcdef", + filemap_start_pos: 0, + lines: vec![0, 11], + multi_byte_chars: vec![(5, 2)], + non_narrow_chars: vec![], +); + +test!( + case: newline_and_control_char_in_same_chunk, + text: "01234\u{07}6789\nbcdef0123456789abcdef", + filemap_start_pos: 0, + lines: vec![0, 11], + multi_byte_chars: vec![], + non_narrow_chars: vec![(5, 0)], +); + +test!( + case: multi_byte_char_short, + text: "aβc", + filemap_start_pos: 0, + lines: vec![0], + multi_byte_chars: vec![(1, 2)], + non_narrow_chars: vec![], +); + +test!( + case: multi_byte_char_long, + text: "0123456789abcΔf012345β", + filemap_start_pos: 0, + lines: vec![0], + multi_byte_chars: vec![(13, 2), (22, 2)], + non_narrow_chars: vec![], +); + +test!( + case: multi_byte_char_across_chunk_boundary, + text: "0123456789abcdeΔ123456789abcdef01234", + filemap_start_pos: 0, + lines: vec![0], + multi_byte_chars: vec![(15, 2)], + non_narrow_chars: vec![], +); + +test!( + case: multi_byte_char_across_chunk_boundary_tail, + text: "0123456789abcdeΔ....", + filemap_start_pos: 0, + lines: vec![0], + multi_byte_chars: vec![(15, 2)], + non_narrow_chars: vec![], +); + +test!( + case: non_narrow_short, + text: "0\t2", + filemap_start_pos: 0, + lines: vec![0], + multi_byte_chars: vec![], + non_narrow_chars: vec![(1, 4)], +); + +test!( + case: non_narrow_long, + text: "01\t3456789abcdef01234567\u{07}9", + filemap_start_pos: 0, + lines: vec![0], + multi_byte_chars: vec![], + non_narrow_chars: vec![(2, 4), (24, 0)], +); + +test!( + case: output_offset_all, + text: "01\t345\n789abcΔf01234567\u{07}9\nbcΔf", + filemap_start_pos: 1000, + lines: vec![0 + 1000, 7 + 1000, 27 + 1000], + multi_byte_chars: vec![(13 + 1000, 2), (29 + 1000, 2)], + non_narrow_chars: vec![(2 + 1000, 4), (24 + 1000, 0)], +); diff --git a/src/libsyntax_pos/lib.rs b/src/libsyntax_pos/lib.rs index 756e0c059a729..55dec31511c37 100644 --- a/src/libsyntax_pos/lib.rs +++ b/src/libsyntax_pos/lib.rs @@ -24,6 +24,7 @@ #![feature(optin_builtin_traits)] #![allow(unused_attributes)] #![feature(specialization)] +#![feature(stdsimd)] use std::borrow::Cow; use 
std::cell::Cell; @@ -47,6 +48,9 @@ use serialize::{Encodable, Decodable, Encoder, Decoder}; extern crate serialize; extern crate serialize as rustc_serialize; // used by deriving +#[macro_use] +extern crate cfg_if; + extern crate unicode_width; pub mod edition; @@ -58,6 +62,8 @@ pub use span_encoding::{Span, DUMMY_SP}; pub mod symbol; +mod analyze_filemap; + pub struct Globals { symbol_interner: Lock, span_interner: Lock, @@ -652,16 +658,16 @@ impl From> for MultiSpan { pub const NO_EXPANSION: SyntaxContext = SyntaxContext::empty(); /// Identifies an offset of a multi-byte character in a FileMap -#[derive(Copy, Clone, RustcEncodable, RustcDecodable, Eq, PartialEq)] +#[derive(Copy, Clone, RustcEncodable, RustcDecodable, Eq, PartialEq, Debug)] pub struct MultiByteChar { /// The absolute offset of the character in the CodeMap pub pos: BytePos, /// The number of bytes, >=2 - pub bytes: usize, + pub bytes: u8, } /// Identifies an offset of a non-narrow character in a FileMap -#[derive(Copy, Clone, RustcEncodable, RustcDecodable, Eq, PartialEq)] +#[derive(Copy, Clone, RustcEncodable, RustcDecodable, Eq, PartialEq, Debug)] pub enum NonNarrowChar { /// Represents a zero-width character ZeroWidth(BytePos), @@ -779,11 +785,11 @@ pub struct FileMap { /// The end position of this source in the CodeMap pub end_pos: BytePos, /// Locations of lines beginnings in the source code - pub lines: Lock>, + pub lines: Vec, /// Locations of multi-byte characters in the source code - pub multibyte_chars: Lock>, + pub multibyte_chars: Vec, /// Width of characters that are not narrow in the source code - pub non_narrow_chars: Lock>, + pub non_narrow_chars: Vec, /// A hash of the filename, used for speeding up the incr. comp. hashing. pub name_hash: u128, } @@ -797,7 +803,7 @@ impl Encodable for FileMap { s.emit_struct_field("start_pos", 4, |s| self.start_pos.encode(s))?; s.emit_struct_field("end_pos", 5, |s| self.end_pos.encode(s))?; s.emit_struct_field("lines", 6, |s| { - let lines = self.lines.borrow(); + let lines = &self.lines[..]; // store the length s.emit_u32(lines.len() as u32)?; @@ -843,10 +849,10 @@ impl Encodable for FileMap { Ok(()) })?; s.emit_struct_field("multibyte_chars", 7, |s| { - (*self.multibyte_chars.borrow()).encode(s) + self.multibyte_chars.encode(s) })?; s.emit_struct_field("non_narrow_chars", 8, |s| { - (*self.non_narrow_chars.borrow()).encode(s) + self.non_narrow_chars.encode(s) })?; s.emit_struct_field("name_hash", 9, |s| { self.name_hash.encode(s) @@ -914,9 +920,9 @@ impl Decodable for FileMap { src: None, src_hash, external_src: Lock::new(ExternalSource::AbsentOk), - lines: Lock::new(lines), - multibyte_chars: Lock::new(multibyte_chars), - non_narrow_chars: Lock::new(non_narrow_chars), + lines, + multibyte_chars, + non_narrow_chars, name_hash, }) }) @@ -949,6 +955,9 @@ impl FileMap { }; let end_pos = start_pos.to_usize() + src.len(); + let (lines, multibyte_chars, non_narrow_chars) = + analyze_filemap::analyze_filemap(&src[..], start_pos); + FileMap { name, name_was_remapped, @@ -959,37 +968,17 @@ impl FileMap { external_src: Lock::new(ExternalSource::Unneeded), start_pos, end_pos: Pos::from_usize(end_pos), - lines: Lock::new(Vec::new()), - multibyte_chars: Lock::new(Vec::new()), - non_narrow_chars: Lock::new(Vec::new()), + lines, + multibyte_chars, + non_narrow_chars, name_hash, } } - /// EFFECT: register a start-of-line offset in the - /// table of line-beginnings. 
- /// UNCHECKED INVARIANT: these offsets must be added in the right - /// order and must be in the right places; there is shared knowledge - /// about what ends a line between this file and parse.rs - /// WARNING: pos param here is the offset relative to start of CodeMap, - /// and CodeMap will append a newline when adding a filemap without a newline at the end, - /// so the safe way to call this is with value calculated as - /// filemap.start_pos + newline_offset_relative_to_the_start_of_filemap. - pub fn next_line(&self, pos: BytePos) { - // the new charpos must be > the last one (or it's the first one). - let mut lines = self.lines.borrow_mut(); - let line_len = lines.len(); - assert!(line_len == 0 || ((*lines)[line_len - 1] < pos)); - lines.push(pos); - } - /// Return the BytePos of the beginning of the current line. - pub fn line_begin_pos(&self) -> BytePos { - let lines = self.lines.borrow(); - match lines.last() { - Some(&line_pos) => line_pos, - None => self.start_pos, - } + pub fn line_begin_pos(&self, pos: BytePos) -> BytePos { + let line_index = self.lookup_line(pos).unwrap(); + self.lines[line_index] } /// Add externally loaded source. @@ -1040,8 +1029,7 @@ impl FileMap { } let begin = { - let lines = self.lines.borrow(); - let line = if let Some(line) = lines.get(line_number) { + let line = if let Some(line) = self.lines.get(line_number) { line } else { return None; @@ -1059,35 +1047,6 @@ impl FileMap { } } - pub fn record_multibyte_char(&self, pos: BytePos, bytes: usize) { - assert!(bytes >=2 && bytes <= 4); - let mbc = MultiByteChar { - pos, - bytes, - }; - self.multibyte_chars.borrow_mut().push(mbc); - } - - #[inline] - pub fn record_width(&self, pos: BytePos, ch: char) { - let width = match ch { - '\t' => - // Tabs will consume 4 columns. - 4, - '\n' => - // Make newlines take one column so that displayed spans can point them. - 1, - ch => - // Assume control characters are zero width. - // FIXME: How can we decide between `width` and `width_cjk`? - unicode_width::UnicodeWidthChar::width(ch).unwrap_or(0), - }; - // Only record non-narrow characters. - if width != 1 { - self.non_narrow_chars.borrow_mut().push(NonNarrowChar::new(pos, width)); - } - } - pub fn is_real_file(&self) -> bool { self.name.is_real() } @@ -1100,7 +1059,7 @@ impl FileMap { self.end_pos.0 - self.start_pos.0 } pub fn count_lines(&self) -> usize { - self.lines.borrow().len() + self.lines.len() } /// Find the line containing the given position. The return value is the @@ -1108,13 +1067,12 @@ impl FileMap { /// number. If the filemap is empty or the position is located before the /// first line, None is returned. 
pub fn lookup_line(&self, pos: BytePos) -> Option { - let lines = self.lines.borrow(); - if lines.len() == 0 { + if self.lines.len() == 0 { return None; } - let line_index = lookup_line(&lines[..], pos); - assert!(line_index < lines.len() as isize); + let line_index = lookup_line(&self.lines[..], pos); + assert!(line_index < self.lines.len() as isize); if line_index >= 0 { Some(line_index as usize) } else { @@ -1127,12 +1085,11 @@ impl FileMap { return (self.start_pos, self.end_pos); } - let lines = self.lines.borrow(); - assert!(line_index < lines.len()); - if line_index == (lines.len() - 1) { - (lines[line_index], self.end_pos) + assert!(line_index < self.lines.len()); + if line_index == (self.lines.len() - 1) { + (self.lines[line_index], self.end_pos) } else { - (lines[line_index], lines[line_index + 1]) + (self.lines[line_index], self.lines[line_index + 1]) } } @@ -1156,6 +1113,8 @@ fn remove_bom(src: &mut String) { pub trait Pos { fn from_usize(n: usize) -> Self; fn to_usize(&self) -> usize; + fn from_u32(n: u32) -> Self; + fn to_u32(&self) -> u32; } /// A byte offset. Keep this small (currently 32-bits), as AST contains @@ -1177,7 +1136,13 @@ impl Pos for BytePos { fn from_usize(n: usize) -> BytePos { BytePos(n as u32) } #[inline(always)] - fn to_usize(&self) -> usize { let BytePos(n) = *self; n as usize } + fn to_usize(&self) -> usize { self.0 as usize } + + #[inline(always)] + fn from_u32(n: u32) -> BytePos { BytePos(n) } + + #[inline(always)] + fn to_u32(&self) -> u32 { self.0 } } impl Add for BytePos { @@ -1215,7 +1180,13 @@ impl Pos for CharPos { fn from_usize(n: usize) -> CharPos { CharPos(n) } #[inline(always)] - fn to_usize(&self) -> usize { let CharPos(n) = *self; n } + fn to_usize(&self) -> usize { self.0 } + + #[inline(always)] + fn from_u32(n: u32) -> CharPos { CharPos(n as usize) } + + #[inline(always)] + fn to_u32(&self) -> u32 { self.0 as u32} } impl Add for CharPos {
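Reviewer note, tying the pieces together: the `FileMap` fields are now filled once at construction by `analyze_filemap`, instead of being recorded incrementally by the lexer. The following is a simplified, scalar sketch of what that analysis produces (it mirrors the generic, non-SSE2 path only, and uses plain tuples instead of rustc's `MultiByteChar`/`NonNarrowChar` types; `unicode-width` is the same dependency libsyntax_pos already uses):

```rust
// Scalar sketch of the eager analysis: line-start offsets, multi-byte
// characters, and non-narrow (zero- or wide-width) characters, all expressed
// relative to the CodeMap by adding `start_pos`.
use unicode_width::UnicodeWidthChar;

fn analyze(src: &str, start_pos: u32) -> (Vec<u32>, Vec<(u32, u8)>, Vec<(u32, usize)>) {
    let mut lines = vec![start_pos];       // a line always starts at the file start
    let mut multi_byte_chars = Vec::new(); // (pos, UTF-8 length)
    let mut non_narrow_chars = Vec::new(); // (pos, display width)

    for (i, c) in src.char_indices() {
        let pos = start_pos + i as u32;
        let len = c.len_utf8();
        if len > 1 {
            multi_byte_chars.push((pos, len as u8));
        }
        let width = match c {
            '\n' => {
                // Optimistically register the start of the *next* line;
                // trimmed below if it falls past the end of the source.
                lines.push(pos + 1);
                1
            }
            '\t' => 4, // tabs are displayed as four columns
            c => UnicodeWidthChar::width(c).unwrap_or(0),
        };
        if width != 1 {
            non_narrow_chars.push((pos, width));
        }
    }
    // Drop a trailing line start that points one past the end of the source.
    if lines.last() == Some(&(start_pos + src.len() as u32)) {
        lines.pop();
    }
    (lines, multi_byte_chars, non_narrow_chars)
}

fn main() {
    let (lines, mbcs, nncs) = analyze("a\tb\n€!\n", 0);
    assert_eq!(lines, vec![0, 4]);  // second line starts after the '\n' at offset 3
    assert_eq!(mbcs, vec![(4, 3)]); // '€' is 3 bytes at offset 4
    assert_eq!(nncs, vec![(1, 4)]); // the tab is 4 columns wide
}
```

Once these vectors exist, `lookup_line`, `line_bounds`, and `bytepos_to_file_charpos` reduce to searches and arithmetic over immutable data, which is what lets the `Lock` wrappers around the `FileMap` fields be removed throughout this patch.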