Improve autovectorization of to_lowercase / to_uppercase functions

jhorstmann · jhorstmann · commit 2be10833fcde · 2024-04-11T10:02:40.000+02:00
Refactor the `convert_while_ascii` helper function to make it more suitable for auto-vectorization and to process the full ASCII prefix of the string. The generic case-conversion logic is now invoked only from the first non-ASCII character onward. On a microbenchmark with a small ASCII-only input, the runtime decreases from ~55ns to ~18ns per iteration. The new implementation also reduces the amount of unsafe code and encapsulates all of it inside the helper function. Fixes #123712
diff --git a/library/alloc/benches/string.rs b/library/alloc/benches/string.rs
@@ -162,3 +162,10 @@ fn bench_insert_str_long(b: &mut Bencher) {
         x
     })
 }
+
+#[bench]
+fn bench_to_lowercase(b: &mut Bencher) {
+    let s = "Hello there, the quick brown fox jumped over the lazy dog! \
+              Lorem ipsum dolor sit amet, consectetur. ";
+    b.iter(|| s.to_lowercase())
+}
diff --git a/library/alloc/src/str.rs b/library/alloc/src/str.rs
@@ -10,6 +10,7 @@
 use core::borrow::{Borrow, BorrowMut};
 use core::iter::FusedIterator;
 use core::mem;
+use core::mem::MaybeUninit;
 use core::ptr;
 use core::str::pattern::{DoubleEndedSearcher, Pattern, ReverseSearcher, Searcher};
 use core::unicode::conversions;
@@ -366,14 +367,7 @@ impl str {
                   without modifying the original"]
     #[stable(feature = "unicode_case_mapping", since = "1.2.0")]
     pub fn to_lowercase(&self) -> String {
-        let out = convert_while_ascii(self.as_bytes(), u8::to_ascii_lowercase);
-
-        // Safety: we know this is a valid char boundary since
-        // out.len() is only progressed if ascii bytes are found
-        let rest = unsafe { self.get_unchecked(out.len()..) };
-
-        // Safety: We have written only valid ASCII to our vec
-        let mut s = unsafe { String::from_utf8_unchecked(out) };
+        let (mut s, rest) = convert_while_ascii(self, u8::to_ascii_lowercase);
 
         for (i, c) in rest[..].char_indices() {
             if c == 'Σ' {
@@ -457,14 +451,7 @@ impl str {
                   without modifying the original"]
     #[stable(feature = "unicode_case_mapping", since = "1.2.0")]
     pub fn to_uppercase(&self) -> String {
-        let out = convert_while_ascii(self.as_bytes(), u8::to_ascii_uppercase);
-
-        // Safety: we know this is a valid char boundary since
-        // out.len() is only progressed if ascii bytes are found
-        let rest = unsafe { self.get_unchecked(out.len()..) };
-
-        // Safety: We have written only valid ASCII to our vec
-        let mut s = unsafe { String::from_utf8_unchecked(out) };
+        let (mut s, rest) = convert_while_ascii(self, u8::to_ascii_uppercase);
 
         for c in rest.chars() {
             match conversions::to_upper(c) {
@@ -613,50 +600,74 @@ pub unsafe fn from_boxed_utf8_unchecked(v: Box<[u8]>) -> Box<str> {
     unsafe { Box::from_raw(Box::into_raw(v) as *mut str) }
 }
 
-/// Converts the bytes while the bytes are still ascii.
+/// Converts leading ascii bytes in `s` by calling the `convert` function.
+///
 /// For better average performance, this happens in chunks of `2*size_of::<usize>()`.
-/// Returns a vec with the converted bytes.
+///
+/// Returns a tuple of the converted prefix and the remainder starting from
+/// the first non-ascii character.
 #[inline]
 #[cfg(not(test))]
 #[cfg(not(no_global_oom_handling))]
-fn convert_while_ascii(b: &[u8], convert: fn(&u8) -> u8) -> Vec<u8> {
-    let mut out = Vec::with_capacity(b.len());
-
+fn convert_while_ascii(s: &str, convert: fn(&u8) -> u8) -> (String, &str) {
     const USIZE_SIZE: usize = mem::size_of::<usize>();
     const MAGIC_UNROLL: usize = 2;
     const N: usize = USIZE_SIZE * MAGIC_UNROLL;
-    const NONASCII_MASK: usize = usize::from_ne_bytes([0x80; USIZE_SIZE]);
 
-    let mut i = 0;
-    unsafe {
-        while i + N <= b.len() {
-            // Safety: we have checks the sizes `b` and `out` to know that our
-            let in_chunk = b.get_unchecked(i..i + N);
-            let out_chunk = out.spare_capacity_mut().get_unchecked_mut(i..i + N);
-
-            let mut bits = 0;
-            for j in 0..MAGIC_UNROLL {
-                // read the bytes 1 usize at a time (unaligned since we haven't checked the alignment)
-                // safety: in_chunk is valid bytes in the range
-                bits |= in_chunk.as_ptr().cast::<usize>().add(j).read_unaligned();
-            }
-            // if our chunks aren't ascii, then return only the prior bytes as init
-            if bits & NONASCII_MASK != 0 {
-                break;
-            }
+    let mut slice = s.as_bytes();
+    let mut out = Vec::with_capacity(slice.len());
+    let mut out_slice = out.spare_capacity_mut();
 
-            // perform the case conversions on N bytes (gets heavily autovec'd)
-            for j in 0..N {
-                // safety: in_chunk and out_chunk is valid bytes in the range
-                let out = out_chunk.get_unchecked_mut(j);
-                out.write(convert(in_chunk.get_unchecked(j)));
-            }
+    let mut i = 0_usize;
 
-            // mark these bytes as initialised
-            i += N;
+    // process the input in chunks to enable auto-vectorization
+    while slice.len() >= N {
+        let chunk = &slice[..N];
+        let mut is_ascii = [false; N];
+
+        for j in 0..N {
+            is_ascii[j] = chunk[j] <= 127;
         }
-        out.set_len(i);
+
+        // auto-vectorization for this check is a bit fragile;
+        // summing the flags and comparing against the chunk size gives the
+        // best result, specifically a pmovmskb instruction on x86.
+        if is_ascii.into_iter().map(|x| x as u8).sum::<u8>() as usize != N {
+            break;
+        }
+
+        for j in 0..N {
+            out_slice[j] = MaybeUninit::new(convert(&chunk[j]));
+        }
+
+        i += N;
+        slice = &slice[N..];
+        out_slice = &mut out_slice[N..];
+    }
+
+    // handle the remainder as individual bytes
+    while !slice.is_empty() {
+        let byte = slice[0];
+        if byte > 127 {
+            break;
+        }
+        out_slice[0] = MaybeUninit::new(convert(&byte));
+        i += 1;
+        slice = &slice[1..];
+        out_slice = &mut out_slice[1..];
     }
 
-    out
+    unsafe {
+        // SAFETY: i bytes have been initialized above
+        out.set_len(i);
+
+        // SAFETY: We have written only valid ascii to the output vec
+        let ascii_string = String::from_utf8_unchecked(out);
+
+        // SAFETY: we know this is a valid char boundary
+        // since we only skipped over leading ascii bytes
+        let rest = core::str::from_utf8_unchecked(slice);
+
+        (ascii_string, rest)
+    }
 }

Original file line number	Diff line number	Diff line change
`@@ -162,3 +162,10 @@ fn bench_insert_str_long(b: &mut Bencher) {`
`162`	`162`	`x`
`163`	`163`	`})`
`164`	`164`	`}`
	`165`	`+`
	`166`	`+#[bench]`
	`167`	`+fn bench_to_lowercase(b: &mut Bencher) {`
	`168`	`+ let s = "Hello there, the quick brown fox jumped over the lazy dog! \`
	`169`	`+ Lorem ipsum dolor sit amet, consectetur. ";`
	`170`	`+ b.iter(\|\| s.to_lowercase())`
	`171`	`+}`