Skip to content

Commit 1fd2d77

Browse files
committed
auto merge of #12029 : zkamsler/rust/merge-sort-allocations, r=huonw
This pull request: 1) Changes the initial insertion sort to be in-place, and defers allocation of working set until merge is needed. 2) Increases the maximum run length to use insertion sort for, from 8 to 32 elements. This increases the size of vectors that will not allocate, and reduces the number of merge passes by two. It seemed to be the sweet spot in the benchmarks that I ran. Here are the results of some benchmarks. Note that they are sorting u64s, so types that are more expensive to compare or copy may have different behaviors. Before changes: ``` test vec::bench::sort_random_large bench: 719753 ns/iter (+/- 130173) = 111 MB/s test vec::bench::sort_random_medium bench: 4726 ns/iter (+/- 742) = 169 MB/s test vec::bench::sort_random_small bench: 344 ns/iter (+/- 76) = 116 MB/s test vec::bench::sort_sorted bench: 437244 ns/iter (+/- 70043) = 182 MB/s ``` Deferred allocation (8 element insertion sort): ``` test vec::bench::sort_random_large bench: 702630 ns/iter (+/- 88158) = 113 MB/s test vec::bench::sort_random_medium bench: 4529 ns/iter (+/- 497) = 176 MB/s test vec::bench::sort_random_small bench: 185 ns/iter (+/- 49) = 216 MB/s test vec::bench::sort_sorted bench: 425853 ns/iter (+/- 60907) = 187 MB/s ``` Deferred allocation (16 element insertion sort): ``` test vec::bench::sort_random_large bench: 692783 ns/iter (+/- 165837) = 115 MB/s test vec::bench::sort_random_medium bench: 4434 ns/iter (+/- 722) = 180 MB/s test vec::bench::sort_random_small bench: 187 ns/iter (+/- 38) = 213 MB/s test vec::bench::sort_sorted bench: 393783 ns/iter (+/- 85548) = 203 MB/s ``` Deferred allocation (32 element insertion sort): ``` test vec::bench::sort_random_large bench: 682556 ns/iter (+/- 131008) = 117 MB/s test vec::bench::sort_random_medium bench: 4370 ns/iter (+/- 1369) = 183 MB/s test vec::bench::sort_random_small bench: 179 ns/iter (+/- 32) = 223 MB/s test vec::bench::sort_sorted bench: 358353 ns/iter (+/- 65423) = 223 MB/s ``` Deferred allocation (64 
element insertion sort): ``` test vec::bench::sort_random_large bench: 712040 ns/iter (+/- 132454) = 112 MB/s test vec::bench::sort_random_medium bench: 4425 ns/iter (+/- 784) = 180 MB/s test vec::bench::sort_random_small bench: 179 ns/iter (+/- 81) = 223 MB/s test vec::bench::sort_sorted bench: 317812 ns/iter (+/- 62675) = 251 MB/s ``` This is the best I could manage with the basic merge sort while keeping the invariant that the original vector must contain each element exactly once when the comparison function is called. If one is not married to a stable sort, an in-place n*log(n) sorting algorithm may have better performance in some cases. for #12011 cc @huonw
2 parents 7d7a060 + cebe5e8 commit 1fd2d77

File tree

1 file changed

+104
-5
lines changed

1 file changed

+104
-5
lines changed

src/libstd/vec.rs

+104-5
Original file line numberDiff line numberDiff line change
@@ -1812,12 +1812,70 @@ impl<T:Eq> OwnedEqVector<T> for ~[T] {
18121812
}
18131813
}
18141814

1815+
fn insertion_sort<T>(v: &mut [T], compare: |&T, &T| -> Ordering) {
1816+
let len = v.len() as int;
1817+
let buf_v = v.as_mut_ptr();
1818+
1819+
// 1 <= i < len;
1820+
for i in range(1, len) {
1821+
// j satisfies: 0 <= j <= i;
1822+
let mut j = i;
1823+
unsafe {
1824+
// `i` is in bounds.
1825+
let read_ptr = buf_v.offset(i) as *T;
1826+
1827+
// find where to insert, we need to do strict <,
1828+
// rather than <=, to maintain stability.
1829+
1830+
// 0 <= j - 1 < len, so .offset(j - 1) is in bounds.
1831+
while j > 0 &&
1832+
compare(&*read_ptr, &*buf_v.offset(j - 1)) == Less {
1833+
j -= 1;
1834+
}
1835+
1836+
// shift everything to the right, to make space to
1837+
// insert this value.
1838+
1839+
// j + 1 could be `len` (for the last `i`), but in
1840+
// that case, `i == j` so we don't copy. The
1841+
// `.offset(j)` is always in bounds.
1842+
1843+
if i != j {
1844+
let tmp = ptr::read_ptr(read_ptr);
1845+
ptr::copy_memory(buf_v.offset(j + 1),
1846+
buf_v.offset(j),
1847+
(i - j) as uint);
1848+
ptr::copy_nonoverlapping_memory(buf_v.offset(j),
1849+
&tmp as *T,
1850+
1);
1851+
cast::forget(tmp);
1852+
}
1853+
}
1854+
}
1855+
}
1856+
18151857
fn merge_sort<T>(v: &mut [T], compare: |&T, &T| -> Ordering) {
18161858
// warning: this wildly uses unsafe.
1817-
static INSERTION: uint = 8;
1859+
static BASE_INSERTION: uint = 32;
1860+
static LARGE_INSERTION: uint = 16;
1861+
1862+
// FIXME #12092: smaller insertion runs seems to make sorting
1863+
// vectors of large elements a little faster on some platforms,
1864+
// but hasn't been tested/tuned extensively
1865+
let insertion = if size_of::<T>() <= 16 {
1866+
BASE_INSERTION
1867+
} else {
1868+
LARGE_INSERTION
1869+
};
18181870

18191871
let len = v.len();
18201872

1873+
// short vectors get sorted in-place via insertion sort to avoid allocations
1874+
if len <= insertion {
1875+
insertion_sort(v, compare);
1876+
return;
1877+
}
1878+
18211879
// allocate some memory to use as scratch memory, we keep the
18221880
// length 0 so we can keep shallow copies of the contents of `v`
18231881
// without risking the dtors running on an object twice if
@@ -1837,9 +1895,9 @@ fn merge_sort<T>(v: &mut [T], compare: |&T, &T| -> Ordering) {
18371895
// We could hardcode the sorting comparisons here, and we could
18381896
// manipulate/step the pointers themselves, rather than repeatedly
18391897
// .offset-ing.
1840-
for start in range_step(0, len, INSERTION) {
1841-
// start <= i <= len;
1842-
for i in range(start, cmp::min(start + INSERTION, len)) {
1898+
for start in range_step(0, len, insertion) {
1899+
// start <= i < len;
1900+
for i in range(start, cmp::min(start + insertion, len)) {
18431901
// j satisfies: start <= j <= i;
18441902
let mut j = i as int;
18451903
unsafe {
@@ -1871,7 +1929,7 @@ fn merge_sort<T>(v: &mut [T], compare: |&T, &T| -> Ordering) {
18711929
}
18721930

18731931
// step 2. merge the sorted runs.
1874-
let mut width = INSERTION;
1932+
let mut width = insertion;
18751933
while width < len {
18761934
// merge the sorted runs of length `width` in `buf_dat` two at
18771935
// a time, placing the result in `buf_tmp`.
@@ -4505,4 +4563,45 @@ mod bench {
45054563
});
45064564
bh.bytes = (v.len() * mem::size_of_val(&v[0])) as u64;
45074565
}
4566+
4567+
type BigSortable = (u64,u64,u64,u64);
4568+
4569+
#[bench]
4570+
fn sort_big_random_small(bh: &mut BenchHarness) {
4571+
let mut rng = weak_rng();
4572+
bh.iter(|| {
4573+
let mut v: ~[BigSortable] = rng.gen_vec(5);
4574+
v.sort();
4575+
});
4576+
bh.bytes = 5 * mem::size_of::<BigSortable>() as u64;
4577+
}
4578+
4579+
#[bench]
4580+
fn sort_big_random_medium(bh: &mut BenchHarness) {
4581+
let mut rng = weak_rng();
4582+
bh.iter(|| {
4583+
let mut v: ~[BigSortable] = rng.gen_vec(100);
4584+
v.sort();
4585+
});
4586+
bh.bytes = 100 * mem::size_of::<BigSortable>() as u64;
4587+
}
4588+
4589+
#[bench]
4590+
fn sort_big_random_large(bh: &mut BenchHarness) {
4591+
let mut rng = weak_rng();
4592+
bh.iter(|| {
4593+
let mut v: ~[BigSortable] = rng.gen_vec(10000);
4594+
v.sort();
4595+
});
4596+
bh.bytes = 10000 * mem::size_of::<BigSortable>() as u64;
4597+
}
4598+
4599+
#[bench]
4600+
fn sort_big_sorted(bh: &mut BenchHarness) {
4601+
let mut v = vec::from_fn(10000u, |i| (i, i, i, i));
4602+
bh.iter(|| {
4603+
v.sort();
4604+
});
4605+
bh.bytes = (v.len() * mem::size_of_val(&v[0])) as u64;
4606+
}
45084607
}

0 commit comments

Comments
 (0)