Skip to content

Commit d70a9b9

Browse files
committed
auto merge of #14526 : pczarn/rust/hashmap-opt, r=alexcrichton
An interface that gives better control over the load factor and the minimum capacity for HashMap. The size of `HashMap<K, V>` is now 64 bytes by default on a 64-bit platform (or at least 40 bytes, that is, 3 words fewer, with FNV and without a minimum capacity).

Unanswered questions about `ResizePolicy`:
* Should it control the `INITIAL_CAPACITY`?
* Should it fully control the resizing behavior, even though the capacity always changes by a factor of 2?
* Is caching `grow_at` desirable?

Special thanks to @eddyb and @pnkfelix.
2 parents 57e7147 + 2202b10 commit d70a9b9

File tree

1 file changed

+117
-72
lines changed

1 file changed

+117
-72
lines changed

src/libcollections/hashmap.rs

Lines changed: 117 additions & 72 deletions
Original file line numberDiff line numberDiff line change
@@ -101,6 +101,7 @@ mod table {
101101
/// There's currently no "debug-only" asserts in rust, so if you're reading
102102
/// this and going "what? of course there are debug-only asserts!", then
103103
/// please make this use them!
104+
#[unsafe_no_drop_flag]
104105
pub struct RawTable<K, V> {
105106
capacity: uint,
106107
size: uint,
@@ -549,38 +550,59 @@ mod table {
549550

550551
assert_eq!(self.size, 0);
551552

552-
let hashes_size = self.capacity * size_of::<u64>();
553-
let keys_size = self.capacity * size_of::<K>();
554-
let vals_size = self.capacity * size_of::<V>();
555-
let (align, _, _, _, size) = calculate_offsets(hashes_size, min_align_of::<u64>(),
556-
keys_size, min_align_of::<K>(),
557-
vals_size, min_align_of::<V>());
553+
if self.hashes.is_not_null() {
554+
let hashes_size = self.capacity * size_of::<u64>();
555+
let keys_size = self.capacity * size_of::<K>();
556+
let vals_size = self.capacity * size_of::<V>();
557+
let (align, _, _, _, size) = calculate_offsets(hashes_size, min_align_of::<u64>(),
558+
keys_size, min_align_of::<K>(),
559+
vals_size, min_align_of::<V>());
560+
561+
unsafe {
562+
deallocate(self.hashes as *mut u8, size, align);
563+
// Remember how everything was allocated out of one buffer
564+
// during initialization? We only need one call to free here.
565+
}
558566

559-
unsafe {
560-
deallocate(self.hashes as *mut u8, size, align);
561-
// Remember how everything was allocated out of one buffer
562-
// during initialization? We only need one call to free here.
567+
self.hashes = RawPtr::null();
563568
}
564569
}
565570
}
566571
}
567572

568-
// We use this type for the load factor, to avoid floating point operations
569-
// which might not be supported efficiently on some hardware.
570-
//
571-
// We use small u16s here to save space in the hashtable. They get upcasted
572-
// to u64s when we actually use them.
573-
type Fraction = (u16, u16); // (numerator, denominator)
574-
575-
// multiplication by a fraction, in a way that won't generally overflow for
576-
// array sizes outside a factor of 10 of U64_MAX.
577-
fn fraction_mul(lhs: uint, (num, den): Fraction) -> uint {
578-
(((lhs as u64) * (num as u64)) / (den as u64)) as uint
579-
}
580-
581573
static INITIAL_LOG2_CAP: uint = 5;
582574
static INITIAL_CAPACITY: uint = 1 << INITIAL_LOG2_CAP; // 2^5
583-
static INITIAL_LOAD_FACTOR: Fraction = (9, 10);
575+
576+
/// The default behavior of HashMap implements a load factor of 90.9%.
577+
/// This behavior is characterized by the following conditions:
578+
///
579+
/// - if `size * 1.1 < cap < size * 8` then shouldn't resize
580+
/// - if `cap < minimum_capacity * 2` then shouldn't shrink
581+
#[deriving(Clone)]
582+
struct DefaultResizePolicy {
583+
/// Doubled minimal capacity. The capacity must never drop below
584+
/// the minimum capacity. (The check happens before the capacity
585+
/// is potentially halved.)
586+
minimum_capacity2: uint
587+
}
588+
589+
impl DefaultResizePolicy {
590+
fn new(new_capacity: uint) -> DefaultResizePolicy {
591+
DefaultResizePolicy {
592+
minimum_capacity2: new_capacity << 1
593+
}
594+
}
595+
596+
#[inline]
597+
fn capacity_range(&self, new_size: uint) -> (uint, uint) {
598+
((new_size * 11) / 10, max(new_size << 3, self.minimum_capacity2))
599+
}
600+
601+
#[inline]
602+
fn reserve(&mut self, new_capacity: uint) {
603+
self.minimum_capacity2 = new_capacity << 1;
604+
}
605+
}
584606

585607
// The main performance trick in this hashmap is called Robin Hood Hashing.
586608
// It gains its excellent performance from one key invariant:
@@ -593,13 +615,13 @@ static INITIAL_LOAD_FACTOR: Fraction = (9, 10);
593615
// high load factors with good performance. The 90% load factor I use is rather
594616
// conservative.
595617
//
596-
// > Why a load factor of 90%?
618+
// > Why a load factor of approximately 90%?
597619
//
598620
// In general, all the distances to initial buckets will converge on the mean.
599621
// At a load factor of α, the odds of finding the target bucket after k
600622
// probes is approximately 1-α^k. If we set this equal to 50% (since we converge
601623
// on the mean) and set k=8 (64-byte cache line / 8-byte hash), α=0.92. I round
602-
// this down to 0.90 to make the math easier on the CPU and avoid its FPU.
624+
// this down to make the math easier on the CPU and avoid its FPU.
603625
// Since on average we start the probing in the middle of a cache line, this
604626
// strategy pulls in two cache lines of hashes on every lookup. I think that's
605627
// pretty good, but if you want to trade off some space, it could go down to one
@@ -616,8 +638,6 @@ static INITIAL_LOAD_FACTOR: Fraction = (9, 10);
616638
// ============================
617639
//
618640
// Allow the load factor to be changed dynamically and/or at initialization.
619-
// I'm having trouble figuring out a sane API for this without exporting my
620-
// hackish fraction type, while still avoiding floating point.
621641
//
622642
// Also, would it be possible for us to reuse storage when growing the
623643
// underlying table? This is exactly the use case for 'realloc', and may
@@ -715,31 +735,13 @@ pub struct HashMap<K, V, H = sip::SipHasher> {
715735
// All hashes are keyed on these values, to prevent hash collision attacks.
716736
hasher: H,
717737

718-
// When size == grow_at, we double the capacity.
719-
grow_at: uint,
720-
721-
// The capacity must never drop below this.
722-
minimum_capacity: uint,
723-
724738
table: table::RawTable<K, V>,
725739

726-
// We keep this at the end since it's 4-bytes, unlike everything else
727-
// in this struct. Might as well save a word of padding!
728-
load_factor: Fraction,
729-
}
730-
731-
/// Get the number of elements which will force the capacity to grow.
732-
fn grow_at(capacity: uint, load_factor: Fraction) -> uint {
733-
fraction_mul(capacity, load_factor)
740+
// We keep this at the end since it might as well have tail padding.
741+
resize_policy: DefaultResizePolicy,
734742
}
735743

736744
impl<K: Eq + Hash<S>, V, S, H: Hasher<S>> HashMap<K, V, H> {
737-
/// Get the number of elements which will force the capacity to shrink.
738-
/// When size == self.shrink_at(), we halve the capacity.
739-
fn shrink_at(&self) -> uint {
740-
self.table.capacity() >> 2
741-
}
742-
743745
// Probe the `idx`th bucket for a given hash, returning the index of the
744746
// target bucket.
745747
//
@@ -931,9 +933,12 @@ impl<K: Eq + Hash<S>, V, S, H: Hasher<S>> Container for HashMap<K, V, H> {
931933
}
932934

933935
impl<K: Eq + Hash<S>, V, S, H: Hasher<S>> Mutable for HashMap<K, V, H> {
934-
/// Clear the map, removing all key-value pairs.
936+
/// Clear the map, removing all key-value pairs. Keeps the allocated memory
937+
/// for reuse.
935938
fn clear(&mut self) {
936-
self.minimum_capacity = self.table.size();
939+
// Prevent reallocations from happening from now on. Makes it possible
940+
// for the map to be reused but has a downside: reserves permanently.
941+
self.resize_policy.reserve(self.table.size());
937942

938943
for i in range(0, self.table.capacity()) {
939944
match self.table.peek(i) {
@@ -944,7 +949,6 @@ impl<K: Eq + Hash<S>, V, S, H: Hasher<S>> Mutable for HashMap<K, V, H> {
944949
}
945950
}
946951

947-
948952
impl<K: Eq + Hash<S>, V, S, H: Hasher<S>> Map<K, V> for HashMap<K, V, H> {
949953
fn find<'a>(&'a self, k: &K) -> Option<&'a V> {
950954
self.search(k).map(|idx| {
@@ -1057,11 +1061,9 @@ impl<K: Eq + Hash<S>, V, S, H: Hasher<S>> HashMap<K, V, H> {
10571061
pub fn with_capacity_and_hasher(capacity: uint, hasher: H) -> HashMap<K, V, H> {
10581062
let cap = num::next_power_of_two(max(INITIAL_CAPACITY, capacity));
10591063
HashMap {
1060-
hasher: hasher,
1061-
load_factor: INITIAL_LOAD_FACTOR,
1062-
grow_at: grow_at(cap, INITIAL_LOAD_FACTOR),
1063-
minimum_capacity: cap,
1064-
table: table::RawTable::new(cap),
1064+
hasher: hasher,
1065+
resize_policy: DefaultResizePolicy::new(cap),
1066+
table: table::RawTable::new(cap),
10651067
}
10661068
}
10671069

@@ -1075,7 +1077,7 @@ impl<K: Eq + Hash<S>, V, S, H: Hasher<S>> HashMap<K, V, H> {
10751077
let cap = num::next_power_of_two(
10761078
max(INITIAL_CAPACITY, new_minimum_capacity));
10771079

1078-
self.minimum_capacity = cap;
1080+
self.resize_policy.reserve(cap);
10791081

10801082
if self.table.capacity() < cap {
10811083
self.resize(cap);
@@ -1090,8 +1092,6 @@ impl<K: Eq + Hash<S>, V, S, H: Hasher<S>> HashMap<K, V, H> {
10901092
assert!(self.table.size() <= new_capacity);
10911093
assert!(num::is_power_of_two(new_capacity));
10921094

1093-
self.grow_at = grow_at(new_capacity, self.load_factor);
1094-
10951095
let old_table = replace(&mut self.table, table::RawTable::new(new_capacity));
10961096
let old_size = old_table.size();
10971097

@@ -1105,19 +1105,18 @@ impl<K: Eq + Hash<S>, V, S, H: Hasher<S>> HashMap<K, V, H> {
11051105
/// Performs any necessary resize operations, such that there's space for
11061106
/// new_size elements.
11071107
fn make_some_room(&mut self, new_size: uint) {
1108-
let should_shrink = new_size <= self.shrink_at();
1109-
let should_grow = self.grow_at <= new_size;
1108+
let (grow_at, shrink_at) = self.resize_policy.capacity_range(new_size);
1109+
let cap = self.table.capacity();
11101110

1111-
if should_grow {
1112-
let new_capacity = self.table.capacity() << 1;
1113-
self.resize(new_capacity);
1114-
} else if should_shrink {
1115-
let new_capacity = self.table.capacity() >> 1;
1111+
// An invalid value shouldn't make us run out of space.
1112+
debug_assert!(grow_at >= new_size);
11161113

1117-
// Never shrink below the minimum capacity
1118-
if self.minimum_capacity <= new_capacity {
1119-
self.resize(new_capacity);
1120-
}
1114+
if cap <= grow_at {
1115+
let new_capacity = cap << 1;
1116+
self.resize(new_capacity);
1117+
} else if shrink_at <= cap {
1118+
let new_capacity = cap >> 1;
1119+
self.resize(new_capacity);
11211120
}
11221121
}
11231122

@@ -2025,8 +2024,8 @@ mod test_map {
20252024
assert!(m.is_empty());
20262025

20272026
let mut i = 0u;
2028-
let old_resize_at = m.grow_at;
2029-
while old_resize_at == m.grow_at {
2027+
let old_cap = m.table.capacity();
2028+
while old_cap == m.table.capacity() {
20302029
m.insert(i, i);
20312030
i += 1;
20322031
}
@@ -2035,6 +2034,52 @@ mod test_map {
20352034
assert!(!m.is_empty());
20362035
}
20372036

2037+
#[test]
2038+
fn test_resize_policy() {
2039+
let mut m = HashMap::new();
2040+
2041+
assert_eq!(m.len(), 0);
2042+
assert!(m.is_empty());
2043+
2044+
let initial_cap = m.table.capacity();
2045+
m.reserve(initial_cap * 2);
2046+
let cap = m.table.capacity();
2047+
2048+
assert_eq!(cap, initial_cap * 2);
2049+
2050+
let mut i = 0u;
2051+
for _ in range(0, cap * 3 / 4) {
2052+
m.insert(i, i);
2053+
i += 1;
2054+
}
2055+
2056+
assert_eq!(m.len(), i);
2057+
assert_eq!(m.table.capacity(), cap);
2058+
2059+
for _ in range(0, cap / 4) {
2060+
m.insert(i, i);
2061+
i += 1;
2062+
}
2063+
2064+
let new_cap = m.table.capacity();
2065+
assert_eq!(new_cap, cap * 2);
2066+
2067+
for _ in range(0, cap / 2) {
2068+
i -= 1;
2069+
m.remove(&i);
2070+
assert_eq!(m.table.capacity(), new_cap);
2071+
}
2072+
2073+
for _ in range(0, cap / 2 - 1) {
2074+
i -= 1;
2075+
m.remove(&i);
2076+
}
2077+
2078+
assert_eq!(m.table.capacity(), cap);
2079+
assert_eq!(m.len(), i);
2080+
assert!(!m.is_empty());
2081+
}
2082+
20382083
#[test]
20392084
fn test_find_equiv() {
20402085
let mut m = HashMap::new();

0 commit comments

Comments
 (0)