Skip to content

Optimized HashMap for size. Added DefaultResizePolicy #14526

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 1 commit into from
Jun 5, 2014
Merged
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
189 changes: 117 additions & 72 deletions src/libcollections/hashmap.rs
Original file line number Diff line number Diff line change
Expand Up @@ -101,6 +101,7 @@ mod table {
/// There's currently no "debug-only" asserts in rust, so if you're reading
/// this and going "what? of course there are debug-only asserts!", then
/// please make this use them!
#[unsafe_no_drop_flag]
pub struct RawTable<K, V> {
capacity: uint,
size: uint,
Expand Down Expand Up @@ -549,38 +550,59 @@ mod table {

assert_eq!(self.size, 0);

let hashes_size = self.capacity * size_of::<u64>();
let keys_size = self.capacity * size_of::<K>();
let vals_size = self.capacity * size_of::<V>();
let (align, _, _, _, size) = calculate_offsets(hashes_size, min_align_of::<u64>(),
keys_size, min_align_of::<K>(),
vals_size, min_align_of::<V>());
if self.hashes.is_not_null() {
let hashes_size = self.capacity * size_of::<u64>();
let keys_size = self.capacity * size_of::<K>();
let vals_size = self.capacity * size_of::<V>();
let (align, _, _, _, size) = calculate_offsets(hashes_size, min_align_of::<u64>(),
keys_size, min_align_of::<K>(),
vals_size, min_align_of::<V>());

unsafe {
deallocate(self.hashes as *mut u8, size, align);
// Remember how everything was allocated out of one buffer
// during initialization? We only need one call to free here.
}

unsafe {
deallocate(self.hashes as *mut u8, size, align);
// Remember how everything was allocated out of one buffer
// during initialization? We only need one call to free here.
self.hashes = RawPtr::null();
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This isn't necessary, dropping will always zero the destination.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@eddyb: @huonw advised to add this "just to be safe". Dropping should zero it out.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@eddyb drop zeroing is not to be relied upon. it's going away.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@cmr and when it does, destructors will be run precisely once. (Meaning this is ok.)

}
}
}
}

// We use this type for the load factor, to avoid floating point operations
// which might not be supported efficiently on some hardware.
//
// We use small u16s here to save space in the hashtable. They get upcasted
// to u64s when we actually use them.
type Fraction = (u16, u16); // (numerator, denominator)

// multiplication by a fraction, in a way that won't generally overflow for
// array sizes outside a factor of 10 of U64_MAX.
fn fraction_mul(lhs: uint, (num, den): Fraction) -> uint {
(((lhs as u64) * (num as u64)) / (den as u64)) as uint
}

static INITIAL_LOG2_CAP: uint = 5;
static INITIAL_CAPACITY: uint = 1 << INITIAL_LOG2_CAP; // 2^5
static INITIAL_LOAD_FACTOR: Fraction = (9, 10);

/// The default behavior of HashMap implements a load factor of 90.9%.
/// This behavior is characterized by the following conditions:
///
/// - if `size * 1.1 < cap < size * 4` then shouldn't resize
/// - if `cap < minimum_capacity * 2` then shouldn't shrink
#[deriving(Clone)]
struct DefaultResizePolicy {
/// Doubled minimal capacity. The capacity must never drop below
/// the minimum capacity. (The check happens before the capacity
/// is potentially halved.)
minimum_capacity2: uint
}

impl DefaultResizePolicy {
fn new(new_capacity: uint) -> DefaultResizePolicy {
DefaultResizePolicy {
minimum_capacity2: new_capacity << 1
}
}

#[inline]
fn capacity_range(&self, new_size: uint) -> (uint, uint) {
((new_size * 11) / 10, max(new_size << 3, self.minimum_capacity2))
}

#[inline]
fn reserve(&mut self, new_capacity: uint) {
self.minimum_capacity2 = new_capacity << 1;
}
}

// The main performance trick in this hashmap is called Robin Hood Hashing.
// It gains its excellent performance from one key invariant:
Expand All @@ -593,13 +615,13 @@ static INITIAL_LOAD_FACTOR: Fraction = (9, 10);
// high load factors with good performance. The 90% load factor I use is rather
// conservative.
//
// > Why a load factor of 90%?
// > Why a load factor of approximately 90%?
//
// In general, all the distances to initial buckets will converge on the mean.
// At a load factor of α, the odds of finding the target bucket after k
// probes is approximately 1-α^k. If we set this equal to 50% (since we converge
// on the mean) and set k=8 (64-byte cache line / 8-byte hash), α=0.92. I round
// this down to 0.90 to make the math easier on the CPU and avoid its FPU.
// this down to make the math easier on the CPU and avoid its FPU.
// Since on average we start the probing in the middle of a cache line, this
// strategy pulls in two cache lines of hashes on every lookup. I think that's
// pretty good, but if you want to trade off some space, it could go down to one
Expand All @@ -616,8 +638,6 @@ static INITIAL_LOAD_FACTOR: Fraction = (9, 10);
// ============================
//
// Allow the load factor to be changed dynamically and/or at initialization.
// I'm having trouble figuring out a sane API for this without exporting my
// hackish fraction type, while still avoiding floating point.
//
// Also, would it be possible for us to reuse storage when growing the
// underlying table? This is exactly the use case for 'realloc', and may
Expand Down Expand Up @@ -715,31 +735,13 @@ pub struct HashMap<K, V, H = sip::SipHasher> {
// All hashes are keyed on these values, to prevent hash collision attacks.
hasher: H,

// When size == grow_at, we double the capacity.
grow_at: uint,

// The capacity must never drop below this.
minimum_capacity: uint,

table: table::RawTable<K, V>,

// We keep this at the end since it's 4-bytes, unlike everything else
// in this struct. Might as well save a word of padding!
load_factor: Fraction,
}

/// Get the number of elements which will force the capacity to grow.
fn grow_at(capacity: uint, load_factor: Fraction) -> uint {
fraction_mul(capacity, load_factor)
// We keep this at the end since it might as well have tail padding.
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

it's not the best justification

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

My original struct ordering "optimizations" may not have been valid for rust. I believe (though would love for someone more knowledgeable to chime in here) that rust reserves the right to reorder structs, so this optimization will be done by the compiler.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@cgaebel, you are right, padding doesn't exactly work like you expected.

An RFC about struct reordering has led to padding measurements and in turn resulted in this PR since HashMap had 11 bytes of padding.

Also, this RFC was recently accepted, it's going to be implemented in #14479.

resize_policy: DefaultResizePolicy,
}

impl<K: Eq + Hash<S>, V, S, H: Hasher<S>> HashMap<K, V, H> {
/// Get the number of elements which will force the capacity to shrink.
/// When size == self.shrink_at(), we halve the capacity.
fn shrink_at(&self) -> uint {
self.table.capacity() >> 2
}

// Probe the `idx`th bucket for a given hash, returning the index of the
// target bucket.
//
Expand Down Expand Up @@ -931,9 +933,12 @@ impl<K: Eq + Hash<S>, V, S, H: Hasher<S>> Container for HashMap<K, V, H> {
}

impl<K: Eq + Hash<S>, V, S, H: Hasher<S>> Mutable for HashMap<K, V, H> {
/// Clear the map, removing all key-value pairs.
/// Clear the map, removing all key-value pairs. Keeps the allocated memory
/// for reuse.
fn clear(&mut self) {
self.minimum_capacity = self.table.size();
// Prevent reallocations from happening from now on. Makes it possible
// for the map to be reused but has a downside: reserves permanently.
self.resize_policy.reserve(self.table.size());

for i in range(0, self.table.capacity()) {
match self.table.peek(i) {
Expand All @@ -944,7 +949,6 @@ impl<K: Eq + Hash<S>, V, S, H: Hasher<S>> Mutable for HashMap<K, V, H> {
}
}


impl<K: Eq + Hash<S>, V, S, H: Hasher<S>> Map<K, V> for HashMap<K, V, H> {
fn find<'a>(&'a self, k: &K) -> Option<&'a V> {
self.search(k).map(|idx| {
Expand Down Expand Up @@ -1057,11 +1061,9 @@ impl<K: Eq + Hash<S>, V, S, H: Hasher<S>> HashMap<K, V, H> {
pub fn with_capacity_and_hasher(capacity: uint, hasher: H) -> HashMap<K, V, H> {
let cap = num::next_power_of_two(max(INITIAL_CAPACITY, capacity));
HashMap {
hasher: hasher,
load_factor: INITIAL_LOAD_FACTOR,
grow_at: grow_at(cap, INITIAL_LOAD_FACTOR),
minimum_capacity: cap,
table: table::RawTable::new(cap),
hasher: hasher,
resize_policy: DefaultResizePolicy::new(cap),
table: table::RawTable::new(cap),
}
}

Expand All @@ -1075,7 +1077,7 @@ impl<K: Eq + Hash<S>, V, S, H: Hasher<S>> HashMap<K, V, H> {
let cap = num::next_power_of_two(
max(INITIAL_CAPACITY, new_minimum_capacity));

self.minimum_capacity = cap;
self.resize_policy.reserve(cap);

if self.table.capacity() < cap {
self.resize(cap);
Expand All @@ -1090,8 +1092,6 @@ impl<K: Eq + Hash<S>, V, S, H: Hasher<S>> HashMap<K, V, H> {
assert!(self.table.size() <= new_capacity);
assert!(num::is_power_of_two(new_capacity));

self.grow_at = grow_at(new_capacity, self.load_factor);

let old_table = replace(&mut self.table, table::RawTable::new(new_capacity));
let old_size = old_table.size();

Expand All @@ -1105,19 +1105,18 @@ impl<K: Eq + Hash<S>, V, S, H: Hasher<S>> HashMap<K, V, H> {
/// Performs any necessary resize operations, such that there's space for
/// new_size elements.
fn make_some_room(&mut self, new_size: uint) {
let should_shrink = new_size <= self.shrink_at();
let should_grow = self.grow_at <= new_size;
let (grow_at, shrink_at) = self.resize_policy.capacity_range(new_size);
let cap = self.table.capacity();

if should_grow {
let new_capacity = self.table.capacity() << 1;
self.resize(new_capacity);
} else if should_shrink {
let new_capacity = self.table.capacity() >> 1;
// An invalid value shouldn't make us run out of space.
debug_assert!(grow_at >= new_size);

// Never shrink below the minimum capacity
if self.minimum_capacity <= new_capacity {
self.resize(new_capacity);
}
if cap <= grow_at {
let new_capacity = cap << 1;
self.resize(new_capacity);
} else if shrink_at <= cap {
let new_capacity = cap >> 1;
self.resize(new_capacity);
}
}

Expand Down Expand Up @@ -2025,8 +2024,8 @@ mod test_map {
assert!(m.is_empty());

let mut i = 0u;
let old_resize_at = m.grow_at;
while old_resize_at == m.grow_at {
let old_cap = m.table.capacity();
while old_cap == m.table.capacity() {
m.insert(i, i);
i += 1;
}
Expand All @@ -2035,6 +2034,52 @@ mod test_map {
assert!(!m.is_empty());
}

#[test]
fn test_resize_policy() {
let mut m = HashMap::new();

assert_eq!(m.len(), 0);
assert!(m.is_empty());

let initial_cap = m.table.capacity();
m.reserve(initial_cap * 2);
let cap = m.table.capacity();

assert_eq!(cap, initial_cap * 2);

let mut i = 0u;
for _ in range(0, cap * 3 / 4) {
m.insert(i, i);
i += 1;
}

assert_eq!(m.len(), i);
assert_eq!(m.table.capacity(), cap);

for _ in range(0, cap / 4) {
m.insert(i, i);
i += 1;
}

let new_cap = m.table.capacity();
assert_eq!(new_cap, cap * 2);

for _ in range(0, cap / 2) {
i -= 1;
m.remove(&i);
assert_eq!(m.table.capacity(), new_cap);
}

for _ in range(0, cap / 2 - 1) {
i -= 1;
m.remove(&i);
}

assert_eq!(m.table.capacity(), cap);
assert_eq!(m.len(), i);
assert!(!m.is_empty());
}

#[test]
fn test_find_equiv() {
let mut m = HashMap::new();
Expand Down