growable_bloom_filter/lib.rs
1#![deny(unsafe_code)]
2#![cfg_attr(feature = "nightly", feature(test))]
3///! Impl of Scalable Bloom Filters
4///! http://citeseerx.ist.psu.edu/viewdoc/download?doi=10.1.1.62.7953&rep=rep1&type=pdf
5
6#[cfg(feature = "nightly")]
7extern crate test;
8
9use serde_derive::{Deserialize, Serialize};
10use std::{
11 hash::{Hash, Hasher},
12 iter::Iterator,
13 num::NonZeroU64,
14};
15
16mod stable_hasher;
17
18/// Base Bloom Filter
19#[derive(Deserialize, Serialize, PartialEq, Clone, Debug)]
20struct Bloom {
21 /// The actual bit field. Set to 0 with `Bloom::new`.
22 #[serde(rename = "b", with = "serde_bytes")]
23 buffer: Box<[u8]>,
24 /// The number of slices in the partitioned bloom filter.
25 /// Equivalent to the number of hash function in the classic bloom filter.
26 /// An insertion will result in a bit being set in each slice.
27 #[serde(rename = "k")]
28 num_slices: NonZeroU64,
29}
30
31impl Bloom {
32 /// Create a new Bloom filter (specifically, a Partitioned Bloom filter)
33 ///
34 /// # Arguments
35 ///
36 /// * `capacity` - target capacity. Panics if `capacity` is zero.
37 /// * `error_ratio` - false positive ratio (0..1.0).
38 fn new(capacity: usize, error_ratio: f64) -> Bloom {
39 // Directly from paper:
40 // k = log2(1/P) (num_slices)
41 // n ≈ −m ln(1−p) (slice_len_bits)
42 // M = k * m (total_bits)
43 // for optimal filter p = 0.5, which gives:
44 // n ≈ −m ln(0.5), rearranging: m = -n / ln(0.5) = n / ln(2)
45 debug_assert!(capacity >= 1);
46 debug_assert!(0.0 < error_ratio && error_ratio < 1.0);
47 // We're using ceil instead of round in order to get an error rate <= the desired.
48 // Using round can result in significantly higher error rates.
49 let num_slices = ((1.0 / error_ratio).log2()).ceil() as u64;
50 let slice_len_bits = (capacity as f64 / 2f64.ln()).ceil() as u64;
51 let total_bits = num_slices * slice_len_bits;
52 // round up to the next byte
53 let buffer_bytes = ((total_bits + 7) / 8) as usize;
54
55 let mut buffer = Vec::with_capacity(buffer_bytes);
56 buffer.resize(buffer_bytes, 0);
57 Bloom {
58 buffer: buffer.into_boxed_slice(),
59 num_slices: NonZeroU64::new(num_slices).unwrap(),
60 }
61 }
62
63 /// Create an index iterator for a given item.
64 ///
65 /// This creates an iterator of pairs `(byte, mask)` indices in the buffer.
66 /// The iterator will return one pair of indexes for each slice in the bloom filter.
67 ///
68 /// The pairs `(byte idx, byte mask)` are:
69 /// byte idx: byte idx in `self.buffer` to be extract for usage with the mask
70 /// byte mask: bit mask with a single bit set, can be ANDed (`&`) with
71 /// self.buffer[idx] to yield a number != 0 if the specified bit was set.
72 /// The mask can also be ORed (`|`) with the self.buffer[idx]
73 /// to set the corresponding bit.
74 ///
75 /// # Arguments
76 ///
77 /// * `item` - The item to hash.
78 #[inline]
79 fn index_iterator(&self, mut h1: u64, mut h2: u64) -> impl Iterator<Item = (usize, u8)> {
80 // The _bit_ length (thus buffer.len() multiplied by 8) of each slice within buffer.
81 // We'll use a NonZero type so that the compiler can avoid checking for
82 // division/modulus by 0 inside the iterator.
83 let slice_len = NonZeroU64::new(self.buffer.len() as u64 * 8 / self.num_slices).unwrap();
84
85 // Generate `self.num_slices` hashes from 2 hashes, using enhanced double hashing.
86 // See https://en.wikipedia.org/wiki/Double_hashing#Enhanced_double_hashing for details.
87 // We choose to use 2x64 bit hashes instead of 2x32 ones as it gives significant better false positive ratios.
88 debug_assert_ne!(h2, 0, "Second hash can't be 0 for double hashing");
89 (0..self.num_slices.get()).map(move |i| {
90 // Calculate hash(i)
91 let hi = h1 % slice_len + i * slice_len.get();
92 // Advance enhanced double hashing state
93 h1 = h1.wrapping_add(h2);
94 h2 = h2.wrapping_add(i);
95 // Resulting index/mask based on hash(i)
96 let idx = (hi / 8) as usize;
97 let mask = 1u8 << (hi % 8);
98 (idx, mask)
99 })
100 }
101
102 /// Insert an item identified by two hashes is in the Bloom.
103 /// # Arguments
104 ///
105 /// * `h1` - The main hash
106 /// * `h2` - The second hash (must be != 0)
107 ///
108 /// # Example
109 ///
110 ///
111 /// use growable_bloom_filter::Bloom;
112 /// let bloom = Bloom::new(2, 128);
113 ///
114 /// let (h1, h2) = double_hashing_hashes("my-item");
115 /// bloom.insert(h1, h2);
116 ///
117 #[inline]
118 fn insert(&mut self, h1: u64, h2: u64) {
119 // Set all bits (one per slice) corresponding to this item.
120 //
121 // Setting the bit:
122 // 1000 0011 (self.buffer[idx])
123 // 0001 0000 (mask)
124 // |---------
125 // 1001 0011
126 //
127 for (byte, mask) in self.index_iterator(h1, h2) {
128 self.buffer[byte] |= mask;
129 }
130 }
131
132 /// Test if item identified by two hashes is in the Bloom.
133 ///
134 /// # Arguments
135 ///
136 /// * `h1` - The main hash
137 /// * `h2` - The second hash (must be != 0)
138 ///
139 /// # Example
140 ///
141 /// let bloom = Bloom:new(2, 128);
142 ///
143 /// let (h1, h2) = double_hashing_hashes("my-item");
144 /// bloom.insert(h1, h2);
145 ///
146 /// assert!(bloom.contains(h1, h2));
147 ///
148 #[inline]
149 fn contains(&self, h1: u64, h2: u64) -> bool {
150 // Check if all bits (one per slice) corresponding to this item are set.
151 // See index_iterator comments for a detailed explanation.
152 //
153 // Potentially found case:
154 // 0111 1111 (self.buffer[idx])
155 // 0001 0000 (mask)
156 // &---------
157 // 0001 0000 != 0
158 //
159 // Definitely not found case:
160 // 1110 1111 (self.buffer[idx])
161 // 0001 0000 (mask)
162 // &---------
163 // 0000 0000 == 0
164 //
165 self.index_iterator(h1, h2)
166 .all(|(byte, mask)| self.buffer[byte] & mask != 0)
167 }
168}
169
170/// Return 2 hashes for `item` that can be used as h1 and h2 fordouble hashing.
171/// See https://en.wikipedia.org/wiki/Double_hashing#Enhanced_double_hashing for details.
172#[inline]
173fn double_hashing_hashes<T: Hash>(item: T) -> (u64, u64) {
174 let mut hasher = stable_hasher::StableHasher::new();
175 item.hash(&mut hasher);
176 let h1 = hasher.finish();
177
178 // Write a nul byte to the existing state and get another hash.
179 // This is appropriate when using a very high quality hasher,
180 // which we know is the case.
181 0u8.hash(&mut hasher);
182 // h2 hash shouldn't be 0 for double hashing
183 let h2 = hasher.finish().max(1);
184
185 (h1, h2)
186}
187
188// From the paper:
189// Considering the choice of s (GROWTH_FACTOR) = 2 for small expected growth and s = 4
190// for larger growth, one can see that r (TIGHTENING_RATIO) around 0.8 – 0.9 is a sensible choice.
191// Here we select good defaults for 10~1000x growth.
192const DEFAULT_GROWTH_FACTOR: usize = 2;
193const DEFAULT_TIGHTENING_RATIO: f64 = 0.8515625; // ~0.85 but has exact representation in f32/f64
194
195const fn default_growth_factor() -> usize {
196 DEFAULT_GROWTH_FACTOR
197}
198
199const fn default_tightening_ratio() -> f64 {
200 DEFAULT_TIGHTENING_RATIO
201}
202
203/// A Growable Bloom Filter
204///
205/// # Overview
206///
207/// Implementation of [Scalable Bloom Filters](http://citeseerx.ist.psu.edu/viewdoc/download?doi=10.1.1.62.7953&rep=rep1&type=pdf)
208/// which also provides serde serialization and deserialize.
209///
210/// A bloom filter lets you `insert` items, and then test association with `contains`.
211/// It's space and time efficient, at the cost of false positives.
212/// In particular, if `contains` returns `true`, it may be in filter.
213/// But if `contains` returns false, it's definitely not in the bloom filter.
214///
215/// You can control the failure rate by setting `desired_error_prob` and `est_insertions` appropriately.
216///
217/// # Applications
218///
219/// Bloom filters are typically used as a pre-cache to avoid expensive operations.
220/// For example, if you need to ask ten thousand servers if they have data XYZ,
221/// you could use GrowableBloom to figure out which ones do NOT have XYZ.
222///
223/// # Example
224///
225/// ```rust
226/// use growable_bloom_filter::GrowableBloom;
227///
228/// // Create and insert into the bloom filter
229/// let mut gbloom = GrowableBloom::new(0.05, 1000);
230/// gbloom.insert(&0);
231/// assert!(gbloom.contains(&0));
232///
233/// // Serialize and Deserialize the bloom filter
234/// use serde_json;
235///
236/// let s = serde_json::to_string(&gbloom).unwrap();
237/// let des_gbloom: GrowableBloom = serde_json::from_str(&s).unwrap();
238/// assert!(des_gbloom.contains(&0));
239///
240/// // Builder API
241/// use growable_bloom_filter::GrowableBloomBuilder;
242/// let mut gbloom = GrowableBloomBuilder::new()
243/// .estimated_insertions(100)
244/// .desired_error_ratio(0.05)
245/// .build();
246/// gbloom.insert(&0);
247/// assert!(gbloom.contains(&0));
248/// ```
249#[derive(Deserialize, Serialize, PartialEq, Clone, Debug)]
250pub struct GrowableBloom {
251 /// The constituent bloom filters
252 #[serde(rename = "b")]
253 blooms: Vec<Bloom>,
254 #[serde(rename = "e")]
255 desired_error_prob: f64,
256 #[serde(rename = "t")]
257 est_insertions: usize,
258 /// Number of items successfully inserted
259 #[serde(rename = "i")]
260 inserts: usize,
261 /// Item capacity
262 #[serde(rename = "c")]
263 capacity: usize,
264 /// Growth factor
265 #[serde(rename = "g", default = "default_growth_factor")]
266 growth_factor: usize,
267 #[serde(rename = "r", default = "default_tightening_ratio")]
268 tightening_ratio: f64,
269}
270
271impl GrowableBloom {
272 /// Create a new GrowableBloom filter.
273 ///
274 /// # Arguments
275 ///
276 /// * `desired_error_prob` - The desired error probability (eg. 0.05, 0.01)
277 /// * `est_insertions` - The estimated number of insertions (eg. 100, 1000).
278 ///
279 /// Note: You really don't need to be accurate with est_insertions.
280 /// Power of 10 granularity should be fine (~1000 is decent).
281 ///
282 /// # Example
283 ///
284 /// ```rust
285 /// // 5% failure rate, estimated 100 elements to insert
286 /// use growable_bloom_filter::GrowableBloom;
287 /// let mut gbloom = GrowableBloom::new(0.05, 100);
288 /// ```
289 ///
290 /// # Panics
291 ///
292 /// * Panics if desired_error_prob is less then 0 or greater than 1.
293 /// * Panics if capacity is zero. If you're unsure, set it to 1000.
294 ///
295 /// # Builder API
296 /// An alternative way to construct a GrowableBloom.
297 ///
298 /// See [`GrowableBloomBuilder`] for documentation. It allows you to specify
299 /// other constants to control bloom filter behaviour.
300 ///
301 /// ```rust
302 /// use growable_bloom_filter::GrowableBloomBuilder;
303 /// let mut gbloom = GrowableBloomBuilder::new()
304 /// .estimated_insertions(100)
305 /// .desired_error_ratio(0.05)
306 /// .build();
307 /// ```
308 #[inline]
309 pub fn new(desired_error_prob: f64, est_insertions: usize) -> GrowableBloom {
310 Self::new_with_internals(
311 desired_error_prob,
312 est_insertions,
313 DEFAULT_GROWTH_FACTOR,
314 DEFAULT_TIGHTENING_RATIO,
315 )
316 }
317
318 pub(crate) fn new_with_internals(
319 desired_error_prob: f64,
320 est_insertions: usize,
321 growth_factor: usize,
322 tightening_ratio: f64,
323 ) -> GrowableBloom {
324 assert!(0.0 < desired_error_prob && desired_error_prob < 1.0);
325 assert!(growth_factor > 1);
326 GrowableBloom {
327 blooms: vec![],
328 desired_error_prob,
329 est_insertions,
330 inserts: 0,
331 capacity: 0,
332 growth_factor,
333 tightening_ratio,
334 }
335 }
336
337 /// Test if `item` in the Bloom filter.
338 ///
339 /// If `true` is returned, it _may_ be in the filter.
340 /// If `false` is returned, it's NOT in the filter.
341 ///
342 /// # Arguments
343 ///
344 /// * `item` - The item to test
345 ///
346 /// # Example
347 ///
348 /// ```rust
349 /// use growable_bloom_filter::GrowableBloom;
350 /// let mut bloom = GrowableBloom::new(0.05, 10);
351 /// let item = 0;
352 ///
353 /// bloom.insert(&item);
354 /// assert!(bloom.contains(&item));
355 /// ```
356 pub fn contains<T: Hash>(&self, item: T) -> bool {
357 let (h1, h2) = double_hashing_hashes(item);
358 self.blooms.iter().any(|bloom| bloom.contains(h1, h2))
359 }
360
361 /// Insert `item` into the filter.
362 ///
363 /// This may resize the GrowableBloom.
364 ///
365 /// # Arguments
366 ///
367 /// * `item` - The item to insert
368 ///
369 /// # Example
370 ///
371 /// ```rust
372 /// use growable_bloom_filter::GrowableBloom;
373 /// let mut bloom = GrowableBloom::new(0.05, 10);
374 /// let item = 0;
375 ///
376 /// bloom.insert(&item);
377 /// bloom.insert(&-1);
378 /// bloom.insert(&vec![1, 2, 3]);
379 /// bloom.insert("hello");
380 /// ```
381 pub fn insert<T: Hash>(&mut self, item: T) -> bool {
382 let (h1, h2) = double_hashing_hashes(item);
383 // Step 1: Ask if we already have it
384 if self.blooms.iter().any(|bloom| bloom.contains(h1, h2)) {
385 return false;
386 }
387 // Step 2: Grow if necessary
388 if self.inserts >= self.capacity {
389 self.grow();
390 }
391 // Step 3: Insert it into the last
392 self.inserts += 1;
393 let curr_bloom = self.blooms.last_mut().unwrap();
394 curr_bloom.insert(h1, h2);
395 true
396 }
397
398 /// Clear the bloom filter.
399 ///
400 /// This does not resize the filter.
401 ///
402 /// # Example
403 ///
404 /// ```rust
405 /// use growable_bloom_filter::GrowableBloom;
406 /// let mut bloom = GrowableBloom::new(0.05, 10);
407 /// let item = 0;
408 ///
409 /// bloom.insert(&item);
410 /// assert!(bloom.contains(&item));
411 /// bloom.clear();
412 /// assert!(!bloom.contains(&item)); // No longer contains item
413 /// ```
414 pub fn clear(&mut self) {
415 self.blooms.clear();
416 self.inserts = 0;
417 self.capacity = 0;
418 }
419
420 /// Whether this bloom filter contain any items.
421 #[inline]
422 pub fn is_empty(&self) -> bool {
423 self.inserts == 0
424 }
425
426 /// The current estimated number of elements added to the filter.
427 /// This is an estimation, so it may or may not increase after
428 /// an insertion in the filter.
429 ///
430 /// # Example
431 ///
432 /// ```rust
433 /// use growable_bloom_filter::GrowableBloom;
434 /// let mut bloom = GrowableBloom::new(0.05, 10);
435 ///
436 /// bloom.insert(0);
437 /// assert_eq!(bloom.len(), 1);
438 /// ```
439 #[inline]
440 pub fn len(&self) -> usize {
441 self.inserts
442 }
443
444 /// The current estimated capacity of the filter.
445 /// A filter starts with a small capacity and will expand to accommodate more items.
446 /// The actual ratio of increase depends on the values used to construct the bloom filter.
447 ///
448 /// Note: An empty filter has capacity zero as we haven't calculated
449 /// the necessary bloom filter size. Subsequent inserts will result
450 /// in the capacity updating.
451 ///
452 /// # Example
453 ///
454 /// ```rust
455 /// use growable_bloom_filter::GrowableBloom;
456 /// let mut bloom = GrowableBloom::new(0.05, 10);
457 ///
458 /// assert_eq!(bloom.capacity(), 0);
459 ///
460 /// bloom.insert(0);
461 /// // After an insert, our capacity is no longer zero.
462 /// assert_ne!(bloom.capacity(), 0);
463 /// ```
464 #[inline]
465 pub fn capacity(&self) -> usize {
466 self.capacity
467 }
468
469 /// Record if `item` already exists in the filter, and insert it if it doesn't already exist.
470 ///
471 /// Returns `true` if the item already existed in the filter.
472 ///
473 /// Note: This isn't faster than just inserting.
474 ///
475 /// # Example
476 ///
477 /// ```rust
478 /// use growable_bloom_filter::GrowableBloom;
479 /// let mut bloom = GrowableBloom::new(0.05, 10);
480 /// let item = 0;
481 ///
482 /// let existed_before = bloom.check_and_set(&item);
483 /// assert!(existed_before == false);
484 ///
485 /// let existed_before = bloom.check_and_set(&item);
486 /// assert!(existed_before == true);
487 /// ```
488 pub fn check_and_set<T: Hash>(&mut self, item: T) -> bool {
489 !self.insert(item)
490 }
491
492 /// Grow the GrowableBloom
493 fn grow(&mut self) {
494 // The paper gives an upper bound formula for the fp rate: fpUB <= fp0 * / (1-r)
495 // This is because each sub bloom filter is created with an ever smaller
496 // false-positive ratio, forming a geometric progression.
497 // let r = TIGHTENING_RATIO
498 // fpUB ~= fp0 * fp0*r * fp0*r*r * fp0*r*r*r ...
499 // fp(x) = fp0 * (r**x)
500 let error_ratio =
501 self.desired_error_prob * self.tightening_ratio.powi(self.blooms.len() as _);
502 // In order to have relatively small space overhead compared to a single appropriately sized bloom filter
503 // the sub filters should be created with increasingly bigger sizes.
504 // let s = GROWTH_FACTOR
505 // cap(x) = cap0 * (s**x)
506 let capacity = self.est_insertions * self.growth_factor.pow(self.blooms.len() as _);
507 let new_bloom = Bloom::new(capacity, error_ratio);
508 self.blooms.push(new_bloom);
509 self.capacity += capacity;
510 }
511}
512
513/// Builder API for GrowableBloom.
514///
515/// ```rust
516/// use growable_bloom_filter::GrowableBloomBuilder;
517/// let mut gbloom = GrowableBloomBuilder::new()
518/// .estimated_insertions(100)
519/// .desired_error_ratio(0.05)
520/// .build();
521/// ```
522pub struct GrowableBloomBuilder {
523 desired_error_ratio: f64,
524 est_insertions: usize,
525 growth_factor: usize,
526 tightening_ratio: f64,
527}
528
529impl GrowableBloomBuilder {
530 /// Create a new GrowableBloomBuilder.
531 ///
532 /// Builder API for GrowableBloom.
533 ///
534 /// ```rust
535 /// use growable_bloom_filter::GrowableBloomBuilder;
536 /// let mut gbloom = GrowableBloomBuilder::new()
537 /// .estimated_insertions(1000)
538 /// .desired_error_ratio(0.01)
539 /// .growth_factor(2)
540 /// .tightening_ratio(0.85)
541 /// .build();
542 /// gbloom.insert("hello world");
543 /// assert!(gbloom.contains(&"hello world"));
544 /// ```
545 pub fn new() -> Self {
546 Self {
547 est_insertions: 1000,
548 desired_error_ratio: 0.01,
549 growth_factor: DEFAULT_GROWTH_FACTOR,
550 tightening_ratio: DEFAULT_TIGHTENING_RATIO,
551 }
552 }
553
554 /// Estimated number of insertions. A power of ten accuracy is good enough.
555 ///
556 /// # Panics
557 ///
558 /// This will panic in debug mode if count is zero.
559 pub fn estimated_insertions(self, count: usize) -> Self {
560 Self {
561 est_insertions: count,
562 ..self
563 }
564 }
565
566 /// Desired error ratio (i.e. false positive rate).
567 ///
568 /// Smaller error ratios will use more memory and might be a bit slower.
569 ///
570 /// # Panics
571 ///
572 /// This will panic if the error ratio is outside of (0, 1.0).
573 pub fn desired_error_ratio(self, ratio: f64) -> Self {
574 Self {
575 desired_error_ratio: ratio,
576 ..self
577 }
578 }
579
580 /// Base for the exponential growth factor.
581 ///
582 /// As more items are inserted into a GrowableBloom this growth_factor
583 /// number is used to exponentially grow the capacity of newly added
584 /// internal bloom filters. So this number is raised to some exponent proportional
585 /// to the number of bloom filters held internally.
586 ///
587 /// Basically it'll control how quickly the bloom filter grows in capacity.
588 /// By default it's set to two.
589 pub fn growth_factor(self, factor: usize) -> Self {
590 Self {
591 growth_factor: factor,
592 ..self
593 }
594 }
595
596 /// Control the downwards adjustment on the error ratio when growing.
597 ///
598 /// When GrowableBloom adds a new internal bloom filter it uses
599 /// the tightening_ratio to adjust the desired_error_ratio on these
600 /// new, larger internal bloom filters. This is necessary to achieve decent
601 /// accuracy on the user's desired error_ratio while using larger and larger
602 /// bloom filters internally.
603 ///
604 /// By default this library sets it to ~0.85, but for smaller growth factors
605 /// any number around 0.8 - 0.9 should be fine.
606 pub fn tightening_ratio(self, ratio: f64) -> Self {
607 assert!(0.0 < ratio && ratio < 1.0);
608 Self {
609 tightening_ratio: ratio,
610 ..self
611 }
612 }
613
614 /// Consume the builder to create a GrowableBloom.
615 ///
616 /// # Panics
617 ///
618 /// This will panic if an invalid value is specified.
619 pub fn build(self) -> GrowableBloom {
620 GrowableBloom::new_with_internals(
621 self.desired_error_ratio,
622 self.est_insertions,
623 self.growth_factor,
624 self.tightening_ratio,
625 )
626 }
627}
628
629#[cfg(test)]
630mod growable_bloom_tests {
631 mod test_bloom {
632 use crate::{double_hashing_hashes, Bloom};
633
634 #[test]
635 fn can_insert_bloom() {
636 let mut b = Bloom::new(100, 0.01);
637 let (h1, h2) = double_hashing_hashes(123);
638 b.insert(h1, h2);
639 assert!(b.contains(h1, h2))
640 }
641
642 #[test]
643 fn can_insert_string_bloom() {
644 let mut b = Bloom::new(100, 0.01);
645 let (h1, h2) = double_hashing_hashes("hello world".to_string());
646 b.insert(h1, h2);
647 assert!(b.contains(h1, h2))
648 }
649
650 #[test]
651 fn does_not_contain() {
652 let mut b = Bloom::new(100, 0.01);
653 let upper = 100;
654 for i in (0..upper).step_by(2) {
655 let (h1, h2) = double_hashing_hashes(i);
656 b.insert(h1, h2);
657 assert!(b.contains(h1, h2))
658 }
659 for i in (1..upper).step_by(2) {
660 let (h1, h2) = double_hashing_hashes(i);
661 assert!(!b.contains(h1, h2))
662 }
663 }
664 #[test]
665 fn can_insert_lots() {
666 let mut b = Bloom::new(100, 0.01);
667 for i in 0..1024 {
668 let (h1, h2) = double_hashing_hashes(i);
669 b.insert(h1, h2);
670 assert!(b.contains(h1, h2))
671 }
672 }
673 #[test]
674 fn test_refs() {
675 let item = String::from("Hello World");
676 let mut b = Bloom::new(100, 0.01);
677 let (h1, h2) = double_hashing_hashes(&item);
678 b.insert(h1, h2);
679 assert!(b.contains(h1, h2))
680 }
681 }
682
683 mod test_growable {
684 use crate::{GrowableBloom, DEFAULT_TIGHTENING_RATIO};
685 use serde_json;
686
687 #[test]
688 fn can_insert() {
689 let mut b = GrowableBloom::new(0.05, 1000);
690 let item = 20;
691 b.insert(&item);
692 assert!(b.contains(&item))
693 }
694
695 #[test]
696 fn len_capacity_clear() {
697 let mut b = GrowableBloom::new(0.05, 100);
698 assert_eq!(b.len(), 0);
699 assert_eq!(b.capacity(), 0);
700
701 let item = 20;
702 b.insert(&item);
703 assert_ne!(b.len(), 0);
704 assert_ne!(b.capacity(), 0);
705
706 b.clear();
707 assert_eq!(b.len(), 0);
708 assert_eq!(b.capacity(), 0);
709 }
710
711 #[test]
712 fn ensure_capacity() {
713 let mut b = GrowableBloom::new(0.05, 1);
714 assert_eq!(b.capacity(), 0);
715 b.insert("abc");
716 assert_eq!(b.capacity(), 1);
717 for i in 0..100 {
718 b.insert(i);
719 }
720 assert_eq!(b.capacity(), 127);
721 }
722
723 #[test]
724 fn can_insert_string() {
725 let mut b = GrowableBloom::new(0.05, 1000);
726 let item: String = "hello world".to_owned();
727 b.insert(&item);
728 assert!(b.contains(&item))
729 }
730
731 #[test]
732 fn does_not_contain() {
733 let mut b = GrowableBloom::new(0.05, 1000);
734 assert_eq!(b.contains(&"hello"), false);
735 b.insert(&0);
736 assert_eq!(b.contains(&"hello"), false);
737 b.insert(&1);
738 assert_eq!(b.contains(&"hello"), false);
739 b.insert(&2);
740 assert_eq!(b.contains(&"hello"), false);
741 }
742
743 #[test]
744 fn can_insert_a_lot_of_elements() {
745 let mut b = GrowableBloom::new(0.05, 1000);
746 for i in 0..1000 {
747 b.insert(&i);
748 assert!(b.contains(&i));
749 }
750 }
751
752 #[test]
753 fn can_serialize_deserialize() {
754 let mut b = GrowableBloom::new(0.05, 1000);
755 b.insert(&0);
756 let s = serde_json::to_string(&b).unwrap();
757 let b_s: GrowableBloom = serde_json::from_str(&s).unwrap();
758 assert!(b_s.contains(&0));
759 assert_ne!(b_s.contains(&1), true);
760 assert_ne!(b_s.contains(&1000), true);
761 }
762
763 #[test]
764 fn verify_saturation() {
765 for &fp in &[0.01, 0.001] {
766 // The paper gives an upper bound formula for the fp rate: fpUB <= fp0*/(1-r)
767 let fp_ub = fp / (1.0 - DEFAULT_TIGHTENING_RATIO);
768 let initial_cap = 100u64;
769 let growth = 1000u64;
770 let mut b = GrowableBloom::new(fp, initial_cap as usize);
771 // insert 1000x more elements than initially allocated
772 for i in 1u64..=initial_cap * growth {
773 b.insert(&i);
774
775 if i % (initial_cap * growth / 10) == 0
776 || [1, 2, 5, 10, 25].iter().any(|&g| i == initial_cap * g)
777 {
778 // A lot of tests are required to get a good estimate
779 let est_fp_rate = (i + 1..).take(50_000).filter(|i| b.contains(i)).count()
780 as f64
781 / 50_000.0;
782
783 // Uncomment the following to get good output for experiments
784 // println!(
785 // "{}x cap: {}fp ({}x)",
786 // i / initial_cap,
787 // est_fp_rate,
788 // est_fp_rate / fp
789 // );
790 assert!(est_fp_rate <= fp_ub);
791 }
792 }
793 for i in 1u64..=initial_cap * growth {
794 assert!(b.contains(&i));
795 }
796 }
797 }
798
799 #[test]
800 fn test_types_saturation() {
801 let mut b = GrowableBloom::new(0.50, 100);
802 b.insert(&vec![1, 2, 3]);
803 b.insert("hello");
804 b.insert(&-1);
805 b.insert(&0);
806 }
807
808 #[test]
809 fn can_check_and_set() {
810 let mut b = GrowableBloom::new(0.05, 1000);
811 let item = 20;
812 assert!(!b.check_and_set(&item));
813 assert!(b.check_and_set(&item));
814 }
815 }
816
817 mod test_builder {
818 use crate::GrowableBloomBuilder;
819
820 #[test]
821 fn can_build_bloom() {
822 let mut gbloom = GrowableBloomBuilder::new().build();
823 gbloom.insert(3);
824 assert!(gbloom.contains(&3));
825 }
826 #[test]
827 #[should_panic]
828 fn should_panic_on_bad_error_ratio() {
829 GrowableBloomBuilder::new()
830 .estimated_insertions(1000)
831 .desired_error_ratio(99.9)
832 .build();
833 }
834 #[test]
835 #[should_panic]
836 fn should_panic_on_too_small_tightening_ratio() {
837 GrowableBloomBuilder::new().tightening_ratio(0.0).build();
838 }
839 #[test]
840 #[should_panic]
841 fn should_panic_on_too_large_tightening_ratio() {
842 GrowableBloomBuilder::new().tightening_ratio(10.0).build();
843 }
844 #[test]
845 fn can_specify_all_values() {
846 // From https://github.com/dpbriggs/growable-bloom-filters/issues/7
847 let mut gbloom = GrowableBloomBuilder::new()
848 .estimated_insertions(3)
849 .desired_error_ratio(0.00001)
850 .tightening_ratio(0.5)
851 .growth_factor(2)
852 .build();
853 for i in 0..100 {
854 gbloom.insert(i);
855 }
856 for i in 0..100 {
857 assert!(gbloom.contains(&i));
858 }
859 }
860 }
861
862 #[cfg(feature = "nightly")]
863 mod bench {
864 use crate::GrowableBloom;
865 use test::Bencher;
866 #[bench]
867 fn bench_new(b: &mut Bencher) {
868 b.iter(|| GrowableBloom::new(0.01, 1000));
869 }
870 #[bench]
871 fn bench_insert_normal_prob(b: &mut Bencher) {
872 let mut gbloom = GrowableBloom::new(0.01, 1000);
873 b.iter(|| gbloom.insert(10));
874 }
875 #[bench]
876 fn bench_insert_small_prob(b: &mut Bencher) {
877 let mut gbloom = GrowableBloom::new(0.001, 1000);
878 b.iter(|| gbloom.insert(10));
879 }
880 #[bench]
881 fn bench_many(b: &mut Bencher) {
882 let mut gbloom = GrowableBloom::new(0.01, 100000);
883 b.iter(|| gbloom.insert(10));
884 }
885 #[bench]
886 fn bench_insert_medium(b: &mut Bencher) {
887 let s: String = (0..100).map(|_| 'X').collect();
888 let mut gbloom = GrowableBloom::new(0.01, 100000);
889 b.iter(|| gbloom.insert(&s))
890 }
891 #[bench]
892 fn bench_insert_large(b: &mut Bencher) {
893 let s: String = (0..10000).map(|_| 'X').collect();
894 let mut gbloom = GrowableBloom::new(0.01, 100000);
895 b.iter(|| gbloom.insert(&s))
896 }
897 #[bench]
898 fn bench_insert_large_very_small_prob(b: &mut Bencher) {
899 let s: String = (0..10000).map(|_| 'X').collect();
900 let mut gbloom = GrowableBloom::new(0.0001, 100000);
901 b.iter(|| gbloom.insert(&s))
902 }
903 #[bench]
904 fn bench_grow(b: &mut Bencher) {
905 b.iter(|| {
906 let mut gbloom = GrowableBloom::new(0.01, 100);
907 for i in 0..1000 {
908 gbloom.insert(&i);
909 assert!(gbloom.contains(&i));
910 }
911 })
912 }
913 }
914}