growable_bloom_filter/
lib.rs

1#![deny(unsafe_code)]
2#![cfg_attr(feature = "nightly", feature(test))]
3///! Impl of Scalable Bloom Filters
4///! http://citeseerx.ist.psu.edu/viewdoc/download?doi=10.1.1.62.7953&rep=rep1&type=pdf
5
6#[cfg(feature = "nightly")]
7extern crate test;
8
9use serde_derive::{Deserialize, Serialize};
10use std::{
11    hash::{Hash, Hasher},
12    iter::Iterator,
13    num::NonZeroU64,
14};
15
16mod stable_hasher;
17
18/// Base Bloom Filter
19#[derive(Deserialize, Serialize, PartialEq, Clone, Debug)]
20struct Bloom {
21    /// The actual bit field. Set to 0 with `Bloom::new`.
22    #[serde(rename = "b", with = "serde_bytes")]
23    buffer: Box<[u8]>,
24    /// The number of slices in the partitioned bloom filter.
25    /// Equivalent to the number of hash function in the classic bloom filter.
26    /// An insertion will result in a bit being set in each slice.
27    #[serde(rename = "k")]
28    num_slices: NonZeroU64,
29}
30
31impl Bloom {
32    /// Create a new Bloom filter (specifically, a Partitioned Bloom filter)
33    ///
34    /// # Arguments
35    ///
36    /// * `capacity` - target capacity. Panics if `capacity` is zero.
37    /// * `error_ratio` - false positive ratio (0..1.0).
38    fn new(capacity: usize, error_ratio: f64) -> Bloom {
39        // Directly from paper:
40        // k = log2(1/P)   (num_slices)
41        // n ≈ −m ln(1−p)  (slice_len_bits)
42        // M = k * m       (total_bits)
43        // for optimal filter p = 0.5, which gives:
44        // n ≈ −m ln(0.5), rearranging: m = -n / ln(0.5) = n / ln(2)
45        debug_assert!(capacity >= 1);
46        debug_assert!(0.0 < error_ratio && error_ratio < 1.0);
47        // We're using ceil instead of round in order to get an error rate <= the desired.
48        // Using round can result in significantly higher error rates.
49        let num_slices = ((1.0 / error_ratio).log2()).ceil() as u64;
50        let slice_len_bits = (capacity as f64 / 2f64.ln()).ceil() as u64;
51        let total_bits = num_slices * slice_len_bits;
52        // round up to the next byte
53        let buffer_bytes = ((total_bits + 7) / 8) as usize;
54
55        let mut buffer = Vec::with_capacity(buffer_bytes);
56        buffer.resize(buffer_bytes, 0);
57        Bloom {
58            buffer: buffer.into_boxed_slice(),
59            num_slices: NonZeroU64::new(num_slices).unwrap(),
60        }
61    }
62
63    /// Create an index iterator for a given item.
64    ///
65    /// This creates an iterator of pairs `(byte, mask)` indices in the buffer.
66    /// The iterator will return one pair of indexes for each slice in the bloom filter.
67    ///
68    /// The pairs `(byte idx, byte mask)` are:
69    ///     byte idx: byte idx in `self.buffer` to be extract for usage with the mask
70    ///     byte mask: bit mask with a single bit set, can be ANDed (`&`) with
71    ///                self.buffer[idx] to yield a number != 0 if the specified bit was set.
72    ///                The mask can also be ORed (`|`) with the self.buffer[idx]
73    ///                to set the corresponding bit.
74    ///
75    /// # Arguments
76    ///
77    /// * `item` - The item to hash.
78    #[inline]
79    fn index_iterator(&self, mut h1: u64, mut h2: u64) -> impl Iterator<Item = (usize, u8)> {
80        // The _bit_ length (thus buffer.len() multiplied by 8) of each slice within buffer.
81        // We'll use a NonZero type so that the compiler can avoid checking for
82        // division/modulus by 0 inside the iterator.
83        let slice_len = NonZeroU64::new(self.buffer.len() as u64 * 8 / self.num_slices).unwrap();
84
85        // Generate `self.num_slices` hashes from 2 hashes, using enhanced double hashing.
86        // See https://en.wikipedia.org/wiki/Double_hashing#Enhanced_double_hashing for details.
87        // We choose to use 2x64 bit hashes instead of 2x32 ones as it gives significant better false positive ratios.
88        debug_assert_ne!(h2, 0, "Second hash can't be 0 for double hashing");
89        (0..self.num_slices.get()).map(move |i| {
90            // Calculate hash(i)
91            let hi = h1 % slice_len + i * slice_len.get();
92            // Advance enhanced double hashing state
93            h1 = h1.wrapping_add(h2);
94            h2 = h2.wrapping_add(i);
95            // Resulting index/mask based on hash(i)
96            let idx = (hi / 8) as usize;
97            let mask = 1u8 << (hi % 8);
98            (idx, mask)
99        })
100    }
101
102    /// Insert an item identified by two hashes is in the Bloom.
103    /// # Arguments
104    ///
105    /// * `h1` - The main hash
106    /// * `h2` - The second hash (must be != 0)
107    ///
108    /// # Example
109    ///
110    ///
111    /// use growable_bloom_filter::Bloom;
112    /// let bloom = Bloom::new(2, 128);
113    ///
114    /// let (h1, h2) = double_hashing_hashes("my-item");
115    /// bloom.insert(h1, h2);
116    ///
117    #[inline]
118    fn insert(&mut self, h1: u64, h2: u64) {
119        // Set all bits (one per slice) corresponding to this item.
120        //
121        // Setting the bit:
122        //    1000 0011 (self.buffer[idx])
123        //    0001 0000 (mask)
124        //    |---------
125        //    1001 0011
126        //
127        for (byte, mask) in self.index_iterator(h1, h2) {
128            self.buffer[byte] |= mask;
129        }
130    }
131
132    /// Test if item identified by two hashes is in the Bloom.
133    ///
134    /// # Arguments
135    ///
136    /// * `h1` - The main hash
137    /// * `h2` - The second hash (must be != 0)
138    ///
139    /// # Example
140    ///
141    /// let bloom = Bloom:new(2, 128);
142    ///
143    /// let (h1, h2) = double_hashing_hashes("my-item");
144    /// bloom.insert(h1, h2);
145    ///
146    /// assert!(bloom.contains(h1, h2));
147    ///
148    #[inline]
149    fn contains(&self, h1: u64, h2: u64) -> bool {
150        // Check if all bits (one per slice) corresponding to this item are set.
151        // See index_iterator comments for a detailed explanation.
152        //
153        // Potentially found case:
154        //    0111 1111 (self.buffer[idx])
155        //    0001 0000 (mask)
156        //    &---------
157        //    0001 0000 != 0
158        //
159        // Definitely not found case:
160        //    1110 1111 (self.buffer[idx])
161        //    0001 0000 (mask)
162        //    &---------
163        //    0000 0000 == 0
164        //
165        self.index_iterator(h1, h2)
166            .all(|(byte, mask)| self.buffer[byte] & mask != 0)
167    }
168}
169
170/// Return 2 hashes for `item` that can be used as h1 and h2 fordouble hashing.
171/// See https://en.wikipedia.org/wiki/Double_hashing#Enhanced_double_hashing for details.
172#[inline]
173fn double_hashing_hashes<T: Hash>(item: T) -> (u64, u64) {
174    let mut hasher = stable_hasher::StableHasher::new();
175    item.hash(&mut hasher);
176    let h1 = hasher.finish();
177
178    // Write a nul byte to the existing state and get another hash.
179    // This is appropriate when using a very high quality hasher,
180    // which we know is the case.
181    0u8.hash(&mut hasher);
182    // h2 hash shouldn't be 0 for double hashing
183    let h2 = hasher.finish().max(1);
184
185    (h1, h2)
186}
187
188// From the paper:
189// Considering the choice of s (GROWTH_FACTOR) = 2 for small expected growth and s = 4
190// for larger growth, one can see that r (TIGHTENING_RATIO) around 0.8 – 0.9 is a sensible choice.
191// Here we select good defaults for 10~1000x growth.
192const DEFAULT_GROWTH_FACTOR: usize = 2;
193const DEFAULT_TIGHTENING_RATIO: f64 = 0.8515625; // ~0.85 but has exact representation in f32/f64
194
195const fn default_growth_factor() -> usize {
196    DEFAULT_GROWTH_FACTOR
197}
198
199const fn default_tightening_ratio() -> f64 {
200    DEFAULT_TIGHTENING_RATIO
201}
202
203/// A Growable Bloom Filter
204///
205/// # Overview
206///
207/// Implementation of [Scalable Bloom Filters](http://citeseerx.ist.psu.edu/viewdoc/download?doi=10.1.1.62.7953&rep=rep1&type=pdf)
208/// which also provides serde serialization and deserialize.
209///
210/// A bloom filter lets you `insert` items, and then test association with `contains`.
211/// It's space and time efficient, at the cost of false positives.
212/// In particular, if `contains` returns `true`, it may be in filter.
213/// But if `contains` returns false, it's definitely not in the bloom filter.
214///
215/// You can control the failure rate by setting `desired_error_prob` and `est_insertions` appropriately.
216///
217/// # Applications
218///
219/// Bloom filters are typically used as a pre-cache to avoid expensive operations.
220/// For example, if you need to ask ten thousand servers if they have data XYZ,
221/// you could use GrowableBloom to figure out which ones do NOT have XYZ.
222///
223/// # Example
224///
225/// ```rust
226/// use growable_bloom_filter::GrowableBloom;
227///
228/// // Create and insert into the bloom filter
229/// let mut gbloom = GrowableBloom::new(0.05, 1000);
230/// gbloom.insert(&0);
231/// assert!(gbloom.contains(&0));
232///
233/// // Serialize and Deserialize the bloom filter
234/// use serde_json;
235///
236/// let s = serde_json::to_string(&gbloom).unwrap();
237/// let des_gbloom: GrowableBloom = serde_json::from_str(&s).unwrap();
238/// assert!(des_gbloom.contains(&0));
239///
240/// // Builder API
241/// use growable_bloom_filter::GrowableBloomBuilder;
242/// let mut gbloom = GrowableBloomBuilder::new()
243///     .estimated_insertions(100)
244///     .desired_error_ratio(0.05)
245///     .build();
246/// gbloom.insert(&0);
247/// assert!(gbloom.contains(&0));
248/// ```
249#[derive(Deserialize, Serialize, PartialEq, Clone, Debug)]
250pub struct GrowableBloom {
251    /// The constituent bloom filters
252    #[serde(rename = "b")]
253    blooms: Vec<Bloom>,
254    #[serde(rename = "e")]
255    desired_error_prob: f64,
256    #[serde(rename = "t")]
257    est_insertions: usize,
258    /// Number of items successfully inserted
259    #[serde(rename = "i")]
260    inserts: usize,
261    /// Item capacity
262    #[serde(rename = "c")]
263    capacity: usize,
264    /// Growth factor
265    #[serde(rename = "g", default = "default_growth_factor")]
266    growth_factor: usize,
267    #[serde(rename = "r", default = "default_tightening_ratio")]
268    tightening_ratio: f64,
269}
270
271impl GrowableBloom {
272    /// Create a new GrowableBloom filter.
273    ///
274    /// # Arguments
275    ///
276    /// * `desired_error_prob` - The desired error probability (eg. 0.05, 0.01)
277    /// * `est_insertions` - The estimated number of insertions (eg. 100, 1000).
278    ///
279    /// Note: You really don't need to be accurate with est_insertions.
280    ///       Power of 10 granularity should be fine (~1000 is decent).
281    ///
282    /// # Example
283    ///
284    /// ```rust
285    /// // 5% failure rate, estimated 100 elements to insert
286    /// use growable_bloom_filter::GrowableBloom;
287    /// let mut gbloom = GrowableBloom::new(0.05, 100);
288    /// ```
289    ///
290    /// # Panics
291    ///
292    /// * Panics if desired_error_prob is less then 0 or greater than 1.
293    /// * Panics if capacity is zero. If you're unsure, set it to 1000.
294    ///
295    /// # Builder API
296    /// An alternative way to construct a GrowableBloom.
297    ///
298    /// See [`GrowableBloomBuilder`] for documentation. It allows you to specify
299    /// other constants to control bloom filter behaviour.
300    ///
301    /// ```rust
302    /// use growable_bloom_filter::GrowableBloomBuilder;
303    /// let mut gbloom = GrowableBloomBuilder::new()
304    ///     .estimated_insertions(100)
305    ///     .desired_error_ratio(0.05)
306    ///     .build();
307    /// ```
308    #[inline]
309    pub fn new(desired_error_prob: f64, est_insertions: usize) -> GrowableBloom {
310        Self::new_with_internals(
311            desired_error_prob,
312            est_insertions,
313            DEFAULT_GROWTH_FACTOR,
314            DEFAULT_TIGHTENING_RATIO,
315        )
316    }
317
318    pub(crate) fn new_with_internals(
319        desired_error_prob: f64,
320        est_insertions: usize,
321        growth_factor: usize,
322        tightening_ratio: f64,
323    ) -> GrowableBloom {
324        assert!(0.0 < desired_error_prob && desired_error_prob < 1.0);
325        assert!(growth_factor > 1);
326        GrowableBloom {
327            blooms: vec![],
328            desired_error_prob,
329            est_insertions,
330            inserts: 0,
331            capacity: 0,
332            growth_factor,
333            tightening_ratio,
334        }
335    }
336
337    /// Test if `item` in the Bloom filter.
338    ///
339    /// If `true` is returned, it _may_ be in the filter.
340    /// If `false` is returned, it's NOT in the filter.
341    ///
342    /// # Arguments
343    ///
344    /// * `item` - The item to test
345    ///
346    /// # Example
347    ///
348    /// ```rust
349    /// use growable_bloom_filter::GrowableBloom;
350    /// let mut bloom = GrowableBloom::new(0.05, 10);
351    /// let item = 0;
352    ///
353    /// bloom.insert(&item);
354    /// assert!(bloom.contains(&item));
355    /// ```
356    pub fn contains<T: Hash>(&self, item: T) -> bool {
357        let (h1, h2) = double_hashing_hashes(item);
358        self.blooms.iter().any(|bloom| bloom.contains(h1, h2))
359    }
360
361    /// Insert `item` into the filter.
362    ///
363    /// This may resize the GrowableBloom.
364    ///
365    /// # Arguments
366    ///
367    /// * `item` - The item to insert
368    ///
369    /// # Example
370    ///
371    /// ```rust
372    /// use growable_bloom_filter::GrowableBloom;
373    /// let mut bloom = GrowableBloom::new(0.05, 10);
374    /// let item = 0;
375    ///
376    /// bloom.insert(&item);
377    /// bloom.insert(&-1);
378    /// bloom.insert(&vec![1, 2, 3]);
379    /// bloom.insert("hello");
380    /// ```
381    pub fn insert<T: Hash>(&mut self, item: T) -> bool {
382        let (h1, h2) = double_hashing_hashes(item);
383        // Step 1: Ask if we already have it
384        if self.blooms.iter().any(|bloom| bloom.contains(h1, h2)) {
385            return false;
386        }
387        // Step 2: Grow if necessary
388        if self.inserts >= self.capacity {
389            self.grow();
390        }
391        // Step 3: Insert it into the last
392        self.inserts += 1;
393        let curr_bloom = self.blooms.last_mut().unwrap();
394        curr_bloom.insert(h1, h2);
395        true
396    }
397
398    /// Clear the bloom filter.
399    ///
400    /// This does not resize the filter.
401    ///
402    /// # Example
403    ///
404    /// ```rust
405    /// use growable_bloom_filter::GrowableBloom;
406    /// let mut bloom = GrowableBloom::new(0.05, 10);
407    /// let item = 0;
408    ///
409    /// bloom.insert(&item);
410    /// assert!(bloom.contains(&item));
411    /// bloom.clear();
412    /// assert!(!bloom.contains(&item)); // No longer contains item
413    /// ```
414    pub fn clear(&mut self) {
415        self.blooms.clear();
416        self.inserts = 0;
417        self.capacity = 0;
418    }
419
420    /// Whether this bloom filter contain any items.
421    #[inline]
422    pub fn is_empty(&self) -> bool {
423        self.inserts == 0
424    }
425
426    /// The current estimated number of elements added to the filter.
427    /// This is an estimation, so it may or may not increase after
428    /// an insertion in the filter.
429    ///
430    /// # Example
431    ///
432    /// ```rust
433    /// use growable_bloom_filter::GrowableBloom;
434    /// let mut bloom = GrowableBloom::new(0.05, 10);
435    ///
436    /// bloom.insert(0);
437    /// assert_eq!(bloom.len(), 1);
438    /// ```
439    #[inline]
440    pub fn len(&self) -> usize {
441        self.inserts
442    }
443
444    /// The current estimated capacity of the filter.
445    /// A filter starts with a small capacity and will expand to accommodate more items.
446    /// The actual ratio of increase depends on the values used to construct the bloom filter.
447    ///
448    /// Note: An empty filter has capacity zero as we haven't calculated
449    ///       the necessary bloom filter size. Subsequent inserts will result
450    ///       in the capacity updating.
451    ///
452    /// # Example
453    ///
454    /// ```rust
455    /// use growable_bloom_filter::GrowableBloom;
456    /// let mut bloom = GrowableBloom::new(0.05, 10);
457    ///
458    /// assert_eq!(bloom.capacity(), 0);
459    ///
460    /// bloom.insert(0);
461    /// // After an insert, our capacity is no longer zero.
462    /// assert_ne!(bloom.capacity(), 0);
463    /// ```
464    #[inline]
465    pub fn capacity(&self) -> usize {
466        self.capacity
467    }
468
469    /// Record if `item` already exists in the filter, and insert it if it doesn't already exist.
470    ///
471    /// Returns `true` if the item already existed in the filter.
472    ///
473    /// Note: This isn't faster than just inserting.
474    ///
475    /// # Example
476    ///
477    /// ```rust
478    /// use growable_bloom_filter::GrowableBloom;
479    /// let mut bloom = GrowableBloom::new(0.05, 10);
480    /// let item = 0;
481    ///
482    /// let existed_before = bloom.check_and_set(&item);
483    /// assert!(existed_before == false);
484    ///
485    /// let existed_before = bloom.check_and_set(&item);
486    /// assert!(existed_before == true);
487    /// ```
488    pub fn check_and_set<T: Hash>(&mut self, item: T) -> bool {
489        !self.insert(item)
490    }
491
492    /// Grow the GrowableBloom
493    fn grow(&mut self) {
494        // The paper gives an upper bound formula for the fp rate: fpUB <= fp0 * / (1-r)
495        // This is because each sub bloom filter is created with an ever smaller
496        // false-positive ratio, forming a geometric progression.
497        // let r = TIGHTENING_RATIO
498        // fpUB ~= fp0 * fp0*r * fp0*r*r * fp0*r*r*r ...
499        // fp(x) = fp0 * (r**x)
500        let error_ratio =
501            self.desired_error_prob * self.tightening_ratio.powi(self.blooms.len() as _);
502        // In order to have relatively small space overhead compared to a single appropriately sized bloom filter
503        // the sub filters should be created with increasingly bigger sizes.
504        // let s = GROWTH_FACTOR
505        // cap(x) = cap0 * (s**x)
506        let capacity = self.est_insertions * self.growth_factor.pow(self.blooms.len() as _);
507        let new_bloom = Bloom::new(capacity, error_ratio);
508        self.blooms.push(new_bloom);
509        self.capacity += capacity;
510    }
511}
512
513/// Builder API for GrowableBloom.
514///
515/// ```rust
516/// use growable_bloom_filter::GrowableBloomBuilder;
517/// let mut gbloom = GrowableBloomBuilder::new()
518///     .estimated_insertions(100)
519///     .desired_error_ratio(0.05)
520///     .build();
521/// ```
522pub struct GrowableBloomBuilder {
523    desired_error_ratio: f64,
524    est_insertions: usize,
525    growth_factor: usize,
526    tightening_ratio: f64,
527}
528
529impl GrowableBloomBuilder {
530    /// Create a new GrowableBloomBuilder.
531    ///
532    /// Builder API for GrowableBloom.
533    ///
534    /// ```rust
535    /// use growable_bloom_filter::GrowableBloomBuilder;
536    /// let mut gbloom = GrowableBloomBuilder::new()
537    ///     .estimated_insertions(1000)
538    ///     .desired_error_ratio(0.01)
539    ///     .growth_factor(2)
540    ///     .tightening_ratio(0.85)
541    ///     .build();
542    /// gbloom.insert("hello world");
543    /// assert!(gbloom.contains(&"hello world"));
544    /// ```
545    pub fn new() -> Self {
546        Self {
547            est_insertions: 1000,
548            desired_error_ratio: 0.01,
549            growth_factor: DEFAULT_GROWTH_FACTOR,
550            tightening_ratio: DEFAULT_TIGHTENING_RATIO,
551        }
552    }
553
554    /// Estimated number of insertions. A power of ten accuracy is good enough.
555    ///
556    /// # Panics
557    ///
558    /// This will panic in debug mode if count is zero.
559    pub fn estimated_insertions(self, count: usize) -> Self {
560        Self {
561            est_insertions: count,
562            ..self
563        }
564    }
565
566    /// Desired error ratio (i.e. false positive rate).
567    ///
568    /// Smaller error ratios will use more memory and might be a bit slower.
569    ///
570    /// # Panics
571    ///
572    /// This will panic if the error ratio is outside of (0, 1.0).
573    pub fn desired_error_ratio(self, ratio: f64) -> Self {
574        Self {
575            desired_error_ratio: ratio,
576            ..self
577        }
578    }
579
580    /// Base for the exponential growth factor.
581    ///
582    /// As more items are inserted into a GrowableBloom this growth_factor
583    /// number is used to exponentially grow the capacity of newly added
584    /// internal bloom filters. So this number is raised to some exponent proportional
585    /// to the number of bloom filters held internally.
586    ///
587    /// Basically it'll control how quickly the bloom filter grows in capacity.
588    /// By default it's set to two.
589    pub fn growth_factor(self, factor: usize) -> Self {
590        Self {
591            growth_factor: factor,
592            ..self
593        }
594    }
595
596    /// Control the downwards adjustment on the error ratio when growing.
597    ///
598    /// When GrowableBloom adds a new internal bloom filter it uses
599    /// the tightening_ratio to adjust the desired_error_ratio on these
600    /// new, larger internal bloom filters. This is necessary to achieve decent
601    /// accuracy on the user's desired error_ratio while using larger and larger
602    /// bloom filters internally.
603    ///
604    /// By default this library sets it to ~0.85, but for smaller growth factors
605    /// any number around 0.8 - 0.9 should be fine.
606    pub fn tightening_ratio(self, ratio: f64) -> Self {
607        assert!(0.0 < ratio && ratio < 1.0);
608        Self {
609            tightening_ratio: ratio,
610            ..self
611        }
612    }
613
614    /// Consume the builder to create a GrowableBloom.
615    ///
616    /// # Panics
617    ///
618    /// This will panic if an invalid value is specified.
619    pub fn build(self) -> GrowableBloom {
620        GrowableBloom::new_with_internals(
621            self.desired_error_ratio,
622            self.est_insertions,
623            self.growth_factor,
624            self.tightening_ratio,
625        )
626    }
627}
628
629#[cfg(test)]
630mod growable_bloom_tests {
631    mod test_bloom {
632        use crate::{double_hashing_hashes, Bloom};
633
634        #[test]
635        fn can_insert_bloom() {
636            let mut b = Bloom::new(100, 0.01);
637            let (h1, h2) = double_hashing_hashes(123);
638            b.insert(h1, h2);
639            assert!(b.contains(h1, h2))
640        }
641
642        #[test]
643        fn can_insert_string_bloom() {
644            let mut b = Bloom::new(100, 0.01);
645            let (h1, h2) = double_hashing_hashes("hello world".to_string());
646            b.insert(h1, h2);
647            assert!(b.contains(h1, h2))
648        }
649
650        #[test]
651        fn does_not_contain() {
652            let mut b = Bloom::new(100, 0.01);
653            let upper = 100;
654            for i in (0..upper).step_by(2) {
655                let (h1, h2) = double_hashing_hashes(i);
656                b.insert(h1, h2);
657                assert!(b.contains(h1, h2))
658            }
659            for i in (1..upper).step_by(2) {
660                let (h1, h2) = double_hashing_hashes(i);
661                assert!(!b.contains(h1, h2))
662            }
663        }
664        #[test]
665        fn can_insert_lots() {
666            let mut b = Bloom::new(100, 0.01);
667            for i in 0..1024 {
668                let (h1, h2) = double_hashing_hashes(i);
669                b.insert(h1, h2);
670                assert!(b.contains(h1, h2))
671            }
672        }
673        #[test]
674        fn test_refs() {
675            let item = String::from("Hello World");
676            let mut b = Bloom::new(100, 0.01);
677            let (h1, h2) = double_hashing_hashes(&item);
678            b.insert(h1, h2);
679            assert!(b.contains(h1, h2))
680        }
681    }
682
683    mod test_growable {
684        use crate::{GrowableBloom, DEFAULT_TIGHTENING_RATIO};
685        use serde_json;
686
687        #[test]
688        fn can_insert() {
689            let mut b = GrowableBloom::new(0.05, 1000);
690            let item = 20;
691            b.insert(&item);
692            assert!(b.contains(&item))
693        }
694
695        #[test]
696        fn len_capacity_clear() {
697            let mut b = GrowableBloom::new(0.05, 100);
698            assert_eq!(b.len(), 0);
699            assert_eq!(b.capacity(), 0);
700
701            let item = 20;
702            b.insert(&item);
703            assert_ne!(b.len(), 0);
704            assert_ne!(b.capacity(), 0);
705
706            b.clear();
707            assert_eq!(b.len(), 0);
708            assert_eq!(b.capacity(), 0);
709        }
710
711        #[test]
712        fn ensure_capacity() {
713            let mut b = GrowableBloom::new(0.05, 1);
714            assert_eq!(b.capacity(), 0);
715            b.insert("abc");
716            assert_eq!(b.capacity(), 1);
717            for i in 0..100 {
718                b.insert(i);
719            }
720            assert_eq!(b.capacity(), 127);
721        }
722
723        #[test]
724        fn can_insert_string() {
725            let mut b = GrowableBloom::new(0.05, 1000);
726            let item: String = "hello world".to_owned();
727            b.insert(&item);
728            assert!(b.contains(&item))
729        }
730
731        #[test]
732        fn does_not_contain() {
733            let mut b = GrowableBloom::new(0.05, 1000);
734            assert_eq!(b.contains(&"hello"), false);
735            b.insert(&0);
736            assert_eq!(b.contains(&"hello"), false);
737            b.insert(&1);
738            assert_eq!(b.contains(&"hello"), false);
739            b.insert(&2);
740            assert_eq!(b.contains(&"hello"), false);
741        }
742
743        #[test]
744        fn can_insert_a_lot_of_elements() {
745            let mut b = GrowableBloom::new(0.05, 1000);
746            for i in 0..1000 {
747                b.insert(&i);
748                assert!(b.contains(&i));
749            }
750        }
751
752        #[test]
753        fn can_serialize_deserialize() {
754            let mut b = GrowableBloom::new(0.05, 1000);
755            b.insert(&0);
756            let s = serde_json::to_string(&b).unwrap();
757            let b_s: GrowableBloom = serde_json::from_str(&s).unwrap();
758            assert!(b_s.contains(&0));
759            assert_ne!(b_s.contains(&1), true);
760            assert_ne!(b_s.contains(&1000), true);
761        }
762
763        #[test]
764        fn verify_saturation() {
765            for &fp in &[0.01, 0.001] {
766                // The paper gives an upper bound formula for the fp rate: fpUB <= fp0*/(1-r)
767                let fp_ub = fp / (1.0 - DEFAULT_TIGHTENING_RATIO);
768                let initial_cap = 100u64;
769                let growth = 1000u64;
770                let mut b = GrowableBloom::new(fp, initial_cap as usize);
771                // insert 1000x more elements than initially allocated
772                for i in 1u64..=initial_cap * growth {
773                    b.insert(&i);
774
775                    if i % (initial_cap * growth / 10) == 0
776                        || [1, 2, 5, 10, 25].iter().any(|&g| i == initial_cap * g)
777                    {
778                        // A lot of tests are required to get a good estimate
779                        let est_fp_rate = (i + 1..).take(50_000).filter(|i| b.contains(i)).count()
780                            as f64
781                            / 50_000.0;
782
783                        // Uncomment the following to get good output for experiments
784                        // println!(
785                        //     "{}x cap: {}fp ({}x)",
786                        //     i / initial_cap,
787                        //     est_fp_rate,
788                        //     est_fp_rate / fp
789                        // );
790                        assert!(est_fp_rate <= fp_ub);
791                    }
792                }
793                for i in 1u64..=initial_cap * growth {
794                    assert!(b.contains(&i));
795                }
796            }
797        }
798
799        #[test]
800        fn test_types_saturation() {
801            let mut b = GrowableBloom::new(0.50, 100);
802            b.insert(&vec![1, 2, 3]);
803            b.insert("hello");
804            b.insert(&-1);
805            b.insert(&0);
806        }
807
808        #[test]
809        fn can_check_and_set() {
810            let mut b = GrowableBloom::new(0.05, 1000);
811            let item = 20;
812            assert!(!b.check_and_set(&item));
813            assert!(b.check_and_set(&item));
814        }
815    }
816
817    mod test_builder {
818        use crate::GrowableBloomBuilder;
819
820        #[test]
821        fn can_build_bloom() {
822            let mut gbloom = GrowableBloomBuilder::new().build();
823            gbloom.insert(3);
824            assert!(gbloom.contains(&3));
825        }
826        #[test]
827        #[should_panic]
828        fn should_panic_on_bad_error_ratio() {
829            GrowableBloomBuilder::new()
830                .estimated_insertions(1000)
831                .desired_error_ratio(99.9)
832                .build();
833        }
834        #[test]
835        #[should_panic]
836        fn should_panic_on_too_small_tightening_ratio() {
837            GrowableBloomBuilder::new().tightening_ratio(0.0).build();
838        }
839        #[test]
840        #[should_panic]
841        fn should_panic_on_too_large_tightening_ratio() {
842            GrowableBloomBuilder::new().tightening_ratio(10.0).build();
843        }
844        #[test]
845        fn can_specify_all_values() {
846            // From https://github.com/dpbriggs/growable-bloom-filters/issues/7
847            let mut gbloom = GrowableBloomBuilder::new()
848                .estimated_insertions(3)
849                .desired_error_ratio(0.00001)
850                .tightening_ratio(0.5)
851                .growth_factor(2)
852                .build();
853            for i in 0..100 {
854                gbloom.insert(i);
855            }
856            for i in 0..100 {
857                assert!(gbloom.contains(&i));
858            }
859        }
860    }
861
862    #[cfg(feature = "nightly")]
863    mod bench {
864        use crate::GrowableBloom;
865        use test::Bencher;
866        #[bench]
867        fn bench_new(b: &mut Bencher) {
868            b.iter(|| GrowableBloom::new(0.01, 1000));
869        }
870        #[bench]
871        fn bench_insert_normal_prob(b: &mut Bencher) {
872            let mut gbloom = GrowableBloom::new(0.01, 1000);
873            b.iter(|| gbloom.insert(10));
874        }
875        #[bench]
876        fn bench_insert_small_prob(b: &mut Bencher) {
877            let mut gbloom = GrowableBloom::new(0.001, 1000);
878            b.iter(|| gbloom.insert(10));
879        }
880        #[bench]
881        fn bench_many(b: &mut Bencher) {
882            let mut gbloom = GrowableBloom::new(0.01, 100000);
883            b.iter(|| gbloom.insert(10));
884        }
885        #[bench]
886        fn bench_insert_medium(b: &mut Bencher) {
887            let s: String = (0..100).map(|_| 'X').collect();
888            let mut gbloom = GrowableBloom::new(0.01, 100000);
889            b.iter(|| gbloom.insert(&s))
890        }
891        #[bench]
892        fn bench_insert_large(b: &mut Bencher) {
893            let s: String = (0..10000).map(|_| 'X').collect();
894            let mut gbloom = GrowableBloom::new(0.01, 100000);
895            b.iter(|| gbloom.insert(&s))
896        }
897        #[bench]
898        fn bench_insert_large_very_small_prob(b: &mut Bencher) {
899            let s: String = (0..10000).map(|_| 'X').collect();
900            let mut gbloom = GrowableBloom::new(0.0001, 100000);
901            b.iter(|| gbloom.insert(&s))
902        }
903        #[bench]
904        fn bench_grow(b: &mut Bencher) {
905            b.iter(|| {
906                let mut gbloom = GrowableBloom::new(0.01, 100);
907                for i in 0..1000 {
908                    gbloom.insert(&i);
909                    assert!(gbloom.contains(&i));
910                }
911            })
912        }
913    }
914}
growable_bloom_filter/lib.rs

growable_bloom_filter/
lib.rs