redb/tree_store/page_store/buddy_allocator.rs

use crate::tree_store::PageNumber;
use crate::tree_store::page_store::bitmap::{BtreeBitmap, U64GroupedBitmap};
use crate::tree_store::page_store::page_manager::MAX_MAX_PAGE_ORDER;
use std::cmp::min;
#[cfg(test)]
use std::collections::HashSet;
use std::mem::size_of;

const MAX_ORDER_OFFSET: usize = 0;
const PADDING: usize = 3;
const NUM_PAGES_OFFSET: usize = MAX_ORDER_OFFSET + size_of::<u8>() + PADDING;
const FREE_END_OFFSETS: usize = NUM_PAGES_OFFSET + size_of::<u32>();

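// Largest order such that 2^order <= pages, capped at MAX_MAX_PAGE_ORDER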
fn calculate_usable_order(pages: u32) -> u8 {
    let max_order = (32 - pages.leading_zeros() - 1).try_into().unwrap();
    min(MAX_MAX_PAGE_ORDER, max_order)
}

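// Index, at the next higher order, of the page that contains the given page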
fn next_higher_order(page_number: u32) -> u32 {
    page_number / 2
}

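// The page that merges with the given page into the next higher order (differs only in the lowest bit)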
fn buddy_page(page_number: u32) -> u32 {
    page_number ^ 1
}

// Handles allocation of dynamically sized pages, supports pages of up to page_size * 2^max_order bytes
//
// Pages are marked free at only a single order, and it must always be the largest order
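//
// For example, with num_pages = 6 and max_order >= 2, pages 0-3 are tracked as one free entry
// at order 2 and pages 4-5 as one free entry at order 1; no page is free at more than one order.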
pub(crate) struct BuddyAllocator {
    allocated: Vec<U64GroupedBitmap>,
    free: Vec<BtreeBitmap>,
    len: u32,
    max_order: u8,
}

impl BuddyAllocator {
    pub(crate) fn new(num_pages: u32, max_page_capacity: u32) -> Self {
        let max_order = calculate_usable_order(max_page_capacity);

        let mut pages_for_order = max_page_capacity;
        let mut free = vec![];
        let mut allocated = vec![];
        for _ in 0..=max_order {
            free.push(BtreeBitmap::new(pages_for_order));

            allocated.push(U64GroupedBitmap::new_empty(
                pages_for_order,
                pages_for_order,
            ));
            pages_for_order = next_higher_order(pages_for_order);
        }

        // Mark the available pages, starting with the highest order
        let mut accounted_pages = 0;
        for order in (0..=max_order).rev() {
            let order_size = 2u32.pow(order.into());
            while accounted_pages + order_size <= num_pages {
                let page = accounted_pages / order_size;
                free[order as usize].clear(page);
                accounted_pages += order_size;
            }
        }
        assert_eq!(accounted_pages, num_pages);

        Self {
            allocated,
            free,
            len: num_pages,
            max_order,
        }
    }

    // Data structure format:
    // max_order: u8
    // padding: 3 bytes
    // num_pages: u32
    // free_ends: array of u32, with ending offset for BtreeBitmap structure for the given order
    // allocated_ends: array of u32, with ending offset for U64GroupedBitmap structure for the given order
    // ... BtreeBitmap structures
    // ... U64GroupedBitmap structures
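    //
    // End offsets are absolute byte offsets, measured from the start of the serialized buffer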
    pub(crate) fn to_vec(&self) -> Vec<u8> {
        let mut result = vec![];
        result.push(self.max_order);
        result.extend([0u8; 3]);
        result.extend(self.len.to_le_bytes());

        let mut data_offset = result.len() + (self.max_order as usize + 1) * 2 * size_of::<u32>();
        let end_metadata = data_offset;
        for order in &self.free {
            data_offset += order.to_vec().len();
            let offset_u32: u32 = data_offset.try_into().unwrap();
            result.extend(offset_u32.to_le_bytes());
        }
        for order in &self.allocated {
            data_offset += order.to_vec().len();
            let offset_u32: u32 = data_offset.try_into().unwrap();
            result.extend(offset_u32.to_le_bytes());
        }
        assert_eq!(end_metadata, result.len());
        for order in &self.free {
            result.extend(&order.to_vec());
        }
        for order in &self.allocated {
            result.extend(&order.to_vec());
        }

        result
    }

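    // Inverse of to_vec()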
    pub(crate) fn from_bytes(data: &[u8]) -> Self {
        let max_order = data[MAX_ORDER_OFFSET];
        let num_pages = u32::from_le_bytes(
            data[NUM_PAGES_OFFSET..(NUM_PAGES_OFFSET + size_of::<u32>())]
                .try_into()
                .unwrap(),
        );

        let mut metadata = FREE_END_OFFSETS;
        let mut data_start = FREE_END_OFFSETS + (max_order as usize + 1) * 2 * size_of::<u32>();

        let mut free = vec![];
        for _ in 0..=max_order {
            let data_end = u32::from_le_bytes(
                data[metadata..(metadata + size_of::<u32>())]
                    .try_into()
                    .unwrap(),
            ) as usize;
            free.push(BtreeBitmap::from_bytes(&data[data_start..data_end]));
            data_start = data_end;
            metadata += size_of::<u32>();
        }
        let mut allocated = vec![];
        for _ in 0..=max_order {
            let data_end = u32::from_le_bytes(
                data[metadata..(metadata + size_of::<u32>())]
                    .try_into()
                    .unwrap(),
            ) as usize;
            allocated.push(U64GroupedBitmap::from_bytes(&data[data_start..data_end]));
            data_start = data_end;
            metadata += size_of::<u32>();
        }

        Self {
            allocated,
            free,
            len: num_pages,
            max_order,
        }
    }

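    // Highest order that currently has at least one free page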
    #[inline]
    pub(crate) fn highest_free_order(&self) -> Option<u8> {
        (0..=self.max_order)
            .rev()
            .find(|order| self.get_order_free(*order).has_unset())
    }

    pub(crate) fn count_allocated_pages(&self) -> u32 {
        self.len() - self.count_free_pages()
    }

    pub(crate) fn count_free_pages(&self) -> u32 {
        let mut pages = 0;
        for order in 0..=self.max_order {
            pages += self.get_order_free(order).count_unset() * 2u32.pow(order.into());
        }
        pages
    }

    pub(crate) fn capacity(&self) -> u32 {
        self.get_order_free(0).capacity()
    }

    pub(crate) fn get_max_order(&self) -> u8 {
        self.max_order
    }

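    // Returns the order of the free entry that covers the given order-0 page, if the page is free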
    fn find_free_order(&self, mut page: u32) -> Option<u8> {
        for order in 0..=self.max_order {
            if !self.get_order_free(order).get(page) {
                return Some(order);
            }
            page = next_higher_order(page);
        }
        None
    }

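    // Number of contiguous free pages at the end of the allocator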
    pub(crate) fn trailing_free_pages(&self) -> u32 {
        let mut free_pages = 0;
        let mut next_page = self.len() - 1;
        while let Some(order) = self.find_free_order(next_page) {
            let order_size = 2u32.pow(order.into());
            free_pages += order_size;
            if order_size > next_page {
                break;
            }
            next_page -= order_size;
        }

        free_pages
    }

    // Inverse of make_state_for_savepoint()
    pub(crate) fn from_savepoint_state(data: &[u8]) -> Self {
        let mut offset = 0;
        let max_order = data[offset];
        offset += 1;
        let len = u32::from_le_bytes(
            data[offset..(offset + size_of::<u32>())]
                .try_into()
                .unwrap(),
        );
        offset += size_of::<u32>();

        let mut data_start = offset + size_of::<u32>() * (max_order as usize + 1);
        let mut allocated_sets = vec![];
        for _ in 0..=max_order {
            let data_end = u32::from_le_bytes(
                data[offset..(offset + size_of::<u32>())]
                    .try_into()
                    .unwrap(),
            ) as usize;
            offset += size_of::<u32>();
            allocated_sets.push(U64GroupedBitmap::from_bytes(&data[data_start..data_end]));
            data_start = data_end;
        }
        assert_eq!(data_start, data.len());

        let mut result = Self::new(len, allocated_sets[0].capacity());

        for (order, allocated) in allocated_sets.iter().enumerate() {
            for page_number in allocated.iter() {
                result.record_alloc(page_number, order.try_into().unwrap());
            }
        }

        result
    }

    // Reduced state for savepoint, which includes only the list of allocated pages
    // Format:
    // 1 byte: max order
    // 4 bytes: num pages
    // 4 * (max order + 1) bytes: end offsets
    // ... U64GroupedBitmap structures, one per order
    // TODO: maybe this should return a Vec<U64GroupedBitmap>?
    pub(crate) fn make_state_for_savepoint(&self) -> Vec<u8> {
        let mut result = vec![self.max_order];
        result.extend(self.len().to_le_bytes());

        let mut data_offset = result.len() + size_of::<u32>() * (self.max_order as usize + 1);
        for order in 0..=self.max_order {
            data_offset += self.allocated[order as usize].to_vec().len();
            result.extend(u32::try_from(data_offset).unwrap().to_le_bytes());
        }

        for order in 0..=self.max_order {
            result.extend(&self.allocated[order as usize].to_vec());
        }

        result
    }

    // Returns all allocated pages in self, for the given region, that are not allocated in `other`
    pub(crate) fn difference(
        &self,
        region: u32,
        other: &BuddyAllocator,
        output: &mut Vec<PageNumber>,
    ) {
        let num_pages = self.len();

        for order in 0..=self.max_order {
            let other_allocated = other.get_order_allocated(order);
            let self_allocated = self.get_order_allocated(order);
            for i in self_allocated.difference(other_allocated) {
                if i >= num_pages {
                    break;
                }
                output.push(PageNumber::new(region, i, order));
            }
        }
    }

    pub(crate) fn get_allocated_pages(&self, region: u32, output: &mut Vec<PageNumber>) {
        for order in 0..=self.max_order {
            let allocated = self.get_order_allocated(order);
            for i in allocated.iter() {
                if i >= self.len() {
                    break;
                }
                output.push(PageNumber::new(region, i, order));
            }
        }

        #[cfg(test)]
        // Check the result against the free index to be sure it matches
        {
            let mut allocated_check = HashSet::new();

            for order in 0..=self.max_order {
                let allocated = self.get_order_allocated(order);
                for i in allocated.iter() {
                    if i >= self.len() {
                        break;
                    }
                    allocated_check.insert(PageNumber::new(region, i, order));
                }
            }

            let mut free_check = HashSet::new();
            for i in 0..self.len() {
                if self.find_free_order(i).is_none() {
                    free_check.insert(PageNumber::new(region, i, 0));
                }
            }

            let mut check_result = HashSet::new();
            for page in &allocated_check {
                check_result.extend(page.to_order0());
            }
            assert_eq!(free_check, check_result);
        }
    }

    pub(crate) fn len(&self) -> u32 {
        self.len
    }

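    // Grow or shrink to new_size pages. When growing, the newly added tail pages are marked
    // free; when shrinking, the removed tail pages are marked allocated in the free index so
    // that they can no longer be handed out.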
    pub(crate) fn resize(&mut self, new_size: u32) {
        self.debug_check_consistency();
        assert!(new_size <= self.capacity());
        if new_size > self.len() {
            let mut processed_pages = self.len();
            // Align to the highest order possible
            while processed_pages < new_size {
                let order: u8 = processed_pages.trailing_zeros().try_into().unwrap();
                let order_size = 2u32.pow(order.into());
                let page = processed_pages / order_size;
                debug_assert_eq!(processed_pages % order_size, 0);
                if order >= self.max_order || processed_pages + order_size > new_size {
                    break;
                }
                self.free_inner(page, order);
                processed_pages += order_size;
            }
            // Free the remaining space, at the highest order
            for order in (0..=self.max_order).rev() {
                let order_size = 2u32.pow(order.into());
                while processed_pages + order_size <= new_size {
                    let page = processed_pages / order_size;
                    self.free_inner(page, order);
                    processed_pages += order_size;
                }
            }
            assert_eq!(processed_pages, new_size);
            self.debug_check_consistency();
        } else {
            let mut processed_pages = new_size;
            // Align to the highest order possible
            while processed_pages < self.len() {
                let order: u8 = processed_pages.trailing_zeros().try_into().unwrap();
                if order >= self.max_order {
                    break;
                }
                let order_size = 2u32.pow(order.into());
                let page = processed_pages / order_size;
                debug_assert_eq!(processed_pages % order_size, 0);
                if processed_pages + order_size > self.len() {
                    break;
                }
                self.record_alloc_inner(page, order);
                processed_pages += order_size;
            }
            // Allocate the remaining space, at the highest order
            for order in (0..=self.max_order).rev() {
                let order_size = 2u32.pow(order.into());
                while processed_pages + order_size <= self.len() {
                    let page = processed_pages / order_size;
                    self.record_alloc_inner(page, order);
                    processed_pages += order_size;
                }
            }
            assert_eq!(processed_pages, self.len());
        }
        self.len = new_size;
    }

    #[allow(unused_variables)]
    fn debug_check_consistency(&self) {
        // Don't enable when fuzzing, because this is kind of expensive
        #[cfg(all(debug_assertions, not(fuzzing)))]
        {
            let mut processed = 0;
            // Ensure that no page is free at multiple orders
            while processed < self.len() {
                let mut found = false;
                let mut page = processed;
                for order in 0..=self.max_order {
                    let allocator = &self.free[order as usize];
                    if !allocator.get(page) {
                        assert!(!found);
                        found = true;
                    }
                    page = next_higher_order(page);
                }
                processed += 1;
            }

            // Ensure that all buddy pages are merged, except at the highest order
            for order in (0..self.max_order).rev() {
                let order_len = self.len() / (2u32.pow(order.into()));
                let allocator = &self.free[order as usize];
                for page in 0..order_len {
                    if !allocator.get(page) {
                        let buddy = buddy_page(page);
                        let buddy_allocated = allocator.get(buddy);
                        assert!(buddy_allocated, "order={order} page={page} buddy={buddy}",);
                    }
                }
            }
        }
    }

    // Allocates a page of the given order at the lowest index possible, splitting a higher
    // order page if that yields a lower index
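    //
    // For example, if the lowest free page at order 0 is index 9 but order 2 still has a free
    // page at index 1 (order-0 index 4), the order-2 page is split and index 4 is returned,
    // while index 9 is left free.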
    pub(crate) fn alloc_lowest(&mut self, order: u8) -> Option<u32> {
        let page = self.alloc_lowest_inner(order);
        if let Some(page_number) = page {
            debug_assert!(!self.get_order_allocated(order).get(page_number));
            self.get_order_allocated_mut(order).set(page_number);
        }
        page
    }

    pub(crate) fn alloc_lowest_inner(&mut self, order: u8) -> Option<u32> {
        // Lowest index at the requested order
        let mut best_index_at_order = self.alloc_inner(order)?;
        // Best (index, order) found
        let mut best = (best_index_at_order, order);

        // Find the lowest index at which we can fill this allocation, across all orders
        let mut multiplier = 2;
        for i in (order + 1)..=self.max_order {
            if let Some(index) = self.alloc_inner(i) {
                let index_at_order = index * multiplier;
                if index_at_order < best_index_at_order {
                    self.free_inner(best.0, best.1);
                    best_index_at_order = index_at_order;
                    best = (index, i);
                } else {
                    self.free_inner(index, i);
                }
            }
            multiplier *= 2;
        }

        // Split the page, until we get to the requested order
        while best.1 > order {
            let (best_index, best_order) = best;
            let (free1, free2) = (best_index * 2, best_index * 2 + 1);
            let allocator = self.get_order_free_mut(best_order - 1);
            debug_assert!(allocator.get(free1));
            debug_assert!(allocator.get(free2));
            allocator.clear(free2);
            best = (free1, best_order - 1);
        }
        assert_eq!(best.0, best_index_at_order);

        Some(best.0)
    }

    pub(crate) fn alloc(&mut self, order: u8) -> Option<u32> {
        let page = self.alloc_inner(order);
        if let Some(page_number) = page {
            debug_assert!(!self.get_order_allocated(order).get(page_number));
            self.get_order_allocated_mut(order).set(page_number);
        }
        page
    }

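    // Allocates from the free index only; callers are responsible for updating the allocated bitmap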
    pub(crate) fn alloc_inner(&mut self, order: u8) -> Option<u32> {
        if order > self.max_order {
            return None;
        }
        let allocator = self.get_order_free_mut(order);
        if let Some(x) = allocator.alloc() {
            Some(x)
        } else {
            // Try to allocate a higher order page and split it
            let upper_page = self.alloc_inner(order + 1)?;
            let (free1, free2) = (upper_page * 2, upper_page * 2 + 1);
            let allocator = self.get_order_free_mut(order);
            debug_assert!(allocator.get(free1));
            debug_assert!(allocator.get(free2));
            allocator.clear(free2);

            Some(free1)
        }
    }

    /// `page_number` must currently be free at the given `order`
    pub(crate) fn record_alloc(&mut self, page_number: u32, order: u8) {
        assert!(order <= self.max_order);
        // Only record the allocation for the actual page
        self.get_order_allocated_mut(order).set(page_number);
        // Split parent pages as necessary, and update the free index
        self.record_alloc_inner(page_number, order);
    }

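    // Marks page_number allocated in the free index, splitting parent free pages as needed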
    pub(crate) fn record_alloc_inner(&mut self, page_number: u32, order: u8) {
        let allocator = self.get_order_free_mut(order);
        if allocator.get(page_number) {
            // Need to split parent page
            let upper_page = next_higher_order(page_number);
            self.record_alloc_inner(upper_page, order + 1);
            let allocator = self.get_order_free_mut(order);

            let (free1, free2) = (upper_page * 2, upper_page * 2 + 1);
            debug_assert!(free1 == page_number || free2 == page_number);
            if free1 == page_number {
                allocator.clear(free2);
            } else {
                allocator.clear(free1);
            }
        } else {
            allocator.set(page_number);
        }
    }

    /// `page_number` must have been allocated at the given `order`
    pub(crate) fn free(&mut self, page_number: u32, order: u8) {
        debug_assert!(self.get_order_free_mut(order).get(page_number));
        debug_assert!(
            self.get_order_allocated(order).get(page_number),
            "Attempted to free page {page_number}, order {order}, which is not allocated",
        );

        self.get_order_allocated_mut(order).clear(page_number);

        // Update the free index and merge free pages
        self.free_inner(page_number, order);
    }

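    // Marks page_number free in the free index, merging it with its buddy into higher orders where possible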
    pub(crate) fn free_inner(&mut self, page_number: u32, order: u8) {
        if order == self.max_order {
            let allocator = self.get_order_free_mut(order);
            allocator.clear(page_number);
            return;
        }

        let allocator = self.get_order_free_mut(order);
        let buddy = buddy_page(page_number);
        if allocator.get(buddy) {
            allocator.clear(page_number);
        } else {
            // Merge into higher order page
            allocator.set(buddy);
            self.free_inner(next_higher_order(page_number), order + 1);
        }
    }

    pub(crate) fn is_allocated(&self, page_number: u32, order: u8) -> bool {
        self.get_order_allocated(order).get(page_number)
    }

    fn get_order_free_mut(&mut self, order: u8) -> &mut BtreeBitmap {
        &mut self.free[order as usize]
    }

    fn get_order_allocated_mut(&mut self, order: u8) -> &mut U64GroupedBitmap {
        &mut self.allocated[order as usize]
    }

    fn get_order_free(&self, order: u8) -> &BtreeBitmap {
        &self.free[order as usize]
    }

    fn get_order_allocated(&self, order: u8) -> &U64GroupedBitmap {
        &self.allocated[order as usize]
    }
}

#[cfg(test)]
mod test {
    use crate::tree_store::page_store::buddy_allocator::BuddyAllocator;

    #[test]
    fn record_alloc_buddy() {
        let num_pages = 256;
        let mut allocator = BuddyAllocator::new(num_pages, num_pages);
        assert_eq!(allocator.count_allocated_pages(), 0);

        for page in 0..num_pages {
            allocator.record_alloc(page, 0);
        }
        assert_eq!(allocator.count_allocated_pages(), num_pages);

        assert!(allocator.alloc(0).is_none());

        for page in 0..num_pages {
            allocator.free(page, 0);
        }
        assert_eq!(allocator.count_allocated_pages(), 0);
    }

    #[test]
    fn buddy_merge() {
        let num_pages = 256;
        let mut allocator = BuddyAllocator::new(num_pages, num_pages);
        assert_eq!(allocator.count_allocated_pages(), 0);

        for _ in 0..num_pages {
            allocator.alloc(0).unwrap();
        }
        for page in 0..num_pages {
            allocator.free(page, 0);
        }
        assert_eq!(allocator.count_allocated_pages(), 0);

        // Test that everything got merged back together, so that we can satisfy order 7 allocations
        for _ in 0..(num_pages / 2u32.pow(7)) {
            allocator.alloc(7).unwrap();
        }
    }

    #[test]
    fn alloc_large() {
        let num_pages = 256;
        let max_order = 7;
        let mut allocator = BuddyAllocator::new(num_pages, num_pages);
        assert_eq!(allocator.count_allocated_pages(), 0);

        let mut allocated = vec![];
        for order in 0..=max_order {
            allocated.push((allocator.alloc(order).unwrap(), order));
        }
        assert_eq!(allocator.count_allocated_pages(), num_pages - 1);

        for order in 1..=max_order {
            assert!(allocator.alloc(order).is_none());
        }

        for (page, order) in allocated {
            allocator.free(page, order);
        }
        assert_eq!(allocator.count_allocated_pages(), 0);
    }

    #[test]
    fn make_savepoint_state() {
        let num_pages = 256;
        let mut allocator = BuddyAllocator::new(num_pages, num_pages);

        // Allocate some arbitrary stuff
        allocator.record_alloc(7, 0);
        allocator.alloc(0);
        allocator.alloc(1);
        allocator.alloc(3);
        allocator.alloc(0);
        allocator.alloc(0);

        let allocator2 =
            BuddyAllocator::from_savepoint_state(&allocator.make_state_for_savepoint());
        assert_eq!(allocator.to_vec(), allocator2.to_vec());
    }

    #[test]
    fn serialized_size() {
        // Check that serialized size is as expected for a full region
        let max_region_pages = 1024 * 1024;
        let allocator = BuddyAllocator::new(max_region_pages, max_region_pages);
        let max_region_pages = u64::from(max_region_pages);
        // 2x because that's the sum of 1/2^x over all 21 orders
        let allocated_state_bits = 2 * max_region_pages;
        // + u32 * 21 because of the length field
        let allocated_state_bytes = allocated_state_bits / 8 + 4 * 21;

        // Add 2/64 to account for the intermediate index of the bitmap tree
        let free_state_bits = allocated_state_bits + allocated_state_bits * 2 / 64;
        // Add u32s to account for the stored offsets. Might be a bit of an overestimate
        let free_state_bytes = free_state_bits / 8 + 4 * 21 * 3 + 21 * 4;
        // BuddyAllocator overhead
        let buddy_state_bytes = free_state_bytes + allocated_state_bytes + 21 * 2 * 4 + 8;

        assert!((allocator.to_vec().len() as u64) <= buddy_state_bytes);
    }
}