redb/tree_store/page_store/buddy_allocator.rs

use crate::tree_store::page_store::bitmap::{BtreeBitmap, U64GroupedBitmap};
use crate::tree_store::page_store::page_manager::MAX_MAX_PAGE_ORDER;
use crate::tree_store::PageNumber;
use std::cmp::min;
#[cfg(test)]
use std::collections::HashSet;
use std::mem::size_of;

const MAX_ORDER_OFFSET: usize = 0;
const PADDING: usize = 3;
const NUM_PAGES_OFFSET: usize = MAX_ORDER_OFFSET + size_of::<u8>() + PADDING;
const FREE_END_OFFSETS: usize = NUM_PAGES_OFFSET + size_of::<u32>();

fn calculate_usable_order(pages: u32) -> u8 {
    let max_order = (32 - pages.leading_zeros() - 1).try_into().unwrap();
    min(MAX_MAX_PAGE_ORDER, max_order)
}
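// For example, calculate_usable_order(256) == 8 (2^8 pages) and calculate_usable_order(255) == 7,
// with the result capped at MAX_MAX_PAGE_ORDER.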

fn next_higher_order(page_number: u32) -> u32 {
    page_number / 2
}

fn buddy_page(page_number: u32) -> u32 {
    page_number ^ 1
}
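// For example, pages 6 and 7 are buddies at their order (6 ^ 1 == 7 and 7 ^ 1 == 6), and both
// map to page 3 at the next higher order (6 / 2 == 7 / 2 == 3).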

// Handles allocation of dynamically sized pages, supports pages of up to page_size * 2^max_order bytes
//
// Pages are marked free at only a single order, and it must always be the largest order
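// (e.g. if order-0 pages 0-3 are all free, they are recorded as a single free order-2 page,
// not as two free order-1 pages or four free order-0 pages)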
pub(crate) struct BuddyAllocator {
    allocated: Vec<U64GroupedBitmap>,
    free: Vec<BtreeBitmap>,
    len: u32,
    max_order: u8,
}

impl BuddyAllocator {
    pub(crate) fn new(num_pages: u32, max_page_capacity: u32) -> Self {
        let max_order = calculate_usable_order(max_page_capacity);

        let mut pages_for_order = max_page_capacity;
        let mut free = vec![];
        let mut allocated = vec![];
        for _ in 0..=max_order {
            free.push(BtreeBitmap::new(pages_for_order));

            allocated.push(U64GroupedBitmap::new_empty(
                pages_for_order,
                pages_for_order,
            ));
            pages_for_order = next_higher_order(pages_for_order);
        }

        // Mark the available pages, starting with the highest order
        let mut accounted_pages = 0;
        for order in (0..=max_order).rev() {
            let order_size = 2u32.pow(order.into());
            while accounted_pages + order_size <= num_pages {
                let page = accounted_pages / order_size;
                free[order as usize].clear(page);
                accounted_pages += order_size;
            }
        }
        assert_eq!(accounted_pages, num_pages);

        Self {
            allocated,
            free,
            len: num_pages,
            max_order,
        }
    }

    // Data structure format:
    // max_order: u8
    // padding: 3 bytes
    // num_pages: u32
    // free_ends: array of u32, with ending offset for BtreeBitmap structure for the given order
    // allocated_ends: array of u32, with ending offset for U64GroupedBitmap structure for the given order
    // ... BtreeBitmap structures
    // ... U64GroupedBitmap structures
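    // For example, with max_order = 2 the result is an 8 byte header (1 byte max_order,
    // 3 bytes padding, 4 bytes num_pages), then 3 free end offsets and 3 allocated end
    // offsets (24 bytes of u32 metadata), then the three BtreeBitmap bodies followed by
    // the three U64GroupedBitmap bodies, back to back.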
    pub(crate) fn to_vec(&self) -> Vec<u8> {
        let mut result = vec![];
        result.push(self.max_order);
        result.extend([0u8; 3]);
        result.extend(self.len.to_le_bytes());

        let mut data_offset = result.len() + (self.max_order as usize + 1) * 2 * size_of::<u32>();
        let end_metadata = data_offset;
        for order in &self.free {
            data_offset += order.to_vec().len();
            let offset_u32: u32 = data_offset.try_into().unwrap();
            result.extend(offset_u32.to_le_bytes());
        }
        for order in &self.allocated {
            data_offset += order.to_vec().len();
            let offset_u32: u32 = data_offset.try_into().unwrap();
            result.extend(offset_u32.to_le_bytes());
        }
        assert_eq!(end_metadata, result.len());
        for order in &self.free {
            result.extend(&order.to_vec());
        }
        for order in &self.allocated {
            result.extend(&order.to_vec());
        }

        result
    }

    pub(crate) fn from_bytes(data: &[u8]) -> Self {
        let max_order = data[MAX_ORDER_OFFSET];
        let num_pages = u32::from_le_bytes(
            data[NUM_PAGES_OFFSET..(NUM_PAGES_OFFSET + size_of::<u32>())]
                .try_into()
                .unwrap(),
        );

        let mut metadata = FREE_END_OFFSETS;
        let mut data_start = FREE_END_OFFSETS + (max_order as usize + 1) * 2 * size_of::<u32>();

        let mut free = vec![];
        for _ in 0..=max_order {
            let data_end = u32::from_le_bytes(
                data[metadata..(metadata + size_of::<u32>())]
                    .try_into()
                    .unwrap(),
            ) as usize;
            free.push(BtreeBitmap::from_bytes(&data[data_start..data_end]));
            data_start = data_end;
            metadata += size_of::<u32>();
        }
        let mut allocated = vec![];
        for _ in 0..=max_order {
            let data_end = u32::from_le_bytes(
                data[metadata..(metadata + size_of::<u32>())]
                    .try_into()
                    .unwrap(),
            ) as usize;
            allocated.push(U64GroupedBitmap::from_bytes(&data[data_start..data_end]));
            data_start = data_end;
            metadata += size_of::<u32>();
        }

        Self {
            allocated,
            free,
            len: num_pages,
            max_order,
        }
    }

    #[inline]
    pub(crate) fn highest_free_order(&self) -> Option<u8> {
        (0..=self.max_order)
            .rev()
            .find(|order| self.get_order_free(*order).has_unset())
    }

    pub(crate) fn count_allocated_pages(&self) -> u32 {
        self.len() - self.count_free_pages()
    }

    pub(crate) fn count_free_pages(&self) -> u32 {
        let mut pages = 0;
        for order in 0..=self.max_order {
            pages += self.get_order_free(order).count_unset() * 2u32.pow(order.into());
        }
        pages
    }

    pub(crate) fn capacity(&self) -> u32 {
        self.get_order_free(0).capacity()
    }

    pub(crate) fn get_max_order(&self) -> u8 {
        self.max_order
    }

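    // Returns the order of the free block containing order-0 page `page`, or None if that
    // page is allocated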
    fn find_free_order(&self, mut page: u32) -> Option<u8> {
        for order in 0..=self.max_order {
            if !self.get_order_free(order).get(page) {
                return Some(order);
            }
            page = next_higher_order(page);
        }
        None
    }

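    // Counts the number of free pages at the end of the region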
    pub(crate) fn trailing_free_pages(&self) -> u32 {
        let mut free_pages = 0;
        let mut next_page = self.len() - 1;
        while let Some(order) = self.find_free_order(next_page) {
            let order_size = 2u32.pow(order.into());
            free_pages += order_size;
            if order_size > next_page {
                break;
            }
            next_page -= order_size;
        }

        free_pages
    }

    // Reduced state for savepoint, which includes only the list of allocated pages
    // Format:
    // 1 byte: max order
    // 4 bytes: num pages
    // 4 * (max order + 1) bytes: end offsets
    // ... U64GroupedBitmap structures, one per order
    // TODO: maybe this should return a Vec<U64GroupedBitmap>?
    pub(crate) fn make_state_for_savepoint(&self) -> Vec<u8> {
        let mut result = vec![self.max_order];
        result.extend(self.len().to_le_bytes());

        let mut data_offset = result.len() + size_of::<u32>() * (self.max_order as usize + 1);
        for order in 0..=self.max_order {
            data_offset += self.allocated[order as usize].to_vec().len();
            result.extend(u32::try_from(data_offset).unwrap().to_le_bytes());
        }

        for order in 0..=self.max_order {
            result.extend(&self.allocated[order as usize].to_vec());
        }

        result
    }

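    // Pushes into `output` every page that is allocated in `self` but not in `other`, as
    // `PageNumber`s within the given region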
    pub(crate) fn difference(
        &self,
        region: u32,
        other: &BuddyAllocator,
        output: &mut Vec<PageNumber>,
    ) {
        let num_pages = other.len();

        for order in 0..=self.max_order {
            let other_allocated = other.get_order_allocated(order);
            let self_allocated = self.get_order_allocated(order);
            for i in self_allocated.difference(other_allocated) {
                if i >= num_pages {
                    break;
                }
                output.push(PageNumber::new(region, i, order));
            }
        }
    }

    #[cfg(any(test, fuzzing))]
    pub(crate) fn get_allocated_pages(&self, region: u32, output: &mut Vec<PageNumber>) {
        for order in 0..=self.max_order {
            let allocated = self.get_order_allocated(order);
            for i in allocated.iter() {
                if i >= self.len() {
                    break;
                }
                output.push(PageNumber::new(region, i, order));
            }
        }

        #[cfg(test)]
        // Check the result against the free index to be sure it matches
        {
            let mut allocated_check = HashSet::new();

            for order in 0..=self.max_order {
                let allocated = self.get_order_allocated(order);
                for i in allocated.iter() {
                    if i >= self.len() {
                        break;
                    }
                    allocated_check.insert(PageNumber::new(region, i, order));
                }
            }

            let mut free_check = HashSet::new();
            for i in 0..self.len() {
                if self.find_free_order(i).is_none() {
                    free_check.insert(PageNumber::new(region, i, 0));
                }
            }

            let mut check_result = HashSet::new();
            for page in &allocated_check {
                check_result.extend(page.to_order0());
            }
            assert_eq!(free_check, check_result);
        }
    }

    pub(crate) fn len(&self) -> u32 {
        self.len
    }

    pub(crate) fn resize(&mut self, new_size: u32) {
        self.debug_check_consistency();
        assert!(new_size <= self.capacity());
        if new_size > self.len() {
            let mut processed_pages = self.len();
            // Align to the highest order possible
            while processed_pages < new_size {
                let order: u8 = processed_pages.trailing_zeros().try_into().unwrap();
                let order_size = 2u32.pow(order.into());
                let page = processed_pages / order_size;
                debug_assert_eq!(processed_pages % order_size, 0);
                if order >= self.max_order || processed_pages + order_size > new_size {
                    break;
                }
                self.free_inner(page, order);
                processed_pages += order_size;
            }
            // Free the remaining new space, at the highest order
            for order in (0..=self.max_order).rev() {
                let order_size = 2u32.pow(order.into());
                while processed_pages + order_size <= new_size {
                    let page = processed_pages / order_size;
                    self.free_inner(page, order);
                    processed_pages += order_size;
                }
            }
            assert_eq!(processed_pages, new_size);
            self.debug_check_consistency();
        } else {
            let mut processed_pages = new_size;
            // Align to the highest order possible
            while processed_pages < self.len() {
                let order: u8 = processed_pages.trailing_zeros().try_into().unwrap();
                if order >= self.max_order {
                    break;
                }
                let order_size = 2u32.pow(order.into());
                let page = processed_pages / order_size;
                debug_assert_eq!(processed_pages % order_size, 0);
                if processed_pages + order_size > self.len() {
                    break;
                }
                self.record_alloc_inner(page, order);
                processed_pages += order_size;
            }
            // Allocate the remaining space, at the highest order
            for order in (0..=self.max_order).rev() {
                let order_size = 2u32.pow(order.into());
                while processed_pages + order_size <= self.len() {
                    let page = processed_pages / order_size;
                    self.record_alloc_inner(page, order);
                    processed_pages += order_size;
                }
            }
            assert_eq!(processed_pages, self.len());
        }
        self.len = new_size;
    }

    #[allow(unused_variables)]
    fn debug_check_consistency(&self) {
        // Don't enable when fuzzing, because this is kind of expensive
        #[cfg(all(debug_assertions, not(fuzzing)))]
        {
            let mut processed = 0;
            // Ensure that no page is free at multiple orders
            while processed < self.len() {
                let mut found = false;
                let mut page = processed;
                for order in 0..=self.max_order {
                    let allocator = &self.free[order as usize];
                    if !allocator.get(page) {
                        assert!(!found);
                        found = true;
                    }
                    page = next_higher_order(page);
                }
                processed += 1;
            }

            // Ensure that all buddy pages are merged, except at the highest order
            for order in (0..self.max_order).rev() {
                let order_len = self.len() / (2u32.pow(order.into()));
                let allocator = &self.free[order as usize];
                for page in 0..order_len {
                    if !allocator.get(page) {
                        let buddy = buddy_page(page);
                        let buddy_allocated = allocator.get(buddy);
                        assert!(buddy_allocated, "order={order} page={page} buddy={buddy}",);
                    }
                }
            }
        }
    }

    // Allocates a page of the given order at the lowest index possible. Will split a higher order page,
    // to get one of lower index
    pub(crate) fn alloc_lowest(&mut self, order: u8) -> Option<u32> {
        let page = self.alloc_lowest_inner(order);
        if let Some(page_number) = page {
            debug_assert!(!self.get_order_allocated(order).get(page_number));
            self.get_order_allocated_mut(order).set(page_number);
        }
        page
    }

    pub(crate) fn alloc_lowest_inner(&mut self, order: u8) -> Option<u32> {
        // Lowest index at the requested order
        let mut best_index_at_order = self.alloc_inner(order)?;
        // Best (index, order) found
        let mut best = (best_index_at_order, order);

        // Find the lowest index at which we can fill this allocation, across all orders
        let mut multiplier = 2;
        for i in (order + 1)..=self.max_order {
            if let Some(index) = self.alloc_inner(i) {
                let index_at_order = index * multiplier;
                if index_at_order < best_index_at_order {
                    self.free_inner(best.0, best.1);
                    best_index_at_order = index_at_order;
                    best = (index, i);
                } else {
                    self.free_inner(index, i);
                }
            }
            multiplier *= 2;
        }

        // Split the page, until we get to the requested order
        while best.1 > order {
            let (best_index, best_order) = best;
            let (free1, free2) = (best_index * 2, best_index * 2 + 1);
            let allocator = self.get_order_free_mut(best_order - 1);
            debug_assert!(allocator.get(free1));
            debug_assert!(allocator.get(free2));
            allocator.clear(free2);
            best = (free1, best_order - 1);
        }
        assert_eq!(best.0, best_index_at_order);

        Some(best.0)
    }

    pub(crate) fn alloc(&mut self, order: u8) -> Option<u32> {
        let page = self.alloc_inner(order);
        if let Some(page_number) = page {
            debug_assert!(!self.get_order_allocated(order).get(page_number));
            self.get_order_allocated_mut(order).set(page_number);
        }
        page
    }

    pub(crate) fn alloc_inner(&mut self, order: u8) -> Option<u32> {
        if order > self.max_order {
            return None;
        }
        let allocator = self.get_order_free_mut(order);
        if let Some(x) = allocator.alloc() {
            Some(x)
        } else {
            // Try to allocate a higher order page and split it
            let upper_page = self.alloc_inner(order + 1)?;
            let (free1, free2) = (upper_page * 2, upper_page * 2 + 1);
            let allocator = self.get_order_free_mut(order);
            debug_assert!(allocator.get(free1));
            debug_assert!(allocator.get(free2));
            allocator.clear(free2);

            Some(free1)
        }
    }

    /// `page_number` must be free at the given order
    pub(crate) fn record_alloc(&mut self, page_number: u32, order: u8) {
        assert!(order <= self.max_order);
        // Only record the allocation for the actual page
        self.get_order_allocated_mut(order).set(page_number);
        // Split parent pages as necessary, and update the free index
        self.record_alloc_inner(page_number, order);
    }

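    // For example, if order-0 pages 0-3 are free as a single order-2 block, record_alloc_inner(0, 0)
    // splits it: page 0 is marked used at order 2, page 1 is marked free at order 1 (covering
    // order-0 pages 2-3), and page 1 is marked free at order 0.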
    pub(crate) fn record_alloc_inner(&mut self, page_number: u32, order: u8) {
        let allocator = self.get_order_free_mut(order);
        if allocator.get(page_number) {
            // Need to split parent page
            let upper_page = next_higher_order(page_number);
            self.record_alloc_inner(upper_page, order + 1);
            let allocator = self.get_order_free_mut(order);

            let (free1, free2) = (upper_page * 2, upper_page * 2 + 1);
            debug_assert!(free1 == page_number || free2 == page_number);
            if free1 == page_number {
                allocator.clear(free2);
            } else {
                allocator.clear(free1);
            }
        } else {
            allocator.set(page_number);
        }
    }

    /// `page_number` must have been allocated at the given order
    pub(crate) fn free(&mut self, page_number: u32, order: u8) {
        debug_assert!(self.get_order_free_mut(order).get(page_number));
        debug_assert!(
            self.get_order_allocated(order).get(page_number),
            "Attempted to free page {page_number}, order {order}, which is not allocated",
        );

        self.get_order_allocated_mut(order).clear(page_number);

        // Update the free index and merge free pages
        self.free_inner(page_number, order);
    }

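    // For example, freeing order-0 page 7 when its buddy (page 6) is already free at order 0
    // re-marks page 6 as used at order 0 and records the pair as a single free order-1 page
    // (page 3); the merge repeats upward if order-1 page 2 is free as well.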
    pub(crate) fn free_inner(&mut self, page_number: u32, order: u8) {
        if order == self.max_order {
            let allocator = self.get_order_free_mut(order);
            allocator.clear(page_number);
            return;
        }

        let allocator = self.get_order_free_mut(order);
        let buddy = buddy_page(page_number);
        if allocator.get(buddy) {
            allocator.clear(page_number);
        } else {
            // Merge into higher order page
            allocator.set(buddy);
            self.free_inner(next_higher_order(page_number), order + 1);
        }
    }

    pub(crate) fn is_allocated(&self, page_number: u32, order: u8) -> bool {
        self.get_order_allocated(order).get(page_number)
    }

    fn get_order_free_mut(&mut self, order: u8) -> &mut BtreeBitmap {
        &mut self.free[order as usize]
    }

    fn get_order_allocated_mut(&mut self, order: u8) -> &mut U64GroupedBitmap {
        &mut self.allocated[order as usize]
    }

    fn get_order_free(&self, order: u8) -> &BtreeBitmap {
        &self.free[order as usize]
    }

    fn get_order_allocated(&self, order: u8) -> &U64GroupedBitmap {
        &self.allocated[order as usize]
    }
}

#[cfg(test)]
mod test {
    use crate::tree_store::page_store::buddy_allocator::BuddyAllocator;

    #[test]
    fn record_alloc_buddy() {
        let num_pages = 256;
        let mut allocator = BuddyAllocator::new(num_pages, num_pages);
        assert_eq!(allocator.count_allocated_pages(), 0);

        for page in 0..num_pages {
            allocator.record_alloc(page, 0);
        }
        assert_eq!(allocator.count_allocated_pages(), num_pages);

        assert!(allocator.alloc(0).is_none());

        for page in 0..num_pages {
            allocator.free(page, 0);
        }
        assert_eq!(allocator.count_allocated_pages(), 0);
    }

    #[test]
    fn buddy_merge() {
        let num_pages = 256;
        let mut allocator = BuddyAllocator::new(num_pages, num_pages);
        assert_eq!(allocator.count_allocated_pages(), 0);

        for _ in 0..num_pages {
            allocator.alloc(0).unwrap();
        }
        for page in 0..num_pages {
            allocator.free(page, 0);
        }
        assert_eq!(allocator.count_allocated_pages(), 0);

        // Test that everything got merged back together, so that we fill order 7 allocations
        for _ in 0..(num_pages / 2u32.pow(7)) {
            allocator.alloc(7).unwrap();
        }
    }

    #[test]
    fn alloc_large() {
        let num_pages = 256;
        let max_order = 7;
        let mut allocator = BuddyAllocator::new(num_pages, num_pages);
        assert_eq!(allocator.count_allocated_pages(), 0);

        let mut allocated = vec![];
        for order in 0..=max_order {
            allocated.push((allocator.alloc(order).unwrap(), order));
        }
        assert_eq!(allocator.count_allocated_pages(), num_pages - 1);

        for order in 1..=max_order {
            assert!(allocator.alloc(order).is_none());
        }

        for (page, order) in allocated {
            allocator.free(page, order);
        }
        assert_eq!(allocator.count_allocated_pages(), 0);
    }

    #[test]
    fn serialized_size() {
        // Check that serialized size is as expected for a full region
        let max_region_pages = 1024 * 1024;
        let allocator = BuddyAllocator::new(max_region_pages, max_region_pages);
        let max_region_pages = u64::from(max_region_pages);
        // 2x because that's the integral of 1/2^x to account for all the 21 orders
        let allocated_state_bits = 2 * max_region_pages;
        // + u32 * 21 because of the length field
        let allocated_state_bytes = allocated_state_bits / 8 + 4 * 21;

        // Add 2/64 to account for the intermediate index of the bitmap tree
        let free_state_bits = allocated_state_bits + allocated_state_bits * 2 / 64;
        // Add u32s to account for the stored offsets. Might be a bit of an over estimate
        let free_state_bytes = free_state_bits / 8 + 4 * 21 * 3 + 21 * 4;
        // BuddyAllocator overhead
        let buddy_state_bytes = free_state_bytes + allocated_state_bytes + 21 * 2 * 4 + 8;

        assert!((allocator.to_vec().len() as u64) <= buddy_state_bytes);
    }
634}