redb/tree_store/page_store/buddy_allocator.rs
use crate::tree_store::PageNumber;
use crate::tree_store::page_store::bitmap::{BtreeBitmap, U64GroupedBitmap};
use crate::tree_store::page_store::page_manager::MAX_MAX_PAGE_ORDER;
use std::cmp::min;
#[cfg(test)]
use std::collections::HashSet;
use std::mem::size_of;

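// Byte offsets into the serialized form produced by `to_vec()`: a 1-byte max
// order, 3 bytes of padding, then a little-endian u32 page count, followed by
// the per-order bitmap end offsets and the bitmaps themselves.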
const MAX_ORDER_OFFSET: usize = 0;
const PADDING: usize = 3;
const NUM_PAGES_OFFSET: usize = MAX_ORDER_OFFSET + size_of::<u8>() + PADDING;
const FREE_END_OFFSETS: usize = NUM_PAGES_OFFSET + size_of::<u32>();

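// Largest block order usable for a region of `pages` pages: the position of the
// highest set bit, capped at MAX_MAX_PAGE_ORDER. Assumes `pages` is non-zero.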
fn calculate_usable_order(pages: u32) -> u8 {
    let max_order = (32 - pages.leading_zeros() - 1).try_into().unwrap();
    min(MAX_MAX_PAGE_ORDER, max_order)
}

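// Index of the parent block one order up; blocks form an implicit binary tree,
// so the parent of block `n` at order `k` is block `n / 2` at order `k + 1`.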
fn next_higher_order(page_number: u32) -> u32 {
    page_number / 2
}

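// Index of a block's buddy at the same order: the sibling that shares its
// parent, obtained by flipping the lowest bit.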
fn buddy_page(page_number: u32) -> u32 {
    page_number ^ 1
}

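// Buddy allocator over the pages of a single region.
//
// `free[order]` is the free index for blocks of 2^order pages (a cleared bit
// means the block is free), while `allocated[order]` records which pages were
// handed out at exactly that order, so they can later be freed at the same
// order. `len` is the number of usable pages and `max_order` the largest
// supported block order.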
pub(crate) struct BuddyAllocator {
    allocated: Vec<U64GroupedBitmap>,
    free: Vec<BtreeBitmap>,
    len: u32,
    max_order: u8,
}

impl BuddyAllocator {
    pub(crate) fn new(num_pages: u32, max_page_capacity: u32) -> Self {
        let max_order = calculate_usable_order(max_page_capacity);

        let mut pages_for_order = max_page_capacity;
        let mut free = vec![];
        let mut allocated = vec![];
        for _ in 0..=max_order {
            free.push(BtreeBitmap::new(pages_for_order));

            allocated.push(U64GroupedBitmap::new_empty(
                pages_for_order,
                pages_for_order,
            ));
            pages_for_order = next_higher_order(pages_for_order);
        }

        let mut accounted_pages = 0;
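        // Mark the first `num_pages` pages as free, using the largest block
        // that still fits at each step.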
        for order in (0..=max_order).rev() {
            let order_size = 2u32.pow(order.into());
            while accounted_pages + order_size <= num_pages {
                let page = accounted_pages / order_size;
                free[order as usize].clear(page);
                accounted_pages += order_size;
            }
        }
        assert_eq!(accounted_pages, num_pages);

        Self {
            allocated,
            free,
            len: num_pages,
            max_order,
        }
    }

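    // Serialized layout: max_order (1 byte), 3 bytes padding, len (u32 LE),
    // then the end offset (u32 LE) of each free bitmap followed by each
    // allocated bitmap, then the bitmap bytes themselves in the same order.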
    pub(crate) fn to_vec(&self) -> Vec<u8> {
        let mut result = vec![];
        result.push(self.max_order);
        result.extend([0u8; 3]);
        result.extend(self.len.to_le_bytes());

        let mut data_offset = result.len() + (self.max_order as usize + 1) * 2 * size_of::<u32>();
        let end_metadata = data_offset;
        for order in &self.free {
            data_offset += order.to_vec().len();
            let offset_u32: u32 = data_offset.try_into().unwrap();
            result.extend(offset_u32.to_le_bytes());
        }
        for order in &self.allocated {
            data_offset += order.to_vec().len();
            let offset_u32: u32 = data_offset.try_into().unwrap();
            result.extend(offset_u32.to_le_bytes());
        }
        assert_eq!(end_metadata, result.len());
        for order in &self.free {
            result.extend(&order.to_vec());
        }
        for order in &self.allocated {
            result.extend(&order.to_vec());
        }

        result
    }

    pub(crate) fn from_bytes(data: &[u8]) -> Self {
        let max_order = data[MAX_ORDER_OFFSET];
        let num_pages = u32::from_le_bytes(
            data[NUM_PAGES_OFFSET..(NUM_PAGES_OFFSET + size_of::<u32>())]
                .try_into()
                .unwrap(),
        );

        let mut metadata = FREE_END_OFFSETS;
        let mut data_start = FREE_END_OFFSETS + (max_order as usize + 1) * 2 * size_of::<u32>();

        let mut free = vec![];
        for _ in 0..=max_order {
            let data_end = u32::from_le_bytes(
                data[metadata..(metadata + size_of::<u32>())]
                    .try_into()
                    .unwrap(),
            ) as usize;
            free.push(BtreeBitmap::from_bytes(&data[data_start..data_end]));
            data_start = data_end;
            metadata += size_of::<u32>();
        }
        let mut allocated = vec![];
        for _ in 0..=max_order {
            let data_end = u32::from_le_bytes(
                data[metadata..(metadata + size_of::<u32>())]
                    .try_into()
                    .unwrap(),
            ) as usize;
            allocated.push(U64GroupedBitmap::from_bytes(&data[data_start..data_end]));
            data_start = data_end;
            metadata += size_of::<u32>();
        }

        Self {
            allocated,
            free,
            len: num_pages,
            max_order,
        }
    }

    #[inline]
    pub(crate) fn highest_free_order(&self) -> Option<u8> {
        (0..=self.max_order)
            .rev()
            .find(|order| self.get_order_free(*order).has_unset())
    }

    pub(crate) fn count_allocated_pages(&self) -> u32 {
        self.len() - self.count_free_pages()
    }

    pub(crate) fn count_free_pages(&self) -> u32 {
        let mut pages = 0;
        for order in 0..=self.max_order {
            pages += self.get_order_free(order).count_unset() * 2u32.pow(order.into());
        }
        pages
    }

    pub(crate) fn capacity(&self) -> u32 {
        self.get_order_free(0).capacity()
    }

    pub(crate) fn get_max_order(&self) -> u8 {
        self.max_order
    }

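    // Returns the order at which `page` (an order-0 index) is currently free in
    // the free index, if any.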
    fn find_free_order(&self, mut page: u32) -> Option<u8> {
        for order in 0..=self.max_order {
            if !self.get_order_free(order).get(page) {
                return Some(order);
            }
            page = next_higher_order(page);
        }
        None
    }

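    // Counts the contiguous run of free pages at the tail of the region.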
    pub(crate) fn trailing_free_pages(&self) -> u32 {
        let mut free_pages = 0;
        let mut next_page = self.len() - 1;
        while let Some(order) = self.find_free_order(next_page) {
            let order_size = 2u32.pow(order.into());
            free_pages += order_size;
            if order_size > next_page {
                break;
            }
            next_page -= order_size;
        }

        free_pages
    }

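    // Rebuilds an allocator from the reduced state written by
    // `make_state_for_savepoint()`: the allocated bitmaps are replayed on top
    // of a freshly constructed allocator.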
    pub(crate) fn from_savepoint_state(data: &[u8]) -> Self {
        let mut offset = 0;
        let max_order = data[offset];
        offset += 1;
        let len = u32::from_le_bytes(
            data[offset..(offset + size_of::<u32>())]
                .try_into()
                .unwrap(),
        );
        offset += size_of::<u32>();

        let mut data_start = offset + size_of::<u32>() * (max_order as usize + 1);
        let mut allocated_sets = vec![];
        for _ in 0..=max_order {
            let data_end = u32::from_le_bytes(
                data[offset..(offset + size_of::<u32>())]
                    .try_into()
                    .unwrap(),
            ) as usize;
            offset += size_of::<u32>();
            allocated_sets.push(U64GroupedBitmap::from_bytes(&data[data_start..data_end]));
            data_start = data_end;
        }
        assert_eq!(data_start, data.len());

        let mut result = Self::new(len, allocated_sets[0].capacity());

        for (order, allocated) in allocated_sets.iter().enumerate() {
            for page_number in allocated.iter() {
                result.record_alloc(page_number, order.try_into().unwrap());
            }
        }

        result
    }

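    // Reduced serialized state for savepoints: max_order (1 byte), len (u32 LE),
    // the end offset (u32 LE) of each order's allocated bitmap, then the
    // allocated bitmap bytes. The free index is not stored; it is reconstructed
    // on load.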
    pub(crate) fn make_state_for_savepoint(&self) -> Vec<u8> {
        let mut result = vec![self.max_order];
        result.extend(self.len().to_le_bytes());

        let mut data_offset = result.len() + size_of::<u32>() * (self.max_order as usize + 1);
        for order in 0..=self.max_order {
            data_offset += self.allocated[order as usize].to_vec().len();
            result.extend(u32::try_from(data_offset).unwrap().to_le_bytes());
        }

        for order in 0..=self.max_order {
            result.extend(&self.allocated[order as usize].to_vec());
        }

        result
    }

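    // Pushes onto `output` every page that is allocated in `self` but not in
    // `other`, for the given region.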
    pub(crate) fn difference(
        &self,
        region: u32,
        other: &BuddyAllocator,
        output: &mut Vec<PageNumber>,
    ) {
        let num_pages = self.len();

        for order in 0..=self.max_order {
            let other_allocated = other.get_order_allocated(order);
            let self_allocated = self.get_order_allocated(order);
            for i in self_allocated.difference(other_allocated) {
                if i >= num_pages {
                    break;
                }
                output.push(PageNumber::new(region, i, order));
            }
        }
    }

    pub(crate) fn get_allocated_pages(&self, region: u32, output: &mut Vec<PageNumber>) {
        for order in 0..=self.max_order {
            let allocated = self.get_order_allocated(order);
            for i in allocated.iter() {
                if i >= self.len() {
                    break;
                }
                output.push(PageNumber::new(region, i, order));
            }
        }

        #[cfg(test)]
        {
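            // Cross-check the allocated bitmaps against the free index: every
            // allocated page, expanded to order 0, must be exactly the set of
            // pages that are not free at any order.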
            let mut allocated_check = HashSet::new();

            for order in 0..=self.max_order {
                let allocated = self.get_order_allocated(order);
                for i in allocated.iter() {
                    if i >= self.len() {
                        break;
                    }
                    allocated_check.insert(PageNumber::new(region, i, order));
                }
            }

            let mut free_check = HashSet::new();
            for i in 0..self.len() {
                if self.find_free_order(i).is_none() {
                    free_check.insert(PageNumber::new(region, i, 0));
                }
            }

            let mut check_result = HashSet::new();
            for page in &allocated_check {
                check_result.extend(page.to_order0());
            }
            assert_eq!(free_check, check_result);
        }
    }

    pub(crate) fn len(&self) -> u32 {
        self.len
    }

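    // Grows or shrinks the allocator to `new_size` pages (within `capacity()`).
    // Growing frees the newly added pages; shrinking records the removed tail
    // as allocated so it can never be handed out.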
    pub(crate) fn resize(&mut self, new_size: u32) {
        self.debug_check_consistency();
        assert!(new_size <= self.capacity());
        if new_size > self.len() {
            let mut processed_pages = self.len();
            while processed_pages < new_size {
                let order: u8 = processed_pages.trailing_zeros().try_into().unwrap();
                let order_size = 2u32.pow(order.into());
                let page = processed_pages / order_size;
                debug_assert_eq!(processed_pages % order_size, 0);
                if order >= self.max_order || processed_pages + order_size > new_size {
                    break;
                }
                self.free_inner(page, order);
                processed_pages += order_size;
            }
            for order in (0..=self.max_order).rev() {
                let order_size = 2u32.pow(order.into());
                while processed_pages + order_size <= new_size {
                    let page = processed_pages / order_size;
                    self.free_inner(page, order);
                    processed_pages += order_size;
                }
            }
            assert_eq!(processed_pages, new_size);
            self.debug_check_consistency();
        } else {
            let mut processed_pages = new_size;
            while processed_pages < self.len() {
                let order: u8 = processed_pages.trailing_zeros().try_into().unwrap();
                if order >= self.max_order {
                    break;
                }
                let order_size = 2u32.pow(order.into());
                let page = processed_pages / order_size;
                debug_assert_eq!(processed_pages % order_size, 0);
                if processed_pages + order_size > self.len() {
                    break;
                }
                self.record_alloc_inner(page, order);
                processed_pages += order_size;
            }
            for order in (0..=self.max_order).rev() {
                let order_size = 2u32.pow(order.into());
                while processed_pages + order_size <= self.len() {
                    let page = processed_pages / order_size;
                    self.record_alloc_inner(page, order);
                    processed_pages += order_size;
                }
            }
            assert_eq!(processed_pages, self.len());
        }
        self.len = new_size;
    }

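    // Debug-only invariant check: no page may be free at more than one order,
    // and two free buddies must never be left unmerged.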
    #[allow(unused_variables)]
    fn debug_check_consistency(&self) {
        #[cfg(all(debug_assertions, not(fuzzing)))]
        {
            let mut processed = 0;
            while processed < self.len() {
                let mut found = false;
                let mut page = processed;
                for order in 0..=self.max_order {
                    let allocator = &self.free[order as usize];
                    if !allocator.get(page) {
                        assert!(!found);
                        found = true;
                    }
                    page = next_higher_order(page);
                }
                processed += 1;
            }

            for order in (0..self.max_order).rev() {
                let order_len = self.len() / (2u32.pow(order.into()));
                let allocator = &self.free[order as usize];
                for page in 0..order_len {
                    if !allocator.get(page) {
                        let buddy = buddy_page(page);
                        let buddy_allocated = allocator.get(buddy);
                        assert!(buddy_allocated, "order={order} page={page} buddy={buddy}",);
                    }
                }
            }
        }
    }

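    // Allocates a page of the given order, preferring the lowest page index
    // available across all orders, and records it as allocated.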
    pub(crate) fn alloc_lowest(&mut self, order: u8) -> Option<u32> {
        let page = self.alloc_lowest_inner(order);
        if let Some(page_number) = page {
            debug_assert!(!self.get_order_allocated(order).get(page_number));
            self.get_order_allocated_mut(order).set(page_number);
        }
        page
    }

    pub(crate) fn alloc_lowest_inner(&mut self, order: u8) -> Option<u32> {
        let mut best_index_at_order = self.alloc_inner(order)?;
        let mut best = (best_index_at_order, order);

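        // A block at a higher order may start at a lower index when measured in
        // order-`order` units, so try every higher order and keep the lowest.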
        let mut multiplier = 2;
        for i in (order + 1)..=self.max_order {
            if let Some(index) = self.alloc_inner(i) {
                let index_at_order = index * multiplier;
                if index_at_order < best_index_at_order {
                    self.free_inner(best.0, best.1);
                    best_index_at_order = index_at_order;
                    best = (index, i);
                } else {
                    self.free_inner(index, i);
                }
            }
            multiplier *= 2;
        }

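        // Split the chosen block back down to the requested order, leaving the
        // upper half free at each step.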
        while best.1 > order {
            let (best_index, best_order) = best;
            let (free1, free2) = (best_index * 2, best_index * 2 + 1);
            let allocator = self.get_order_free_mut(best_order - 1);
            debug_assert!(allocator.get(free1));
            debug_assert!(allocator.get(free2));
            allocator.clear(free2);
            best = (free1, best_order - 1);
        }
        assert_eq!(best.0, best_index_at_order);

        Some(best.0)
    }

    pub(crate) fn alloc(&mut self, order: u8) -> Option<u32> {
        let page = self.alloc_inner(order);
        if let Some(page_number) = page {
            debug_assert!(!self.get_order_allocated(order).get(page_number));
            self.get_order_allocated_mut(order).set(page_number);
        }
        page
    }

    pub(crate) fn alloc_inner(&mut self, order: u8) -> Option<u32> {
        if order > self.max_order {
            return None;
        }
        let allocator = self.get_order_free_mut(order);
        if let Some(x) = allocator.alloc() {
            Some(x)
        } else {
            let upper_page = self.alloc_inner(order + 1)?;
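            // No free block at this order: split one from the next order up,
            // keep the lower half, and mark the upper half free at this order.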
            let (free1, free2) = (upper_page * 2, upper_page * 2 + 1);
            let allocator = self.get_order_free_mut(order);
            debug_assert!(allocator.get(free1));
            debug_assert!(allocator.get(free2));
            allocator.clear(free2);

            Some(free1)
        }
    }

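    // Marks a specific page as allocated: records it in the allocated bitmap
    // for its order, then updates the free index, splitting parent blocks as
    // needed.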
    pub(crate) fn record_alloc(&mut self, page_number: u32, order: u8) {
        assert!(order <= self.max_order);
        self.get_order_allocated_mut(order).set(page_number);
        self.record_alloc_inner(page_number, order);
    }

    pub(crate) fn record_alloc_inner(&mut self, page_number: u32, order: u8) {
        let allocator = self.get_order_free_mut(order);
        if allocator.get(page_number) {
            let upper_page = next_higher_order(page_number);
            self.record_alloc_inner(upper_page, order + 1);
            let allocator = self.get_order_free_mut(order);

            let (free1, free2) = (upper_page * 2, upper_page * 2 + 1);
            debug_assert!(free1 == page_number || free2 == page_number);
            if free1 == page_number {
                allocator.clear(free2);
            } else {
                allocator.clear(free1);
            }
        } else {
            allocator.set(page_number);
        }
    }

    pub(crate) fn free(&mut self, page_number: u32, order: u8) {
        debug_assert!(self.get_order_free_mut(order).get(page_number));
        debug_assert!(
            self.get_order_allocated(order).get(page_number),
            "Attempted to free page {page_number}, order {order}, which is not allocated",
        );

        self.get_order_allocated_mut(order).clear(page_number);

        self.free_inner(page_number, order);
    }

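    // Updates only the free index: marks the block free and, when its buddy is
    // also free, merges them and frees the parent block at the next order up.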
    pub(crate) fn free_inner(&mut self, page_number: u32, order: u8) {
        if order == self.max_order {
            let allocator = self.get_order_free_mut(order);
            allocator.clear(page_number);
            return;
        }

        let allocator = self.get_order_free_mut(order);
        let buddy = buddy_page(page_number);
        if allocator.get(buddy) {
            allocator.clear(page_number);
        } else {
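            // The buddy is free too: consume it at this order and merge by
            // freeing the parent block one order up.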
            allocator.set(buddy);
            self.free_inner(next_higher_order(page_number), order + 1);
        }
    }

    pub(crate) fn is_allocated(&self, page_number: u32, order: u8) -> bool {
        self.get_order_allocated(order).get(page_number)
    }

    fn get_order_free_mut(&mut self, order: u8) -> &mut BtreeBitmap {
        &mut self.free[order as usize]
    }

    fn get_order_allocated_mut(&mut self, order: u8) -> &mut U64GroupedBitmap {
        &mut self.allocated[order as usize]
    }

    fn get_order_free(&self, order: u8) -> &BtreeBitmap {
        &self.free[order as usize]
    }

    fn get_order_allocated(&self, order: u8) -> &U64GroupedBitmap {
        &self.allocated[order as usize]
    }
}

#[cfg(test)]
mod test {
    use crate::tree_store::page_store::buddy_allocator::BuddyAllocator;

    #[test]
    fn record_alloc_buddy() {
        let num_pages = 256;
        let mut allocator = BuddyAllocator::new(num_pages, num_pages);
        assert_eq!(allocator.count_allocated_pages(), 0);

        for page in 0..num_pages {
            allocator.record_alloc(page, 0);
        }
        assert_eq!(allocator.count_allocated_pages(), num_pages);

        assert!(allocator.alloc(0).is_none());

        for page in 0..num_pages {
            allocator.free(page, 0);
        }
        assert_eq!(allocator.count_allocated_pages(), 0);
    }

    #[test]
    fn buddy_merge() {
        let num_pages = 256;
        let mut allocator = BuddyAllocator::new(num_pages, num_pages);
        assert_eq!(allocator.count_allocated_pages(), 0);

        for _ in 0..num_pages {
            allocator.alloc(0).unwrap();
        }
        for page in 0..num_pages {
            allocator.free(page, 0);
        }
        assert_eq!(allocator.count_allocated_pages(), 0);

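        // Everything should have merged back together, so order-7 allocations
        // can fill the region again.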
        for _ in 0..(num_pages / 2u32.pow(7)) {
            allocator.alloc(7).unwrap();
        }
    }

    #[test]
    fn alloc_large() {
        let num_pages = 256;
        let max_order = 7;
        let mut allocator = BuddyAllocator::new(num_pages, num_pages);
        assert_eq!(allocator.count_allocated_pages(), 0);

        let mut allocated = vec![];
        for order in 0..=max_order {
            allocated.push((allocator.alloc(order).unwrap(), order));
        }
        assert_eq!(allocator.count_allocated_pages(), num_pages - 1);

        for order in 1..=max_order {
            assert!(allocator.alloc(order).is_none());
        }

        for (page, order) in allocated {
            allocator.free(page, order);
        }
        assert_eq!(allocator.count_allocated_pages(), 0);
    }

    #[test]
    fn make_savepoint_state() {
        let num_pages = 256;
        let mut allocator = BuddyAllocator::new(num_pages, num_pages);

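        // Allocate a few pages at various orders.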
        allocator.record_alloc(7, 0);
        allocator.alloc(0).unwrap();
        allocator.alloc(1).unwrap();
        allocator.alloc(3).unwrap();
        allocator.alloc(0).unwrap();
        allocator.alloc(0).unwrap();

        let allocator2 =
            BuddyAllocator::from_savepoint_state(&allocator.make_state_for_savepoint());
        assert_eq!(allocator.to_vec(), allocator2.to_vec());
    }

    #[test]
    fn serialized_size() {
        let max_region_pages = 1024 * 1024;
        let allocator = BuddyAllocator::new(max_region_pages, max_region_pages);
        let max_region_pages = u64::from(max_region_pages);
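        // Rough size budget. A 2^20-page region has 21 orders (0..=20), so the
        // allocated bitmaps together hold about 2 * max_region_pages bits
        // (geometric series); the free index appears to add roughly 2/64 of
        // hierarchy bits per bit, and the `* 21` terms allow a few bytes of
        // per-order header/offset overhead.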
        let allocated_state_bits = 2 * max_region_pages;
        let allocated_state_bytes = allocated_state_bits / 8 + 4 * 21;

        let free_state_bits = allocated_state_bits + allocated_state_bits * 2 / 64;
        let free_state_bytes = free_state_bits / 8 + 4 * 21 * 3 + 21 * 4;
        let buddy_state_bytes = free_state_bytes + allocated_state_bytes + 21 * 2 * 4 + 8;

        assert!((allocator.to_vec().len() as u64) <= buddy_state_bytes);
    }
}