// redb/tree_store/page_store/buddy_allocator.rs
use crate::tree_store::page_store::bitmap::{BtreeBitmap, U64GroupedBitmap};
use crate::tree_store::page_store::page_manager::MAX_MAX_PAGE_ORDER;
use crate::tree_store::PageNumber;
use std::cmp::min;
#[cfg(test)]
use std::collections::HashSet;
use std::mem::size_of;

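// Offsets into the serialized header: [max_order: u8][3 bytes padding][num_pages: u32 LE],
// followed by the end-offset tables for the free and allocated bitmaps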
const MAX_ORDER_OFFSET: usize = 0;
const PADDING: usize = 3;
const NUM_PAGES_OFFSET: usize = MAX_ORDER_OFFSET + size_of::<u8>() + PADDING;
const FREE_END_OFFSETS: usize = NUM_PAGES_OFFSET + size_of::<u32>();

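// Returns the largest order such that a block of that order (2^order pages) fits in
// `pages` pages, capped at MAX_MAX_PAGE_ORDER. Panics if `pages` is zero.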
fn calculate_usable_order(pages: u32) -> u8 {
    let max_order = (32 - pages.leading_zeros() - 1).try_into().unwrap();
    min(MAX_MAX_PAGE_ORDER, max_order)
}

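// Index of the parent block, one order up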
fn next_higher_order(page_number: u32) -> u32 {
    page_number / 2
}

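// Index of the sibling block at the same order (buddies differ only in the lowest bit)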
fn buddy_page(page_number: u32) -> u32 {
    page_number ^ 1
}

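// Buddy allocator over a region of `len` pages. For each order, `free` holds a bitmap
// in which an unset bit marks a free block of 2^order pages, and `allocated` records
// which blocks were handed out at exactly that order.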
pub(crate) struct BuddyAllocator {
    allocated: Vec<U64GroupedBitmap>,
    free: Vec<BtreeBitmap>,
    len: u32,
    max_order: u8,
}

impl BuddyAllocator {
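    // Creates an allocator with `num_pages` usable pages and room to grow to
    // `max_page_capacity` pages. All pages start free.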
    pub(crate) fn new(num_pages: u32, max_page_capacity: u32) -> Self {
        let max_order = calculate_usable_order(max_page_capacity);

        let mut pages_for_order = max_page_capacity;
        let mut free = vec![];
        let mut allocated = vec![];
        for _ in 0..=max_order {
            free.push(BtreeBitmap::new(pages_for_order));

            allocated.push(U64GroupedBitmap::new_empty(
                pages_for_order,
                pages_for_order,
            ));
            pages_for_order = next_higher_order(pages_for_order);
        }

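        // Mark the first num_pages pages as free, using the largest blocks possible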
        let mut accounted_pages = 0;
        for order in (0..=max_order).rev() {
            let order_size = 2u32.pow(order.into());
            while accounted_pages + order_size <= num_pages {
                let page = accounted_pages / order_size;
                free[order as usize].clear(page);
                accounted_pages += order_size;
            }
        }
        assert_eq!(accounted_pages, num_pages);

        Self {
            allocated,
            free,
            len: num_pages,
            max_order,
        }
    }

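    // Serialized format:
    // [max_order: u8]
    // [3 bytes of padding]
    // [len: u32 LE]
    // [(max_order + 1) end offsets (u32 LE), one per free bitmap]
    // [(max_order + 1) end offsets (u32 LE), one per allocated bitmap]
    // [free bitmap data][allocated bitmap data]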
    pub(crate) fn to_vec(&self) -> Vec<u8> {
        let mut result = vec![];
        result.push(self.max_order);
        result.extend([0u8; PADDING]);
        result.extend(self.len.to_le_bytes());

        let mut data_offset = result.len() + (self.max_order as usize + 1) * 2 * size_of::<u32>();
        let end_metadata = data_offset;
        for order in &self.free {
            data_offset += order.to_vec().len();
            let offset_u32: u32 = data_offset.try_into().unwrap();
            result.extend(offset_u32.to_le_bytes());
        }
        for order in &self.allocated {
            data_offset += order.to_vec().len();
            let offset_u32: u32 = data_offset.try_into().unwrap();
            result.extend(offset_u32.to_le_bytes());
        }
        assert_eq!(end_metadata, result.len());
        for order in &self.free {
            result.extend(&order.to_vec());
        }
        for order in &self.allocated {
            result.extend(&order.to_vec());
        }

        result
    }

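    // Deserializes state produced by to_vec()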
    pub(crate) fn from_bytes(data: &[u8]) -> Self {
        let max_order = data[MAX_ORDER_OFFSET];
        let num_pages = u32::from_le_bytes(
            data[NUM_PAGES_OFFSET..(NUM_PAGES_OFFSET + size_of::<u32>())]
                .try_into()
                .unwrap(),
        );

        let mut metadata = FREE_END_OFFSETS;
        let mut data_start = FREE_END_OFFSETS + (max_order as usize + 1) * 2 * size_of::<u32>();

        let mut free = vec![];
        for _ in 0..=max_order {
            let data_end = u32::from_le_bytes(
                data[metadata..(metadata + size_of::<u32>())]
                    .try_into()
                    .unwrap(),
            ) as usize;
            free.push(BtreeBitmap::from_bytes(&data[data_start..data_end]));
            data_start = data_end;
            metadata += size_of::<u32>();
        }
        let mut allocated = vec![];
        for _ in 0..=max_order {
            let data_end = u32::from_le_bytes(
                data[metadata..(metadata + size_of::<u32>())]
                    .try_into()
                    .unwrap(),
            ) as usize;
            allocated.push(U64GroupedBitmap::from_bytes(&data[data_start..data_end]));
            data_start = data_end;
            metadata += size_of::<u32>();
        }

        Self {
            allocated,
            free,
            len: num_pages,
            max_order,
        }
    }

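    // Returns the highest order with at least one free block, if any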
    #[inline]
    pub(crate) fn highest_free_order(&self) -> Option<u8> {
        (0..=self.max_order)
            .rev()
            .find(|order| self.get_order_free(*order).has_unset())
    }

    pub(crate) fn count_allocated_pages(&self) -> u32 {
        self.len() - self.count_free_pages()
    }

    pub(crate) fn count_free_pages(&self) -> u32 {
        let mut pages = 0;
        for order in 0..=self.max_order {
            pages += self.get_order_free(order).count_unset() * 2u32.pow(order.into());
        }
        pages
    }

    pub(crate) fn capacity(&self) -> u32 {
        self.get_order_free(0).capacity()
    }

    pub(crate) fn get_max_order(&self) -> u8 {
        self.max_order
    }

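    // Returns the order of the free block containing `page`, if any. A free page is
    // recorded only at the order of its containing block, so walk up the ancestors.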
    fn find_free_order(&self, mut page: u32) -> Option<u8> {
        for order in 0..=self.max_order {
            if !self.get_order_free(order).get(page) {
                return Some(order);
            }
            page = next_higher_order(page);
        }
        None
    }

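    // Counts the free pages at the end of the region, walking backwards block by block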
    pub(crate) fn trailing_free_pages(&self) -> u32 {
        let mut free_pages = 0;
        let mut next_page = self.len() - 1;
        while let Some(order) = self.find_free_order(next_page) {
            let order_size = 2u32.pow(order.into());
            free_pages += order_size;
            if order_size > next_page {
                break;
            }
            next_page -= order_size;
        }

        free_pages
    }

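    // Serializes only the allocated bitmaps, for storage in a savepoint. Format:
    // [max_order: u8]
    // [len: u32 LE]
    // [(max_order + 1) end offsets (u32 LE), one per allocated bitmap]
    // [allocated bitmap data]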
    pub(crate) fn make_state_for_savepoint(&self) -> Vec<u8> {
        let mut result = vec![self.max_order];
        result.extend(self.len().to_le_bytes());

        let mut data_offset = result.len() + size_of::<u32>() * (self.max_order as usize + 1);
        for order in 0..=self.max_order {
            data_offset += self.allocated[order as usize].to_vec().len();
            result.extend(u32::try_from(data_offset).unwrap().to_le_bytes());
        }

        for order in 0..=self.max_order {
            result.extend(&self.allocated[order as usize].to_vec());
        }

        result
    }

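    // Appends to `output` every page that is allocated in self but not in `other`,
    // ignoring pages beyond `other`'s length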
    pub(crate) fn difference(
        &self,
        region: u32,
        other: &BuddyAllocator,
        output: &mut Vec<PageNumber>,
    ) {
        let num_pages = other.len();

        for order in 0..=self.max_order {
            let other_allocated = other.get_order_allocated(order);
            let self_allocated = self.get_order_allocated(order);
            for i in self_allocated.difference(other_allocated) {
                if i >= num_pages {
                    break;
                }
                output.push(PageNumber::new(region, i, order));
            }
        }
    }

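    // Appends every allocated page to `output`. In tests, also cross-checks the
    // allocated bitmaps against the free bitmaps.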
    #[cfg(any(test, fuzzing))]
    pub(crate) fn get_allocated_pages(&self, region: u32, output: &mut Vec<PageNumber>) {
        for order in 0..=self.max_order {
            let allocated = self.get_order_allocated(order);
            for i in allocated.iter() {
                if i >= self.len() {
                    break;
                }
                output.push(PageNumber::new(region, i, order));
            }
        }

        #[cfg(test)]
        {
            let mut allocated_check = HashSet::new();

            for order in 0..=self.max_order {
                let allocated = self.get_order_allocated(order);
                for i in allocated.iter() {
                    if i >= self.len() {
                        break;
                    }
                    allocated_check.insert(PageNumber::new(region, i, order));
                }
            }

            let mut free_check = HashSet::new();
            for i in 0..self.len() {
                if self.find_free_order(i).is_none() {
                    free_check.insert(PageNumber::new(region, i, 0));
                }
            }

            let mut check_result = HashSet::new();
            for page in &allocated_check {
                check_result.extend(page.to_order0());
            }
            assert_eq!(free_check, check_result);
        }
    }

    pub(crate) fn len(&self) -> u32 {
        self.len
    }

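    // Grows or shrinks the allocator to `new_size` pages. Growing frees the newly
    // added pages; shrinking marks the removed pages as unavailable. `new_size` must
    // not exceed capacity().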
    pub(crate) fn resize(&mut self, new_size: u32) {
        self.debug_check_consistency();
        assert!(new_size <= self.capacity());
        if new_size > self.len() {
            let mut processed_pages = self.len();
            while processed_pages < new_size {
                let order: u8 = processed_pages.trailing_zeros().try_into().unwrap();
                let order_size = 2u32.pow(order.into());
                let page = processed_pages / order_size;
                debug_assert_eq!(processed_pages % order_size, 0);
                if order >= self.max_order || processed_pages + order_size > new_size {
                    break;
                }
                self.free_inner(page, order);
                processed_pages += order_size;
            }
            for order in (0..=self.max_order).rev() {
                let order_size = 2u32.pow(order.into());
                while processed_pages + order_size <= new_size {
                    let page = processed_pages / order_size;
                    self.free_inner(page, order);
                    processed_pages += order_size;
                }
            }
            assert_eq!(processed_pages, new_size);
            self.debug_check_consistency();
        } else {
            let mut processed_pages = new_size;
            while processed_pages < self.len() {
                let order: u8 = processed_pages.trailing_zeros().try_into().unwrap();
                if order >= self.max_order {
                    break;
                }
                let order_size = 2u32.pow(order.into());
                let page = processed_pages / order_size;
                debug_assert_eq!(processed_pages % order_size, 0);
                if processed_pages + order_size > self.len() {
                    break;
                }
                self.record_alloc_inner(page, order);
                processed_pages += order_size;
            }
            for order in (0..=self.max_order).rev() {
                let order_size = 2u32.pow(order.into());
                while processed_pages + order_size <= self.len() {
                    let page = processed_pages / order_size;
                    self.record_alloc_inner(page, order);
                    processed_pages += order_size;
                }
            }
            assert_eq!(processed_pages, self.len());
        }
        self.len = new_size;
    }

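    // Debug-only invariants: no page is free at more than one order, and no free
    // block's buddy is also free (the two would have been merged)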
    #[allow(unused_variables)]
    fn debug_check_consistency(&self) {
        #[cfg(all(debug_assertions, not(fuzzing)))]
        {
            let mut processed = 0;
            while processed < self.len() {
                let mut found = false;
                let mut page = processed;
                for order in 0..=self.max_order {
                    let allocator = &self.free[order as usize];
                    if !allocator.get(page) {
                        assert!(!found);
                        found = true;
                    }
                    page = next_higher_order(page);
                }
                processed += 1;
            }

            for order in (0..self.max_order).rev() {
                let order_len = self.len() / (2u32.pow(order.into()));
                let allocator = &self.free[order as usize];
                for page in 0..order_len {
                    if !allocator.get(page) {
                        let buddy = buddy_page(page);
                        let buddy_allocated = allocator.get(buddy);
                        assert!(buddy_allocated, "order={order} page={page} buddy={buddy}");
                    }
                }
            }
        }
    }

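    // Allocates the lowest-addressed free block of the given order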
    pub(crate) fn alloc_lowest(&mut self, order: u8) -> Option<u32> {
        let page = self.alloc_lowest_inner(order);
        if let Some(page_number) = page {
            debug_assert!(!self.get_order_allocated(order).get(page_number));
            self.get_order_allocated_mut(order).set(page_number);
        }
        page
    }

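    // Finds the free block of `order` with the lowest start address: probe every order
    // at or above the requested one, keep the best candidate (freeing the rest), then
    // split the winner back down, keeping the lower half at each step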
    pub(crate) fn alloc_lowest_inner(&mut self, order: u8) -> Option<u32> {
        let mut best_index_at_order = self.alloc_inner(order)?;
        let mut best = (best_index_at_order, order);

        let mut multiplier = 2;
        for i in (order + 1)..=self.max_order {
            if let Some(index) = self.alloc_inner(i) {
                let index_at_order = index * multiplier;
                if index_at_order < best_index_at_order {
                    self.free_inner(best.0, best.1);
                    best_index_at_order = index_at_order;
                    best = (index, i);
                } else {
                    self.free_inner(index, i);
                }
            }
            multiplier *= 2;
        }

        while best.1 > order {
            let (best_index, best_order) = best;
            let (free1, free2) = (best_index * 2, best_index * 2 + 1);
            let allocator = self.get_order_free_mut(best_order - 1);
            debug_assert!(allocator.get(free1));
            debug_assert!(allocator.get(free2));
            allocator.clear(free2);
            best = (free1, best_order - 1);
        }
        assert_eq!(best.0, best_index_at_order);

        Some(best.0)
    }

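    // Allocates a block of 2^order pages and returns its index at that order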
    pub(crate) fn alloc(&mut self, order: u8) -> Option<u32> {
        let page = self.alloc_inner(order);
        if let Some(page_number) = page {
            debug_assert!(!self.get_order_allocated(order).get(page_number));
            self.get_order_allocated_mut(order).set(page_number);
        }
        page
    }

445
446 pub(crate) fn alloc_inner(&mut self, order: u8) -> Option<u32> {
447 if order > self.max_order {
448 return None;
449 }
450 let allocator = self.get_order_free_mut(order);
451 if let Some(x) = allocator.alloc() {
452 Some(x)
453 } else {
454 let upper_page = self.alloc_inner(order + 1)?;
456 let (free1, free2) = (upper_page * 2, upper_page * 2 + 1);
457 let allocator = self.get_order_free_mut(order);
458 debug_assert!(allocator.get(free1));
459 debug_assert!(allocator.get(free2));
460 allocator.clear(free2);
461
462 Some(free1)
463 }
464 }
465
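    // Marks the given block as allocated at `order`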
    pub(crate) fn record_alloc(&mut self, page_number: u32, order: u8) {
        assert!(order <= self.max_order);
        self.get_order_allocated_mut(order).set(page_number);
        self.record_alloc_inner(page_number, order);
    }

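    // If the block is free at this order, just mark it allocated. Otherwise it must be
    // part of a larger free block: allocate the parent, then mark the sibling free.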
    pub(crate) fn record_alloc_inner(&mut self, page_number: u32, order: u8) {
        let allocator = self.get_order_free_mut(order);
        if allocator.get(page_number) {
            let upper_page = next_higher_order(page_number);
            self.record_alloc_inner(upper_page, order + 1);
            let allocator = self.get_order_free_mut(order);

            let (free1, free2) = (upper_page * 2, upper_page * 2 + 1);
            debug_assert!(free1 == page_number || free2 == page_number);
            if free1 == page_number {
                allocator.clear(free2);
            } else {
                allocator.clear(free1);
            }
        } else {
            allocator.set(page_number);
        }
    }

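    // Frees a block previously allocated at the same order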
    pub(crate) fn free(&mut self, page_number: u32, order: u8) {
        debug_assert!(self.get_order_free_mut(order).get(page_number));
        debug_assert!(
            self.get_order_allocated(order).get(page_number),
            "Attempted to free page {page_number}, order {order}, which is not allocated",
        );

        self.get_order_allocated_mut(order).clear(page_number);

        self.free_inner(page_number, order);
    }

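    // Returns the block to the free bitmap. If its buddy is also free, merge and free
    // the parent block instead.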
    pub(crate) fn free_inner(&mut self, page_number: u32, order: u8) {
        if order == self.max_order {
            let allocator = self.get_order_free_mut(order);
            allocator.clear(page_number);
            return;
        }

        let allocator = self.get_order_free_mut(order);
        let buddy = buddy_page(page_number);
        if allocator.get(buddy) {
            allocator.clear(page_number);
        } else {
            allocator.set(buddy);
            self.free_inner(next_higher_order(page_number), order + 1);
        }
    }

    pub(crate) fn is_allocated(&self, page_number: u32, order: u8) -> bool {
        self.get_order_allocated(order).get(page_number)
    }

    fn get_order_free_mut(&mut self, order: u8) -> &mut BtreeBitmap {
        &mut self.free[order as usize]
    }

    fn get_order_allocated_mut(&mut self, order: u8) -> &mut U64GroupedBitmap {
        &mut self.allocated[order as usize]
    }

    fn get_order_free(&self, order: u8) -> &BtreeBitmap {
        &self.free[order as usize]
    }

    fn get_order_allocated(&self, order: u8) -> &U64GroupedBitmap {
        &self.allocated[order as usize]
    }
}

#[cfg(test)]
mod test {
    use crate::tree_store::page_store::buddy_allocator::BuddyAllocator;

    #[test]
    fn record_alloc_buddy() {
        let num_pages = 256;
        let mut allocator = BuddyAllocator::new(num_pages, num_pages);
        assert_eq!(allocator.count_allocated_pages(), 0);

        for page in 0..num_pages {
            allocator.record_alloc(page, 0);
        }
        assert_eq!(allocator.count_allocated_pages(), num_pages);

        assert!(allocator.alloc(0).is_none());

        for page in 0..num_pages {
            allocator.free(page, 0);
        }
        assert_eq!(allocator.count_allocated_pages(), 0);
    }

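    // A minimal sketch exercising alloc_lowest (not in the original test suite): it
    // should prefer the lowest-addressed free block and reuse freed low pages
    #[test]
    fn alloc_lowest_returns_lowest() {
        let num_pages = 256;
        let mut allocator = BuddyAllocator::new(num_pages, num_pages);
        // A fresh allocator hands out pages from the start of the region
        let first = allocator.alloc_lowest(0).unwrap();
        let second = allocator.alloc_lowest(0).unwrap();
        assert_eq!(first, 0);
        assert_eq!(second, 1);
        // After freeing the lowest page, it should be handed out again
        allocator.free(first, 0);
        assert_eq!(allocator.alloc_lowest(0).unwrap(), 0);
    }
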
    #[test]
    fn buddy_merge() {
        let num_pages = 256;
        let mut allocator = BuddyAllocator::new(num_pages, num_pages);
        assert_eq!(allocator.count_allocated_pages(), 0);

        for _ in 0..num_pages {
            allocator.alloc(0).unwrap();
        }
        for page in 0..num_pages {
            allocator.free(page, 0);
        }
        assert_eq!(allocator.count_allocated_pages(), 0);

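        // Everything was freed, so the pages should have merged back into order-7
        // blocks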
        for _ in 0..(num_pages / 2u32.pow(7)) {
            allocator.alloc(7).unwrap();
        }
    }

    #[test]
    fn alloc_large() {
        let num_pages = 256;
        let max_order = 7;
        let mut allocator = BuddyAllocator::new(num_pages, num_pages);
        assert_eq!(allocator.count_allocated_pages(), 0);

        let mut allocated = vec![];
        for order in 0..=max_order {
            allocated.push((allocator.alloc(order).unwrap(), order));
        }
        assert_eq!(allocator.count_allocated_pages(), num_pages - 1);

        for order in 1..=max_order {
            assert!(allocator.alloc(order).is_none());
        }

        for (page, order) in allocated {
            allocator.free(page, order);
        }
        assert_eq!(allocator.count_allocated_pages(), 0);
    }

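    // A minimal round-trip sketch (not in the original test suite): from_bytes should
    // reconstruct the state written by to_vec
    #[test]
    fn serialize_round_trip() {
        let num_pages = 256;
        let mut allocator = BuddyAllocator::new(num_pages, num_pages);
        for order in 0..=3 {
            allocator.alloc(order).unwrap();
        }

        let restored = BuddyAllocator::from_bytes(&allocator.to_vec());
        assert_eq!(restored.len(), allocator.len());
        assert_eq!(restored.get_max_order(), allocator.get_max_order());
        assert_eq!(
            restored.count_allocated_pages(),
            allocator.count_allocated_pages()
        );
    }
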
    #[test]
    fn serialized_size() {
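        // Rough upper bound on the serialized size: the bitmaps store about two bits
        // per page summed across all orders, plus per-order headers and the offset
        // tables (the constants below assume 21 orders for a 2^20 page region)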
        let max_region_pages = 1024 * 1024;
        let allocator = BuddyAllocator::new(max_region_pages, max_region_pages);
        let max_region_pages = u64::from(max_region_pages);
        let allocated_state_bits = 2 * max_region_pages;
        let allocated_state_bytes = allocated_state_bits / 8 + 4 * 21;

        let free_state_bits = allocated_state_bits + allocated_state_bits * 2 / 64;
        let free_state_bytes = free_state_bits / 8 + 4 * 21 * 3 + 21 * 4;
        let buddy_state_bytes = free_state_bytes + allocated_state_bytes + 21 * 2 * 4 + 8;

        assert!((allocator.to_vec().len() as u64) <= buddy_state_bytes);
    }
}