heed/cookbook.rs
1//! A cookbook of examples on how to use heed. Here is the list of the different topics you can learn about:
2//!
3//! - [Decode Values on Demand](#decode-values-on-demand)
4//! - [Listing and Opening the Named Databases](#listing-and-opening-the-named-databases)
5//! - [Create Custom and Prefix Codecs](#create-custom-and-prefix-codecs)
6//! - [Change the Environment Size Dynamically](#change-the-environment-size-dynamically)
7//! - [Advanced Multithreaded Access of Entries](#advanced-multithreaded-access-of-entries)
8//!
9//! # Decode Values on Demand
10//!
11//! Sometimes, you need to iterate on the content of a database and
12//! conditionally decode the value depending on the key. You can use the
13//! [`Database::lazily_decode_data`] method to indicate this to heed.
14//!
15//! ```
16//! use std::collections::HashMap;
17//! use std::error::Error;
18//! use std::fs;
19//! use std::path::Path;
20//!
21//! use heed::types::*;
22//! use heed::{Database, EnvOpenOptions};
23//!
24//! pub type StringMap = HashMap<String, String>;
25//!
26//! fn main() -> Result<(), Box<dyn Error + Send + Sync>> {
27//! let path = Path::new("target").join("heed.mdb");
28//!
29//! fs::create_dir_all(&path)?;
30//!
31//! let env = unsafe {
32//! EnvOpenOptions::new()
33//! .map_size(1024 * 1024 * 100) // 100 MiB
34//! .open(&path)?
35//! };
36//!
37//! let mut wtxn = env.write_txn()?;
38//! let db: Database<Str, SerdeJson<StringMap>> = env.create_database(&mut wtxn, None)?;
39//!
40//! fill_with_data(&mut wtxn, db)?;
41//!
42//! // We make sure that iterating over this database will
43//! // not deserialize the values. We just want to decode
44//! // the value of the entry at index 43.
45//! for (i, result) in db.lazily_decode_data().iter(&wtxn)?.enumerate() {
46//! let (_key, lazy_value) = result?;
47//! if i == 43 {
48//! // This is where the magic happens. We receive a Lazy type
49//! // that wraps a slice of bytes. We can decode on purpose.
50//! let value = lazy_value.decode()?;
51//! assert_eq!(value.get("secret"), Some(&String::from("434343")));
52//! break;
53//! }
54//! }
55//!
56//! Ok(())
57//! }
58//!
59//! fn fill_with_data(
60//! wtxn: &mut heed::RwTxn,
61//! db: Database<Str, SerdeJson<StringMap>>,
62//! ) -> heed::Result<()> {
63//! // This represents a very big value that we only want to decode when necessary.
64//! let mut big_string_map = HashMap::new();
65//! big_string_map.insert("key1".into(), "I am a very long string".into());
66//! big_string_map.insert("key2".into(), "I am a also very long string".into());
67//!
68//! for i in 0..100 {
69//! let key = format!("{i:5}");
70//! big_string_map.insert("secret".into(), format!("{i}{i}{i}"));
71//! db.put(wtxn, &key, &big_string_map)?;
72//! }
73//! Ok(())
74//! }
75//! ```
76//!
77//! # Listing and Opening the Named Databases
78//!
79//! Sometimes it is useful to list the databases available in an environment.
80//! LMDB automatically stores their names in the unnamed database, a database that doesn't
81//! need to be created and in which you can write.
82//!
83//! Once you create new databases, after defining the [`EnvOpenOptions::max_dbs`]
84//! parameter, the names of those databases are automatically stored in the unnamed one.
85//!
86//! ```
87//! use std::error::Error;
88//! use std::fs;
89//! use std::path::Path;
90//!
91//! use heed::types::*;
92//! use heed::{Database, EnvOpenOptions};
93//!
94//! fn main() -> Result<(), Box<dyn Error>> {
95//! let env_path = Path::new("target").join("heed.mdb");
96//!
97//! fs::create_dir_all(&env_path)?;
98//!
99//! let env = unsafe {
100//! EnvOpenOptions::new()
101//! .map_size(10 * 1024 * 1024) // 10 MiB
102//! .max_dbs(3) // Number of opened databases
103//! .open(env_path)?
104//! };
105//!
106//! let rtxn = env.read_txn()?;
107//! // The database names are mixed with the user entries therefore we prefer
108//! // ignoring the values and try to open the databases one by one using the keys.
109//! let unnamed: Database<Str, DecodeIgnore> =
110//! env.open_database(&rtxn, None)?.expect("the unnamed database always exists");
111//!
112//! // The unnamed (or main) database contains the other
113//! // database names associated to empty values.
114//! for result in unnamed.iter(&rtxn)? {
115//! let (name, ()) = result?;
116//!
117//! if let Ok(Some(_db)) = env.open_database::<Str, Bytes>(&rtxn, Some(name)) {
118//! // We succeeded in opening a new database that
119//! // contains strings associated to raw bytes.
120//! }
121//! }
122//!
123//! // When opening databases in a read-only transaction
124//! // you must commit your read transaction to make your
125//! // freshly opened databases globally available.
126//! rtxn.commit()?;
127//!
128//! // If you abort (or drop) your read-only transaction
129//! // the database handle will be invalid outside
130//! // the transaction scope.
131//!
132//! Ok(())
133//! }
134//! ```
135//!
136//! # Create Custom and Prefix Codecs
137//!
138//! With heed you can store any kind of data and serialize it the way you want.
139//! To do so you'll need to create a codec by using the [`BytesEncode`] and [`BytesDecode`] traits.
140//!
141//! Now imagine that your data is lexicographically well ordered. You can now leverage
142//! the use of prefix codecs. Those are classic codecs but are only used to encode key prefixes.
143//!
144//! In this example we will store logs associated to a timestamp. By encoding the timestamp
145//! in big endian we can create a prefix codec that restricts a subset of the data. It is recommended
146//! to create codecs to encode prefixes when possible instead of using a slice of bytes.
147//!
148//! ```
149//! use std::borrow::Cow;
150//! use std::error::Error;
151//! use std::fs;
152//! use std::path::Path;
153//!
154//! use heed::types::*;
155//! use heed::{BoxedError, BytesDecode, BytesEncode, Database, EnvOpenOptions};
156//!
157//! #[derive(Debug, PartialEq, Eq)]
158//! pub enum Level {
159//! Debug,
160//! Warn,
161//! Error,
162//! }
163//!
164//! #[derive(Debug, PartialEq, Eq)]
165//! pub struct LogKey {
166//! timestamp: u32,
167//! level: Level,
168//! }
169//!
170//! pub struct LogKeyCodec;
171//!
172//! impl<'a> BytesEncode<'a> for LogKeyCodec {
173//! type EItem = LogKey;
174//!
175//! /// Encodes the u32 timestamp in big endian followed by the log level with a single byte.
176//! fn bytes_encode(log: &Self::EItem) -> Result<Cow<[u8]>, BoxedError> {
177//! let (timestamp_bytes, level_byte) = match log {
178//! LogKey { timestamp, level: Level::Debug } => (timestamp.to_be_bytes(), 0),
179//! LogKey { timestamp, level: Level::Warn } => (timestamp.to_be_bytes(), 1),
180//! LogKey { timestamp, level: Level::Error } => (timestamp.to_be_bytes(), 2),
181//! };
182//!
183//! let mut output = Vec::new();
184//! output.extend_from_slice(&timestamp_bytes);
185//! output.push(level_byte);
186//! Ok(Cow::Owned(output))
187//! }
188//! }
189//!
190//! impl<'a> BytesDecode<'a> for LogKeyCodec {
191//! type DItem = LogKey;
192//!
193//! fn bytes_decode(bytes: &'a [u8]) -> Result<Self::DItem, BoxedError> {
194//! use std::mem::size_of;
195//!
196//! let timestamp = match bytes.get(..size_of::<u32>()) {
197//! Some(bytes) => bytes.try_into().map(u32::from_be_bytes).unwrap(),
198//! None => return Err("invalid log key: cannot extract timestamp".into()),
199//! };
200//!
201//! let level = match bytes.get(size_of::<u32>()) {
202//! Some(&0) => Level::Debug,
203//! Some(&1) => Level::Warn,
204//! Some(&2) => Level::Error,
205//! Some(_) => return Err("invalid log key: invalid log level".into()),
206//! None => return Err("invalid log key: cannot extract log level".into()),
207//! };
208//!
209//! Ok(LogKey { timestamp, level })
210//! }
211//! }
212//!
213//! /// Encodes the high part of a timestamp. As it is located
214//! /// at the start of the key it can be used to only return
215//! /// the logs that appeared during a, rather long, period.
216//! pub struct LogAtHalfTimestampCodec;
217//!
218//! impl<'a> BytesEncode<'a> for LogAtHalfTimestampCodec {
219//! type EItem = u32;
220//!
221//! /// This method encodes only the prefix of the keys: in this particular case, the two high bytes of the timestamp.
222//! fn bytes_encode(half_timestamp: &Self::EItem) -> Result<Cow<[u8]>, BoxedError> {
223//! Ok(Cow::Owned(half_timestamp.to_be_bytes()[..2].to_vec()))
224//! }
225//! }
226//!
227//! impl<'a> BytesDecode<'a> for LogAtHalfTimestampCodec {
228//! type DItem = LogKey;
229//!
230//! fn bytes_decode(bytes: &'a [u8]) -> Result<Self::DItem, BoxedError> {
231//! LogKeyCodec::bytes_decode(bytes)
232//! }
233//! }
234//!
235//! fn main() -> Result<(), Box<dyn Error>> {
236//! let path = Path::new("target").join("heed.mdb");
237//!
238//! fs::create_dir_all(&path)?;
239//!
240//! let env = unsafe {
241//! EnvOpenOptions::new()
242//! .map_size(10 * 1024 * 1024) // 10 MiB
243//! .max_dbs(3000)
244//! .open(path)?
245//! };
246//!
247//! let mut wtxn = env.write_txn()?;
248//! let db: Database<LogKeyCodec, Str> = env.create_database(&mut wtxn, None)?;
249//!
250//! db.put(
251//! &mut wtxn,
252//! &LogKey { timestamp: 1608326232, level: Level::Debug },
253//! "this is a very old log",
254//! )?;
255//! db.put(
256//! &mut wtxn,
257//! &LogKey { timestamp: 1708326232, level: Level::Debug },
258//! "fibonacci was executed in 21ms",
259//! )?;
260//! db.put(&mut wtxn, &LogKey { timestamp: 1708326242, level: Level::Error }, "fibonacci crashed")?;
261//! db.put(
262//! &mut wtxn,
263//! &LogKey { timestamp: 1708326272, level: Level::Warn },
264//! "fibonacci is running since 12s",
265//! )?;
266//!
267//! // We change the way we want to read our database by changing the key codec.
268//! // In this example we can prefix search only for the logs between a period of time
269//! // (the two high bytes of the u32 timestamp).
270//! let iter = db.remap_key_type::<LogAtHalfTimestampCodec>().prefix_iter(&wtxn, &1708326232)?;
271//!
272//! // As we filtered the log for a specific
273//! // period of time we must not see the very old log.
274//! for result in iter {
275//! let (LogKey { timestamp: _, level: _ }, content) = result?;
276//! assert_ne!(content, "this is a very old log");
277//! }
278//!
279//! Ok(())
280//! }
281//! ```
282//!
283//! # Change the Environment Size Dynamically
284//!
285//! You must specify the maximum size of an LMDB environment when you open it.
286//! Environments do not dynamically increase their size for performance reasons and also to
287//! give you more control over it.
288//!
289//! Here is a simple example of how to dynamically increase the size
290//! of an environment when you detect that it is running out of space.
291//!
292//! ```
293//! use std::error::Error;
294//! use std::fs;
295//! use std::path::Path;
296//!
297//! use heed::types::*;
298//! use heed::{Database, EnvOpenOptions};
299//!
300//! fn main() -> Result<(), Box<dyn Error>> {
301//! let path = Path::new("target").join("small-space.mdb");
302//!
303//! fs::create_dir_all(&path)?;
304//!
305//! let env = unsafe {
306//! EnvOpenOptions::new()
307//! .map_size(16384) // one page
308//! .open(&path)?
309//! };
310//!
311//! let mut wtxn = env.write_txn()?;
312//! let db: Database<Str, Str> = env.create_database(&mut wtxn, None)?;
313//!
314//! // Ho! Crap! We don't have enough space in this environment...
315//! assert!(matches!(
316//! fill_with_data(&mut wtxn, db),
317//! Err(heed::Error::Mdb(heed::MdbError::MapFull))
318//! ));
319//!
320//! drop(wtxn);
321//!
322//! // We need to increase the map size and we can only do that
323//! // when no transactions are running so closing the env is easier.
324//! env.prepare_for_closing().wait();
325//!
326//! let env = unsafe {
327//! EnvOpenOptions::new()
328//! .map_size(10 * 16384) // 10 pages
329//! .open(&path)?
330//! };
331//!
332//! let mut wtxn = env.write_txn()?;
333//! let db: Database<Str, Str> = env.create_database(&mut wtxn, None)?;
334//!
335//! // We now have enough space in the env to store all of our entries.
336//! assert!(matches!(fill_with_data(&mut wtxn, db), Ok(())));
337//!
338//! Ok(())
339//! }
340//!
341//! fn fill_with_data(wtxn: &mut heed::RwTxn, db: Database<Str, Str>) -> heed::Result<()> {
342//! for i in 0..1000 {
343//! let key = i.to_string();
344//! db.put(wtxn, &key, "I am a very long string")?;
345//! }
346//! Ok(())
347//! }
348//! ```
349//!
350//! # Advanced Multithreaded Access of Entries
351//!
352//! LMDB disallows sharing cursors among threads. It is only possible to send
353//! them between threads when the heed `read-txn-no-tls` feature is enabled.
354//!
355//! This limits some use cases that require parallel access to the content of the databases
356//! to process stuff faster. This is the case of arroy, a multithreaded fast approximate
357//! neighbors search library. I wrote [an article explaining how
358//! to read entries in parallel][arroy article].
359//!
360//! It is forbidden to write in an environment while reading in it. However, it is possible
361//! to keep pointers to the values of the entries returned by LMDB. Those pointers are valid
362//! until the end of the transaction.
363//!
364//! Here is a small example on how to declare a data structure to be used in parallel across threads,
365//! safely. The unsafe part declares that the data structure can be shared between threads despite
366//! the write transaction not being `Send` nor `Sync`.
367//!
368//! [arroy article]: https://blog.kerollmops.com/multithreading-and-memory-mapping-refining-ann-performance-with-arroy
369//!
370//! ```
371//! use std::collections::HashMap;
372//! use std::error::Error;
373//! use std::fs;
374//! use std::path::Path;
375//!
376//! use heed::types::*;
377//! use heed::{Database, EnvOpenOptions, RoTxn};
378//!
379//! fn main() -> Result<(), Box<dyn Error + Send + Sync>> {
380//! let path = Path::new("target").join("heed.mdb");
381//!
382//! fs::create_dir_all(&path)?;
383//!
384//! let env = unsafe {
385//! EnvOpenOptions::new()
386//! .map_size(1024 * 1024 * 100) // 100 MiB
387//! .open(&path)?
388//! };
389//!
390//! let mut wtxn = env.write_txn()?;
391//! let db: Database<Str, Str> = env.create_database(&mut wtxn, None)?;
392//!
393//! fill_with_data(&mut wtxn, db)?;
394//!
395//! let immutable_map = ImmutableMap::from_db(&wtxn, db)?;
396//!
397//! // We can share the immutable map over multiple threads because it is Sync.
398//! // It is safe because we keep the write transaction lifetime in this type.
399//! std::thread::scope(|s| {
400//! s.spawn(|| {
401//! let value = immutable_map.get("10");
402//! assert_eq!(value, Some("I am a very long string"));
403//! });
404//! s.spawn(|| {
405//! let value = immutable_map.get("20");
406//! assert_eq!(value, Some("I am a very long string"));
407//! });
408//! });
409//!
410//! // You can see that we always have it on the main thread.
411//! // We didn't send it over threads.
412//! let value = immutable_map.get("50");
413//! assert_eq!(value, Some("I am a very long string"));
414//!
415//! Ok(())
416//! }
417//!
418//! fn fill_with_data(wtxn: &mut heed::RwTxn, db: Database<Str, Str>) -> heed::Result<()> {
419//! for i in 0..100 {
420//! let key = i.to_string();
421//! db.put(wtxn, &key, "I am a very long string")?;
422//! }
423//! Ok(())
424//! }
425//!
426//! struct ImmutableMap<'a> {
427//! map: HashMap<&'a str, &'a str>,
428//! }
429//!
430//! impl<'t> ImmutableMap<'t> {
431//! fn from_db(rtxn: &'t RoTxn, db: Database<Str, Str>) -> heed::Result<Self> {
432//! let mut map = HashMap::new();
433//! for result in db.iter(rtxn)? {
434//! let (k, v) = result?;
435//! map.insert(k, v);
436//! }
437//! Ok(ImmutableMap { map })
438//! }
439//!
440//! fn get(&self, key: &str) -> Option<&'t str> {
441//! self.map.get(key).copied()
442//! }
443//! }
444//!
445//! unsafe impl Sync for ImmutableMap<'_> {}
446//! ```
447//!
448
449// To let cargo generate doc links
450#![allow(unused_imports)]
451
452use crate::{BytesDecode, BytesEncode, Database, EnvOpenOptions};