criterion_cycles_per_byte/
lib.rs

1//! `CyclesPerByte` measures clock cycles using the CPU read time-stamp counter instruction.
2//!
3//! ```rust
4//! # fn fibonacci_slow(_: usize) {}
5//! # fn fibonacci_fast(_: usize) {}
6//! use criterion::{criterion_group, criterion_main, BenchmarkId, Criterion};
7//! use criterion_cycles_per_byte::CyclesPerByte;
8//!
9//! fn bench(c: &mut Criterion<CyclesPerByte>) {
10//!     let mut group = c.benchmark_group("fibonacci");
11//!
12//!     for i in 0..20 {
13//!         group.bench_function(BenchmarkId::new("slow", i), |b| b.iter(|| fibonacci_slow(i)));
14//!         group.bench_function(BenchmarkId::new("fast", i), |b| b.iter(|| fibonacci_fast(i)));
15//!     }
16//!
17//!     group.finish()
18//! }
19//!
20//! criterion_group!(
21//!     name = my_bench;
22//!     config = Criterion::default().with_measurement(CyclesPerByte);
23//!     targets = bench
24//! );
25//! criterion_main!(my_bench);
26//! ```
27
28use criterion::{
29    measurement::{Measurement, ValueFormatter},
30    Throughput,
31};
32
/// `CyclesPerByte` measures elapsed CPU clock cycles instead of wall-clock time. `cpb` (cycles
/// per byte) is the preferred measurement for cryptographic algorithms.
///
/// The counter is read with an architecture-specific instruction: `rdtsc` (or `rdpru` when the
/// `rdpru` cfg is set) on x86/x86_64, `pmccntr_el0` on aarch64 Linux and `rdtime.d` on
/// loongarch64.
///
/// Install it on a benchmark configuration with
/// `Criterion::default().with_measurement(CyclesPerByte)`.
///
/// The type is a zero-sized marker, so it derives the common traits (`Debug`, `Clone`, `Copy`,
/// `Default`) to be freely printable, copyable and default-constructible.
#[derive(Debug, Clone, Copy, Default)]
pub struct CyclesPerByte;
36
37// WARN: does not check for the cpu feature; but we'd panic anyway so...
38#[inline(always)]
39fn cycle_counter() -> u64 {
40    #[cfg(target_arch = "x86")]
41    use core::arch::x86::*;
42    #[cfg(target_arch = "x86_64")]
43    use core::arch::x86_64::*;
44
45    unsafe {
46        cfg_if::cfg_if! {
47            if #[cfg(all(rdpru, any(target_arch = "x86_64", target_arch = "x86")))] {
48                // `LFENCE`s stop RDPRU speculation
49                let [hi, lo]: [u32; 2];
50                _mm_lfence();
51                core::arch::asm!(
52                    "rdpru",
53                    out("edx") hi,
54                    out("eax") lo,
55                    in("ecx") 1u32,
56                    options(nostack, nomem, preserves_flags),
57                );
58                let ret = (u64::from(hi) << 32) | u64::from(lo);
59                _mm_lfence();
60                ret
61            } else if #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] {
62                // `LFENCE`s stop RDPRU speculation. Note that MFENCE is not needed here
63                // for reasons stated in this Linux commit message:
64                // https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=be261ffce6f1
65                _mm_lfence();
66                let ret = _rdtsc();
67                _mm_lfence();
68                ret
69            } else if #[cfg(all(target_arch = "aarch64", target_os = "linux"))] {
70                // If a aarch64 CPU, running GNU/Linux kernel, executes following instruction,
71                // it'll *probably* panic with message "illegal instruction executed", because userspace
72                // isn't allowed to execute that instruction without installing a Linux Kernel Module.
73                //
74                // I've tested the LKM @ https://github.com/jerinjacobk/armv8_pmu_cycle_counter_el0
75                // on a Raspberry Pi 4b ( i.e. ARM Cortex-A72, running kernel version 6.5.0-1006-raspi )
76                // and it works like charm. While extending support of this library for aarch64 targets,
77                // I found https://github.com/pornin/crrl#benchmarks pretty helpful.
78                let counter: u64;
79                core::arch::asm!("dsb sy", "mrs {}, pmccntr_el0", out(reg) counter);
80                counter
81            } else if #[cfg(target_arch = "loongarch64")] {
82                let counter: u64;
83                core::arch::asm!("rdtime.d {0}, $zero", out(reg) counter);
84                counter
85            } else {
86                compile_error!(
87                    "criterion-cycles-per-byte currently works only on x86 or x86_64 or aarch64 or loongarch64."
88                );
89            }
90        }
91    }
92}
93
94impl Measurement for CyclesPerByte {
95    type Intermediate = u64;
96    type Value = u64;
97
98    #[inline]
99    fn start(&self) -> Self::Intermediate {
100        cycle_counter()
101    }
102
103    #[inline]
104    fn end(&self, i: Self::Intermediate) -> Self::Value {
105        cycle_counter().saturating_sub(i)
106    }
107
108    #[inline]
109    fn add(&self, v1: &Self::Value, v2: &Self::Value) -> Self::Value {
110        v1 + v2
111    }
112
113    #[inline]
114    fn zero(&self) -> Self::Value {
115        0
116    }
117
118    #[inline]
119    fn to_f64(&self, value: &Self::Value) -> f64 {
120        *value as f64
121    }
122
123    fn formatter(&self) -> &dyn ValueFormatter {
124        &CyclesPerByteFormatter
125    }
126}
127
128struct CyclesPerByteFormatter;
129
130impl ValueFormatter for CyclesPerByteFormatter {
131    fn format_value(&self, value: f64) -> String {
132        format!("{:.4} cycles", value)
133    }
134
135    fn format_throughput(&self, throughput: &Throughput, value: f64) -> String {
136        match throughput {
137            Throughput::Bytes(b) => format!("{:.4} cpb", value / *b as f64),
138            Throughput::Elements(b) => format!("{:.4} cycles/{}", value, b),
139            Throughput::BytesDecimal(b) => format!("{:.4} cpb (decimal)", value / *b as f64),
140        }
141    }
142
143    fn scale_values(&self, _typical_value: f64, _values: &mut [f64]) -> &'static str {
144        "cycles"
145    }
146
147    fn scale_throughputs(
148        &self,
149        _typical_value: f64,
150        throughput: &Throughput,
151        values: &mut [f64],
152    ) -> &'static str {
153        match throughput {
154            Throughput::Bytes(n) => {
155                for val in values {
156                    *val /= *n as f64;
157                }
158                "cpb"
159            }
160            Throughput::Elements(n) => {
161                for val in values {
162                    *val /= *n as f64;
163                }
164                "c/e"
165            }
166            Throughput::BytesDecimal(n) => {
167                for val in values {
168                    *val /= *n as f64;
169                }
170                "cpb (decimal)"
171            }
172        }
173    }
174
175    fn scale_for_machines(&self, _values: &mut [f64]) -> &'static str {
176        "cycles"
177    }
178}