criterion_cycles_per_byte/lib.rs
//! `CyclesPerByte` measures clock cycles using the CPU's time-stamp counter instruction (`RDTSC` on x86) or the equivalent counter on other supported architectures.
2//!
3//! ```rust
4//! # fn fibonacci_slow(_: usize) {}
5//! # fn fibonacci_fast(_: usize) {}
6//! use criterion::{criterion_group, criterion_main, BenchmarkId, Criterion};
7//! use criterion_cycles_per_byte::CyclesPerByte;
8//!
9//! fn bench(c: &mut Criterion<CyclesPerByte>) {
10//! let mut group = c.benchmark_group("fibonacci");
11//!
12//! for i in 0..20 {
13//! group.bench_function(BenchmarkId::new("slow", i), |b| b.iter(|| fibonacci_slow(i)));
14//! group.bench_function(BenchmarkId::new("fast", i), |b| b.iter(|| fibonacci_fast(i)));
15//! }
16//!
17//! group.finish()
18//! }
19//!
20//! criterion_group!(
21//! name = my_bench;
22//! config = Criterion::default().with_measurement(CyclesPerByte);
23//! targets = bench
24//! );
25//! criterion_main!(my_bench);
26//! ```
27
28use criterion::{
29 measurement::{Measurement, ValueFormatter},
30 Throughput,
31};
32
/// Criterion measurement that counts CPU clock cycles instead of wall-clock time.
///
/// Cycle counts are obtained from the CPU's time-stamp / cycle counter. Cycles per byte (`cpb`)
/// is the preferred measurement for cryptographic algorithms.
///
/// The type is a stateless marker, so it is cheap to copy and can be created with
/// [`Default::default`].
#[derive(Debug, Clone, Copy, Default)]
pub struct CyclesPerByte;
36
37// WARN: does not check for the cpu feature; but we'd panic anyway so...
38#[inline(always)]
39fn cycle_counter() -> u64 {
40 #[cfg(target_arch = "x86")]
41 use core::arch::x86::*;
42 #[cfg(target_arch = "x86_64")]
43 use core::arch::x86_64::*;
44
45 unsafe {
46 cfg_if::cfg_if! {
47 if #[cfg(all(rdpru, any(target_arch = "x86_64", target_arch = "x86")))] {
48 // `LFENCE`s stop RDPRU speculation
49 let [hi, lo]: [u32; 2];
50 _mm_lfence();
51 core::arch::asm!(
52 "rdpru",
53 out("edx") hi,
54 out("eax") lo,
55 in("ecx") 1u32,
56 options(nostack, nomem, preserves_flags),
57 );
58 let ret = (u64::from(hi) << 32) | u64::from(lo);
59 _mm_lfence();
60 ret
61 } else if #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] {
62 // `LFENCE`s stop RDPRU speculation. Note that MFENCE is not needed here
63 // for reasons stated in this Linux commit message:
64 // https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=be261ffce6f1
65 _mm_lfence();
66 let ret = _rdtsc();
67 _mm_lfence();
68 ret
69 } else if #[cfg(all(target_arch = "aarch64", target_os = "linux"))] {
70 // If a aarch64 CPU, running GNU/Linux kernel, executes following instruction,
71 // it'll *probably* panic with message "illegal instruction executed", because userspace
72 // isn't allowed to execute that instruction without installing a Linux Kernel Module.
73 //
74 // I've tested the LKM @ https://github.com/jerinjacobk/armv8_pmu_cycle_counter_el0
75 // on a Raspberry Pi 4b ( i.e. ARM Cortex-A72, running kernel version 6.5.0-1006-raspi )
76 // and it works like charm. While extending support of this library for aarch64 targets,
77 // I found https://github.com/pornin/crrl#benchmarks pretty helpful.
78 let counter: u64;
79 core::arch::asm!("dsb sy", "mrs {}, pmccntr_el0", out(reg) counter);
80 counter
81 } else if #[cfg(target_arch = "loongarch64")] {
82 let counter: u64;
83 core::arch::asm!("rdtime.d {0}, $zero", out(reg) counter);
84 counter
85 } else {
86 compile_error!(
87 "criterion-cycles-per-byte currently works only on x86 or x86_64 or aarch64 or loongarch64."
88 );
89 }
90 }
91 }
92}
93
94impl Measurement for CyclesPerByte {
95 type Intermediate = u64;
96 type Value = u64;
97
98 #[inline]
99 fn start(&self) -> Self::Intermediate {
100 cycle_counter()
101 }
102
103 #[inline]
104 fn end(&self, i: Self::Intermediate) -> Self::Value {
105 cycle_counter().saturating_sub(i)
106 }
107
108 #[inline]
109 fn add(&self, v1: &Self::Value, v2: &Self::Value) -> Self::Value {
110 v1 + v2
111 }
112
113 #[inline]
114 fn zero(&self) -> Self::Value {
115 0
116 }
117
118 #[inline]
119 fn to_f64(&self, value: &Self::Value) -> f64 {
120 *value as f64
121 }
122
123 fn formatter(&self) -> &dyn ValueFormatter {
124 &CyclesPerByteFormatter
125 }
126}
127
128struct CyclesPerByteFormatter;
129
130impl ValueFormatter for CyclesPerByteFormatter {
131 fn format_value(&self, value: f64) -> String {
132 format!("{:.4} cycles", value)
133 }
134
135 fn format_throughput(&self, throughput: &Throughput, value: f64) -> String {
136 match throughput {
137 Throughput::Bytes(b) => format!("{:.4} cpb", value / *b as f64),
138 Throughput::Elements(b) => format!("{:.4} cycles/{}", value, b),
139 Throughput::BytesDecimal(b) => format!("{:.4} cpb (decimal)", value / *b as f64),
140 }
141 }
142
143 fn scale_values(&self, _typical_value: f64, _values: &mut [f64]) -> &'static str {
144 "cycles"
145 }
146
147 fn scale_throughputs(
148 &self,
149 _typical_value: f64,
150 throughput: &Throughput,
151 values: &mut [f64],
152 ) -> &'static str {
153 match throughput {
154 Throughput::Bytes(n) => {
155 for val in values {
156 *val /= *n as f64;
157 }
158 "cpb"
159 }
160 Throughput::Elements(n) => {
161 for val in values {
162 *val /= *n as f64;
163 }
164 "c/e"
165 }
166 Throughput::BytesDecimal(n) => {
167 for val in values {
168 *val /= *n as f64;
169 }
170 "cpb (decimal)"
171 }
172 }
173 }
174
175 fn scale_for_machines(&self, _values: &mut [f64]) -> &'static str {
176 "cycles"
177 }
178}