
1use crate::{portable, CVWords, IncrementCounter, BLOCK_LEN};
2use arrayref::{array_mut_ref, array_ref};
4cfg_if::cfg_if! {
5    if #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] {
6        cfg_if::cfg_if! {
7            if #[cfg(blake3_avx512_ffi)] {
8                pub const MAX_SIMD_DEGREE: usize = 16;
9            } else {
10                pub const MAX_SIMD_DEGREE: usize = 8;
11            }
12        }
13    } else if #[cfg(blake3_neon)] {
14        pub const MAX_SIMD_DEGREE: usize = 4;
15    } else {
16        pub const MAX_SIMD_DEGREE: usize = 1;
17    }
20// There are some places where we want a static size that's equal to the
21// MAX_SIMD_DEGREE, but also at least 2. Constant contexts aren't currently
22// allowed to use cmp::max, so we have to hardcode this additional constant
23// value. Get rid of this once cmp::max is a const fn.
24cfg_if::cfg_if! {
25    if #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] {
26        cfg_if::cfg_if! {
27            if #[cfg(blake3_avx512_ffi)] {
28                pub const MAX_SIMD_DEGREE_OR_2: usize = 16;
29            } else {
30                pub const MAX_SIMD_DEGREE_OR_2: usize = 8;
31            }
32        }
33    } else if #[cfg(blake3_neon)] {
34        pub const MAX_SIMD_DEGREE_OR_2: usize = 4;
35    } else {
36        pub const MAX_SIMD_DEGREE_OR_2: usize = 2;
37    }
/// The hashing implementation to dispatch to.
///
/// `Portable` always exists. The other variants are compiled in only for the
/// targets and cfg flags named on them: the SSE2, SSE4.1 and AVX2 variants on
/// x86/x86-64, AVX-512 additionally requires the `blake3_avx512_ffi` cfg, and
/// NEON requires the `blake3_neon` cfg.
#[derive(Clone, Copy, Debug)]
pub enum Platform {
    Portable,
    #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
    SSE2,
    #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
    SSE41,
    #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
    AVX2,
    #[cfg(blake3_avx512_ffi)]
    #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
    AVX512,
    #[cfg(blake3_neon)]
    NEON,
}
56impl Platform {
57    #[allow(unreachable_code)]
58    pub fn detect() -> Self {
59        #[cfg(miri)]
60        {
61            return Platform::Portable;
62        }
64        #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
65        {
66            #[cfg(blake3_avx512_ffi)]
67            {
68                if avx512_detected() {
69                    return Platform::AVX512;
70                }
71            }
72            if avx2_detected() {
73                return Platform::AVX2;
74            }
75            if sse41_detected() {
76                return Platform::SSE41;
77            }
78            if sse2_detected() {
79                return Platform::SSE2;
80            }
81        }
82        // We don't use dynamic feature detection for NEON. If the "neon"
83        // feature is on, NEON is assumed to be supported.
84        #[cfg(blake3_neon)]
85        {
86            return Platform::NEON;
87        }
88        Platform::Portable
89    }
91    pub fn simd_degree(&self) -> usize {
92        let degree = match self {
93            Platform::Portable => 1,
94            #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
95            Platform::SSE2 => 4,
96            #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
97            Platform::SSE41 => 4,
98            #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
99            Platform::AVX2 => 8,
100            #[cfg(blake3_avx512_ffi)]
101            #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
102            Platform::AVX512 => 16,
103            #[cfg(blake3_neon)]
104            Platform::NEON => 4,
105        };
106        debug_assert!(degree <= MAX_SIMD_DEGREE);
107        degree
108    }
110    pub fn compress_in_place(
111        &self,
112        cv: &mut CVWords,
113        block: &[u8; BLOCK_LEN],
114        block_len: u8,
115        counter: u64,
116        flags: u8,
117    ) {
118        match self {
119            Platform::Portable => portable::compress_in_place(cv, block, block_len, counter, flags),
120            // Safe because detect() checked for platform support.
121            #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
122            Platform::SSE2 => unsafe {
123                crate::sse2::compress_in_place(cv, block, block_len, counter, flags)
124            },
125            // Safe because detect() checked for platform support.
126            #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
127            Platform::SSE41 | Platform::AVX2 => unsafe {
128                crate::sse41::compress_in_place(cv, block, block_len, counter, flags)
129            },
130            // Safe because detect() checked for platform support.
131            #[cfg(blake3_avx512_ffi)]
132            #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
133            Platform::AVX512 => unsafe {
134                crate::avx512::compress_in_place(cv, block, block_len, counter, flags)
135            },
136            // No NEON compress_in_place() implementation yet.
137            #[cfg(blake3_neon)]
138            Platform::NEON => portable::compress_in_place(cv, block, block_len, counter, flags),
139        }
140    }
142    pub fn compress_xof(
143        &self,
144        cv: &CVWords,
145        block: &[u8; BLOCK_LEN],
146        block_len: u8,
147        counter: u64,
148        flags: u8,
149    ) -> [u8; 64] {
150        match self {
151            Platform::Portable => portable::compress_xof(cv, block, block_len, counter, flags),
152            // Safe because detect() checked for platform support.
153            #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
154            Platform::SSE2 => unsafe {
155                crate::sse2::compress_xof(cv, block, block_len, counter, flags)
156            },
157            // Safe because detect() checked for platform support.
158            #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
159            Platform::SSE41 | Platform::AVX2 => unsafe {
160                crate::sse41::compress_xof(cv, block, block_len, counter, flags)
161            },
162            // Safe because detect() checked for platform support.
163            #[cfg(blake3_avx512_ffi)]
164            #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
165            Platform::AVX512 => unsafe {
166                crate::avx512::compress_xof(cv, block, block_len, counter, flags)
167            },
168            // No NEON compress_xof() implementation yet.
169            #[cfg(blake3_neon)]
170            Platform::NEON => portable::compress_xof(cv, block, block_len, counter, flags),
171        }
172    }
175    // ===================
176    // hash_many() applies two optimizations. The critically important
177    // optimization is the high-performance parallel SIMD hashing mode,
178    // described in detail in the spec. This more than doubles throughput per
179    // thread. Another optimization is keeping the state vectors transposed
180    // from block to block within a chunk. When state vectors are transposed
181    // after every block, there's a small but measurable performance loss.
182    // Compressing chunks with a dedicated loop avoids this.
184    pub fn hash_many<const N: usize>(
185        &self,
186        inputs: &[&[u8; N]],
187        key: &CVWords,
188        counter: u64,
189        increment_counter: IncrementCounter,
190        flags: u8,
191        flags_start: u8,
192        flags_end: u8,
193        out: &mut [u8],
194    ) {
195        match self {
196            Platform::Portable => portable::hash_many(
197                inputs,
198                key,
199                counter,
200                increment_counter,
201                flags,
202                flags_start,
203                flags_end,
204                out,
205            ),
206            // Safe because detect() checked for platform support.
207            #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
208            Platform::SSE2 => unsafe {
209                crate::sse2::hash_many(
210                    inputs,
211                    key,
212                    counter,
213                    increment_counter,
214                    flags,
215                    flags_start,
216                    flags_end,
217                    out,
218                )
219            },
220            // Safe because detect() checked for platform support.
221            #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
222            Platform::SSE41 => unsafe {
223                crate::sse41::hash_many(
224                    inputs,
225                    key,
226                    counter,
227                    increment_counter,
228                    flags,
229                    flags_start,
230                    flags_end,
231                    out,
232                )
233            },
234            // Safe because detect() checked for platform support.
235            #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
236            Platform::AVX2 => unsafe {
237                crate::avx2::hash_many(
238                    inputs,
239                    key,
240                    counter,
241                    increment_counter,
242                    flags,
243                    flags_start,
244                    flags_end,
245                    out,
246                )
247            },
248            // Safe because detect() checked for platform support.
249            #[cfg(blake3_avx512_ffi)]
250            #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
251            Platform::AVX512 => unsafe {
252                crate::avx512::hash_many(
253                    inputs,
254                    key,
255                    counter,
256                    increment_counter,
257                    flags,
258                    flags_start,
259                    flags_end,
260                    out,
261                )
262            },
263            // Assumed to be safe if the "neon" feature is on.
264            #[cfg(blake3_neon)]
265            Platform::NEON => unsafe {
266                crate::neon::hash_many(
267                    inputs,
268                    key,
269                    counter,
270                    increment_counter,
271                    flags,
272                    flags_start,
273                    flags_end,
274                    out,
275                )
276            },
277        }
278    }
280    pub fn xof_many(
281        &self,
282        cv: &CVWords,
283        block: &[u8; BLOCK_LEN],
284        block_len: u8,
285        mut counter: u64,
286        flags: u8,
287        out: &mut [u8],
288    ) {
289        debug_assert_eq!(0, out.len() % BLOCK_LEN, "whole blocks only");
290        if out.is_empty() {
291            // The current assembly implementation always outputs at least 1 block.
292            return;
293        }
294        match self {
295            // Safe because detect() checked for platform support.
296            #[cfg(blake3_avx512_ffi)]
297            #[cfg(unix)]
298            #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
299            Platform::AVX512 => unsafe {
300                crate::avx512::xof_many(cv, block, block_len, counter, flags, out)
301            },
302            _ => {
303                // For platforms without an optimized xof_many, fall back to a loop over
304                // compress_xof. This is still faster than portable code.
305                for out_block in out.chunks_exact_mut(BLOCK_LEN) {
306                    // TODO: Use array_chunks_mut here once that's stable.
307                    let out_array: &mut [u8; BLOCK_LEN] = out_block.try_into().unwrap();
308                    *out_array = self.compress_xof(cv, block, block_len, counter, flags);
309                    counter += 1;
310                }
311            }
312        }
313    }
315    // Explicit platform constructors, for benchmarks.
317    pub fn portable() -> Self {
318        Self::Portable
319    }
321    #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
322    pub fn sse2() -> Option<Self> {
323        if sse2_detected() {
324            Some(Self::SSE2)
325        } else {
326            None
327        }
328    }
330    #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
331    pub fn sse41() -> Option<Self> {
332        if sse41_detected() {
333            Some(Self::SSE41)
334        } else {
335            None
336        }
337    }
339    #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
340    pub fn avx2() -> Option<Self> {
341        if avx2_detected() {
342            Some(Self::AVX2)
343        } else {
344            None
345        }
346    }
348    #[cfg(blake3_avx512_ffi)]
349    #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
350    pub fn avx512() -> Option<Self> {
351        if avx512_detected() {
352            Some(Self::AVX512)
353        } else {
354            None
355        }
356    }
358    #[cfg(blake3_neon)]
359    pub fn neon() -> Option<Self> {
360        // Assumed to be safe if the "neon" feature is on.
361        Some(Self::NEON)
362    }
/// Reports whether AVX-512 may be used.
///
/// Note that AVX-512 is divided into multiple featuresets, and we use two of
/// them, F and VL. Both must be present.
///
/// Always `false` under Miri or with the testing-only `no_avx512` feature.
/// Otherwise `true` if both featuresets are enabled at compile time, or, when
/// the `std` feature is on, if the running CPU reports both.
#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
// The static check below returns unconditionally when it is compiled in,
// which leaves the rest of the function unreachable.
#[allow(unreachable_code)]
pub fn avx512_detected() -> bool {
    if cfg!(miri) {
        return false;
    }

    // A testing-only short-circuit.
    if cfg!(feature = "no_avx512") {
        return false;
    }
    // Static check, e.g. for building with target-cpu=native.
    #[cfg(all(target_feature = "avx512f", target_feature = "avx512vl"))]
    {
        return true;
    }
    // Dynamic check, if std is enabled.
    #[cfg(feature = "std")]
    {
        if is_x86_feature_detected!("avx512f") && is_x86_feature_detected!("avx512vl") {
            return true;
        }
    }
    false
}
/// Reports whether AVX2 may be used.
///
/// Always `false` under Miri or with the testing-only `no_avx2` feature.
/// Otherwise `true` if AVX2 is enabled at compile time, or, when the `std`
/// feature is on, if the running CPU reports it.
#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
// The static check below returns unconditionally when it is compiled in,
// which leaves the rest of the function unreachable.
#[allow(unreachable_code)]
pub fn avx2_detected() -> bool {
    if cfg!(miri) {
        return false;
    }

    // A testing-only short-circuit.
    if cfg!(feature = "no_avx2") {
        return false;
    }
    // Static check, e.g. for building with target-cpu=native.
    #[cfg(target_feature = "avx2")]
    {
        return true;
    }
    // Dynamic check, if std is enabled.
    #[cfg(feature = "std")]
    {
        if is_x86_feature_detected!("avx2") {
            return true;
        }
    }
    false
}
/// Reports whether SSE4.1 may be used.
///
/// Always `false` under Miri or with the testing-only `no_sse41` feature.
/// Otherwise `true` if SSE4.1 is enabled at compile time, or, when the `std`
/// feature is on, if the running CPU reports it.
#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
// The static check below returns unconditionally when it is compiled in,
// which leaves the rest of the function unreachable.
#[allow(unreachable_code)]
pub fn sse41_detected() -> bool {
    if cfg!(miri) {
        return false;
    }

    // A testing-only short-circuit.
    if cfg!(feature = "no_sse41") {
        return false;
    }
    // Static check, e.g. for building with target-cpu=native.
    #[cfg(target_feature = "sse4.1")]
    {
        return true;
    }
    // Dynamic check, if std is enabled.
    #[cfg(feature = "std")]
    {
        if is_x86_feature_detected!("sse4.1") {
            return true;
        }
    }
    false
}
/// Reports whether SSE2 may be used.
///
/// Always `false` under Miri or with the testing-only `no_sse2` feature.
/// Otherwise `true` if SSE2 is enabled at compile time, or, when the `std`
/// feature is on, if the running CPU reports it.
#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
// The static check below returns unconditionally when it is compiled in,
// which leaves the rest of the function unreachable.
#[allow(unreachable_code)]
pub fn sse2_detected() -> bool {
    if cfg!(miri) {
        return false;
    }

    // A testing-only short-circuit.
    if cfg!(feature = "no_sse2") {
        return false;
    }
    // Static check, e.g. for building with target-cpu=native.
    #[cfg(target_feature = "sse2")]
    {
        return true;
    }
    // Dynamic check, if std is enabled.
    #[cfg(feature = "std")]
    {
        if is_x86_feature_detected!("sse2") {
            return true;
        }
    }
    false
}
/// Decodes 32 bytes into eight `u32` words, reading each consecutive group of
/// four bytes as one little-endian word.
pub fn words_from_le_bytes_32(bytes: &[u8; 32]) -> [u32; 8] {
    let mut out = [0; 8];
    // 32 bytes split into exactly eight 4-byte chunks, one per output word.
    for (word, chunk) in out.iter_mut().zip(bytes.chunks_exact(4)) {
        *word = u32::from_le_bytes([chunk[0], chunk[1], chunk[2], chunk[3]]);
    }
    out
}
/// Decodes 64 bytes into sixteen `u32` words, reading each consecutive group
/// of four bytes as one little-endian word.
pub fn words_from_le_bytes_64(bytes: &[u8; 64]) -> [u32; 16] {
    let mut out = [0; 16];
    // 64 bytes split into exactly sixteen 4-byte chunks, one per output word.
    for (word, chunk) in out.iter_mut().zip(bytes.chunks_exact(4)) {
        *word = u32::from_le_bytes([chunk[0], chunk[1], chunk[2], chunk[3]]);
    }
    out
}
/// Encodes eight `u32` words as 32 bytes, writing each word in little-endian
/// order into its own consecutive group of four bytes.
pub fn le_bytes_from_words_32(words: &[u32; 8]) -> [u8; 32] {
    let mut out = [0; 32];
    // Each word fills exactly one 4-byte chunk of the output.
    for (chunk, word) in out.chunks_exact_mut(4).zip(words.iter()) {
        chunk.copy_from_slice(&word.to_le_bytes());
    }
    out
}
/// Encodes sixteen `u32` words as 64 bytes, writing each word in
/// little-endian order into its own consecutive group of four bytes.
pub fn le_bytes_from_words_64(words: &[u32; 16]) -> [u8; 64] {
    let mut out = [0; 64];
    // Each word fills exactly one 4-byte chunk of the output.
    for (chunk, word) in out.chunks_exact_mut(4).zip(words.iter()) {
        chunk.copy_from_slice(&word.to_le_bytes());
    }
    out
}