const_format/__ascii_case_conv/
word_iterator.rs

1use core::fmt::{self, Debug};
2
/// Runs `$code` once for every integer in the inclusive range `$start..=$end`,
/// with the current value bound to the mutable variable `$current`.
///
/// This exists because `for` loops cannot be used in const contexts, while
/// `loop`/`if` can. `$end` is evaluated exactly once, before the first iteration.
/// If `$start > $end` the body never runs.
///
/// The range may end at the maximum value of the integer type (e.g. `255u8`):
/// the loop stops *before* incrementing past `$end`, so it cannot overflow.
/// After the loop `$current` is left equal to `$end` (or `$start` if the
/// range was empty).
macro_rules! for_range_inc {
    ($current:ident in $start:expr, $end:expr => $($code:tt)*) => {
        let mut $current = $start;
        let end = $end;

        // An empty range must not execute the body at all.
        if $current <= end {
            loop {
                $($code)*

                // Check before incrementing: when `end` is the type's maximum,
                // `$current += 1` would overflow after the last iteration.
                if $current >= end {
                    break;
                }
                $current += 1;
            }
        }
    };
}
15
16use core::ops::Range;
17
/// Classification of a single byte, stored as bit flags in the wrapped `u8`.
///
/// The values in use are the associated constants declared on this type.
#[derive(Copy, Clone)]
struct ByteKind(u8);
20
21impl Debug for ByteKind {
22    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
23        f.write_str(match () {
24            _ if self.0 == Self::Other.0 => "Other",
25            _ if self.0 == Self::Number.0 => "Number",
26            _ if self.0 == Self::LowerCase.0 => "LowerCase",
27            _ if self.0 == Self::UpperCase.0 => "UpperCase",
28            _ if self.0 == Self::NonAscii.0 => "NonAscii",
29            _ => unreachable!(),
30        })
31    }
32}
33
34#[allow(non_upper_case_globals)]
35impl ByteKind {
36    const Other: Self = Self(0b0001);
37    const Number: Self = Self(0b0010);
38    const LowerCase: Self = Self(0b0100);
39    const UpperCase: Self = Self(0b1000);
40    const Alphabetic: Self = Self(Self::LowerCase.0 | Self::UpperCase.0);
41    // Assumes that non-ascii chars are mostly alphabetic,
42    // this should work out fine most of the time.
43    const NonAscii: Self = Self(0b1100);
44}
45
46impl ByteKind {
47    #[allow(dead_code)]
48    #[inline(always)]
49    pub const fn eq(self, other: Self) -> bool {
50        (self.0 & other.0) != 0
51    }
52
53    #[inline(always)]
54    pub const fn ne(self, other: Self) -> bool {
55        (self.0 & other.0) == 0
56    }
57
58    #[inline(always)]
59    pub const fn is_alphabetic(self) -> bool {
60        self.0 == Self::LowerCase.0 || self.0 == Self::UpperCase.0
61    }
62
63    pub const fn is_end_of_word(mut self, prev: Self, other: Self) -> bool {
64        if self.0 == Self::NonAscii.0 {
65            self = prev;
66        }
67
68        if self.0 == Self::UpperCase.0 {
69            other.ne(Self::Alphabetic)
70        } else {
71            self.ne(other)
72        }
73    }
74}
75
/// Cursor over a byte slice; each call to `next` yields the byte range of
/// one word together with the advanced cursor.
#[derive(Debug, Copy, Clone)]
pub(crate) struct WordIterator<'a> {
    /// The bytes that have not been consumed yet.
    bytes: &'a [u8],
    /// Number of bytes consumed so far, i.e. the offset of `bytes` within
    /// the slice that was passed to `new`.
    start: usize,
}
81
82const BYTE_KIND: &[ByteKind; 256] = &{
83    let mut out = [ByteKind::NonAscii; 256];
84
85    // Make sure that this goes first
86    for_range_inc! {i in 0, 127 => out[i as usize] = ByteKind::Other; }
87    for_range_inc! {i in b'A', b'Z' => out[i as usize] = ByteKind::UpperCase; }
88    for_range_inc! {i in b'a', b'z' => out[i as usize] = ByteKind::LowerCase; }
89    for_range_inc! {i in b'0', b'9' => out[i as usize] = ByteKind::Number; }
90
91    out
92};
93
94impl<'a> WordIterator<'a> {
95    pub(crate) const fn new(bytes: &'a [u8]) -> Self {
96        Self { bytes, start: 0 }
97    }
98
99    const fn skip_same_kind(mut self, mut kind: ByteKind) -> (Self, ByteKind) {
100        let orig_bytes_len = self.bytes.len();
101
102        let mut prev_kind = kind;
103        while let [b, rem @ ..] = self.bytes {
104            let next_kind = BYTE_KIND[*b as usize];
105            let cmp = kind.is_end_of_word(prev_kind, next_kind);
106            if kind.is_alphabetic() {
107                prev_kind = kind;
108            }
109            kind = next_kind;
110            if cmp {
111                break;
112            }
113            self.bytes = rem;
114        }
115
116        // Advance until a char boundary is found
117        while let [b, rem @ ..] = self.bytes {
118            if (*b as i8) >= -0x40 {
119                break;
120            }
121            self.bytes = rem;
122        }
123
124        // Remember not to add return statements to the function
125        self.start += orig_bytes_len - self.bytes.len();
126
127        (self, kind)
128    }
129
130    pub(crate) const fn next(self) -> Option<(Self, Range<usize>)> {
131        let (this, fkind) = self.skip_same_kind(ByteKind::Other);
132        if let [] = this.bytes {
133            None
134        } else {
135            let (next, _) = this.skip_same_kind(fkind);
136            let range = this.start..next.start;
137            Some((next, range))
138        }
139    }
140}
141
#[cfg(test)]
mod tests {
    use super::*;

    use arrayvec::ArrayVec;

    /// Runs a `WordIterator` over `text` and collects every word it yields,
    /// in order, as string slices of `text`.
    fn get_words(text: &str) -> ArrayVec<[&str; 20]> {
        let mut cursor = WordIterator::new(text.as_bytes());
        let mut words = ArrayVec::<[&str; 20]>::new();

        while let Some((advanced, span)) = cursor.next() {
            words.push(&text[span]);
            cursor = advanced;
        }

        words
    }

    #[test]
    fn test_word_iter() {
        // Each entry pairs an input with the words expected from it.
        let cases: &[(&str, &[&str])] = &[
            (
                "01934324ñmaniÑNnFooBar",
                &["01934324", "ñmaniÑ", "Nn", "Foo", "Bar"],
            ),
            (
                "01934 324  ñmani-嶲Nn____FOOOBar",
                &["01934", "324", "ñmani", "嶲Nn", "FOOOBar"],
            ),
            ("    01934 1111 ", &["01934", "1111"]),
            ("    嶲01934 ", &["嶲", "01934"]),
            ("    嶲A01934 ", &["嶲A", "01934"]),
            ("    嶲a01934 ", &["嶲a", "01934"]),
            ("    ñA01934 ", &["ñA", "01934"]),
            ("    ña01934 ", &["ña", "01934"]),
        ];

        for &(text, expected) in cases {
            assert_eq!(get_words(text)[..], expected[..]);
        }
    }
}