matchers/
lib.rs

1//! Regex matchers on character and byte streams.
2//!
3//! ## Overview
4//!
//! The [`regex`] crate implements regular expression matching on strings and byte
//! arrays. However, in order to match the output of implementations of `fmt::Debug`
//! and `fmt::Display`, or of any code which writes to an instance of `fmt::Write`
//! or `io::Write`, it is necessary to first allocate a buffer, write to that
//! buffer, and then match the buffer against a regex.
10//!
11//! In cases where it is not necessary to extract substrings, but only to test whether
12//! or not output matches a regex, it is not strictly necessary to allocate and
13//! write this output to a buffer. This crate provides a simple interface on top of
14//! the lower-level [`regex-automata`] library that implements `fmt::Write` and
15//! `io::Write` for regex patterns. This may be used to test whether streaming
16//! output matches a pattern without buffering that output.
17//!
18//! Users who need to extract substrings based on a pattern or who already have
19//! buffered data should probably use the [`regex`] crate instead.
20//!
21//! ## Syntax
22//!
//! This crate uses the same [regex syntax][syntax] as the `regex-automata` crate.
24//!
25//! [`regex`]: https://crates.io/crates/regex
26//! [`regex-automata`]: https://crates.io/crates/regex-automata
27//! [syntax]: https://docs.rs/regex-automata/0.4.3/regex_automata/#syntax
28
29use std::{fmt, io, str::FromStr};
30
31pub use regex_automata::dfa::dense::BuildError;
32use regex_automata::dfa::dense::DFA;
33use regex_automata::dfa::Automaton;
34use regex_automata::util::primitives::StateID;
35use regex_automata::Anchored;
36
/// A compiled match pattern that can match multiple inputs, or return a
/// [`Matcher`] that matches a single input.
///
/// A `Pattern` stores the compiled automaton together with the anchoring mode
/// that is used to pick the start state of every matcher created from it.
#[derive(Debug, Clone)]
pub struct Pattern<A = DFA<Vec<u32>>> {
    // The compiled automaton that input bytes are run through.
    automaton: A,
    // Anchoring mode chosen by the constructor (`new` vs. `new_anchored`).
    anchored: Anchored,
}
46
/// A reference to a [`Pattern`] that matches a single input.
///
/// A `Matcher` tracks the automaton state reached so far; input is fed to it
/// incrementally, either through its `*_matches` methods or through its
/// `fmt::Write` / `io::Write` implementations.
#[derive(Debug, Clone)]
pub struct Matcher<A = DFA<Vec<u32>>> {
    // The automaton being driven (a borrow of the pattern's automaton when
    // obtained from `Pattern::matcher`).
    automaton: A,
    // The state reached after all bytes consumed so far.
    state: StateID,
}
55
56// === impl Pattern ===
57
58impl Pattern {
59    /// Returns a new `Pattern` for the given regex, or an error if the regex
60    /// was invalid.
61    ///
62    /// The returned `Pattern` will match occurances of the pattern which start
63    /// at *any* in a byte or character stream — the pattern may be preceded by
64    /// any number of non-matching characters. Essentially, it will behave as
65    /// though the regular expression started with a `.*?`, which enables a
66    /// match to appear anywhere. If this is not the desired behavior, use
67    /// [`Pattern::new_anchored`] instead.
68    ///
69    /// For example:
70    /// ```
71    /// use matchers::Pattern;
72    ///
73    /// // This pattern matches any number of `a`s followed by a `b`.
74    /// let pattern = Pattern::new("a+b").expect("regex is not invalid");
75    ///
76    /// // Of course, the pattern matches an input where the entire sequence of
77    /// // characters matches the pattern:
78    /// assert!(pattern.display_matches(&"aaaaab"));
79    ///
80    /// // And, since the pattern is unanchored, it will also match the
81    /// // sequence when it's followed by non-matching characters:
82    /// assert!(pattern.display_matches(&"hello world! aaaaab"));
83    /// ```
84    pub fn new(pattern: &str) -> Result<Self, BuildError> {
85        let automaton = DFA::new(pattern)?;
86        Ok(Pattern {
87            automaton,
88            anchored: Anchored::No,
89        })
90    }
91
92    /// Returns a new `Pattern` anchored at the beginning of the input stream,
93    /// or an error if the regex was invalid.
94    ///
95    /// The returned `Pattern` will *only* match an occurence of the pattern in
96    /// an input sequence if the first character or byte in the input matches
97    /// the pattern. If this is not the desired behavior, use [`Pattern::new`]
98    /// instead.
99    ///
100    /// For example:
101    /// ```
102    /// use matchers::Pattern;
103    ///
104    /// // This pattern matches any number of `a`s followed by a `b`.
105    /// let pattern = Pattern::new_anchored("a+b")
106    ///     .expect("regex is not invalid");
107    ///
108    /// // The pattern matches an input where the entire sequence of
109    /// // characters matches the pattern:
110    /// assert!(pattern.display_matches(&"aaaaab"));
111    ///
112    /// // Since the pattern is anchored, it will *not* match an input that
113    /// // begins with non-matching characters:
114    /// assert!(!pattern.display_matches(&"hello world! aaaaab"));
115    ///
116    /// // ...however, if we create a pattern beginning with `.*?`, it will:
117    /// let pattern2 = Pattern::new_anchored(".*?a+b")
118    ///     .expect("regex is not invalid");
119    /// assert!(pattern2.display_matches(&"hello world! aaaaab"));
120    /// ```
121    pub fn new_anchored(pattern: &str) -> Result<Self, BuildError> {
122        let automaton = DFA::new(pattern)?;
123        Ok(Pattern {
124            automaton,
125            anchored: Anchored::Yes,
126        })
127    }
128}
129
130impl FromStr for Pattern {
131    type Err = BuildError;
132    fn from_str(s: &str) -> Result<Self, Self::Err> {
133        Self::new(s)
134    }
135}
136
137impl<A: Automaton> Pattern<A> {
138    /// Obtains a `matcher` for this pattern.
139    ///
140    /// This conversion is useful when wanting to incrementally feed input (via
141    /// `io::Write`/`fmt::Write` to a matcher). Otherwise, the convenience methods on Pattern
142    /// suffice.
143    pub fn matcher(&self) -> Matcher<&'_ A> {
144        let config = regex_automata::util::start::Config::new().anchored(self.anchored);
145        Matcher {
146            automaton: &self.automaton,
147            state: self.automaton.start_state(&config).unwrap(),
148        }
149    }
150
151    /// Returns `true` if this pattern matches the given string.
152    #[inline]
153    pub fn matches(&self, s: &impl AsRef<str>) -> bool {
154        self.matcher().matches(s)
155    }
156
157    /// Returns `true` if this pattern matches the formatted output of the given
158    /// type implementing `fmt::Debug`.
159    ///
160    /// For example:
161    /// ```rust
162    /// use matchers::Pattern;
163    ///
164    /// #[derive(Debug)]
165    /// pub struct Hello {
166    ///     to: &'static str,
167    /// }
168    ///
169    /// let pattern = Pattern::new(r#"Hello \{ to: "W[^"]*" \}"#).unwrap();
170    ///
171    /// let hello_world = Hello { to: "World" };
172    /// assert!(pattern.debug_matches(&hello_world));
173    ///
174    /// let hello_sf = Hello { to: "San Francisco" };
175    /// assert_eq!(pattern.debug_matches(&hello_sf), false);
176    ///
177    /// let hello_washington = Hello { to: "Washington" };
178    /// assert!(pattern.debug_matches(&hello_washington));
179    /// ```
180    #[inline]
181    pub fn debug_matches(&self, d: &impl fmt::Debug) -> bool {
182        self.matcher().debug_matches(d)
183    }
184
185    /// Returns `true` if this pattern matches the formatted output of the given
186    /// type implementing `fmt::Display`.
187    ///
188    /// For example:
189    /// ```rust
190    /// # use std::fmt;
191    /// use matchers::Pattern;
192    ///
193    /// #[derive(Debug)]
194    /// pub struct Hello {
195    ///     to: &'static str,
196    /// }
197    ///
198    /// impl fmt::Display for Hello {
199    ///     fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
200    ///         write!(f, "Hello {}", self.to)
201    ///     }
202    /// }
203    ///
204    /// let pattern = Pattern::new("Hello [Ww].+").unwrap();
205    ///
206    /// let hello_world = Hello { to: "world" };
207    /// assert!(pattern.display_matches(&hello_world));
208    /// assert_eq!(pattern.debug_matches(&hello_world), false);
209    ///
210    /// let hello_sf = Hello { to: "San Francisco" };
211    /// assert_eq!(pattern.display_matches(&hello_sf), false);
212    ///
213    /// let hello_washington = Hello { to: "Washington" };
214    /// assert!(pattern.display_matches(&hello_washington));
215    /// ```
216    #[inline]
217    pub fn display_matches(&self, d: &impl fmt::Display) -> bool {
218        self.matcher().display_matches(d)
219    }
220
221    /// Returns either a `bool` indicating whether or not this pattern matches the
222    /// data read from the provided `io::Read` stream, or an `io::Error` if an
223    /// error occurred reading from the stream.
224    #[inline]
225    pub fn read_matches(&self, io: impl io::Read) -> io::Result<bool> {
226        self.matcher().read_matches(io)
227    }
228}
229
230// === impl Matcher ===
231
232impl<A> Matcher<A>
233where
234    A: Automaton,
235{
236    #[inline]
237    fn advance(&mut self, input: u8) {
238        // It's safe to call `next_state_unchecked` since the matcher may
239        // only be constructed by a `Pattern`, which, in turn, can only be
240        // constructed with a valid DFA.
241        self.state = unsafe { self.automaton.next_state_unchecked(self.state, input) };
242    }
243
244    /// Returns `true` if this `Matcher` has matched any input that has been
245    /// provided.
246    #[inline]
247    pub fn is_matched(&self) -> bool {
248        let eoi_state = self.automaton.next_eoi_state(self.state);
249        self.automaton.is_match_state(eoi_state)
250    }
251
252    /// Returns `true` if this pattern matches the formatted output of the given
253    /// type implementing `fmt::Debug`.
254    pub fn matches(mut self, s: &impl AsRef<str>) -> bool {
255        for &byte in s.as_ref().as_bytes() {
256            self.advance(byte);
257            if self.automaton.is_dead_state(self.state) {
258                return false;
259            }
260        }
261        self.is_matched()
262    }
263
264    /// Returns `true` if this pattern matches the formatted output of the given
265    /// type implementing `fmt::Debug`.
266    pub fn debug_matches(mut self, d: &impl fmt::Debug) -> bool {
267        use std::fmt::Write;
268        write!(&mut self, "{:?}", d).expect("matcher write impl should not fail");
269        self.is_matched()
270    }
271
272    /// Returns `true` if this pattern matches the formatted output of the given
273    /// type implementing `fmt::Display`.
274    pub fn display_matches(mut self, d: &impl fmt::Display) -> bool {
275        use std::fmt::Write;
276        write!(&mut self, "{}", d).expect("matcher write impl should not fail");
277        self.is_matched()
278    }
279
280    /// Returns either a `bool` indicating whether or not this pattern matches the
281    /// data read from the provided `io::Read` stream, or an `io::Error` if an
282    /// error occurred reading from the stream.
283    pub fn read_matches(mut self, io: impl io::Read + Sized) -> io::Result<bool> {
284        for r in io.bytes() {
285            self.advance(r?);
286            if self.automaton.is_dead_state(self.state) {
287                return Ok(false);
288            }
289        }
290        Ok(self.is_matched())
291    }
292}
293
294impl<A: Automaton> fmt::Write for Matcher<A> {
295    fn write_str(&mut self, s: &str) -> fmt::Result {
296        for &byte in s.as_bytes() {
297            self.advance(byte);
298            if self.automaton.is_dead_state(self.state) {
299                break;
300            }
301        }
302        Ok(())
303    }
304}
305
306impl<A: Automaton> io::Write for Matcher<A> {
307    fn write(&mut self, bytes: &[u8]) -> Result<usize, io::Error> {
308        let mut i = 0;
309        for &byte in bytes {
310            self.advance(byte);
311            i += 1;
312            if self.automaton.is_dead_state(self.state) {
313                break;
314            }
315        }
316        Ok(i)
317    }
318
319    fn flush(&mut self) -> Result<(), io::Error> {
320        Ok(())
321    }
322}
323
324#[cfg(test)]
325mod test {
326    use super::*;
327
328    struct Str<'a>(&'a str);
329    struct ReadStr<'a>(io::Cursor<&'a [u8]>);
330
331    impl<'a> fmt::Debug for Str<'a> {
332        fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
333            write!(f, "{}", self.0)
334        }
335    }
336
337    impl<'a> fmt::Display for Str<'a> {
338        fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
339            write!(f, "{}", self.0)
340        }
341    }
342
343    impl<'a> io::Read for ReadStr<'a> {
344        fn read(&mut self, buf: &mut [u8]) -> io::Result<usize> {
345            self.0.read(buf)
346        }
347    }
348
349    impl Str<'static> {
350        fn hello_world() -> Self {
351            Self::new("hello world")
352        }
353    }
354
355    impl<'a> Str<'a> {
356        fn new(s: &'a str) -> Self {
357            Str(s)
358        }
359
360        fn to_reader(self) -> ReadStr<'a> {
361            ReadStr(io::Cursor::new(self.0.as_bytes()))
362        }
363    }
364
365    fn test_debug_matches(new_pattern: impl Fn(&str) -> Result<Pattern, BuildError>) {
366        let pat = new_pattern("hello world").unwrap();
367        assert!(pat.debug_matches(&Str::hello_world()));
368
369        let pat = new_pattern("hel+o w[orl]{3}d").unwrap();
370        assert!(pat.debug_matches(&Str::hello_world()));
371
372        let pat = new_pattern("goodbye world").unwrap();
373        assert_eq!(pat.debug_matches(&Str::hello_world()), false);
374    }
375
376    fn test_display_matches(new_pattern: impl Fn(&str) -> Result<Pattern, BuildError>) {
377        let pat = new_pattern("hello world").unwrap();
378        assert!(pat.display_matches(&Str::hello_world()));
379
380        let pat = new_pattern("hel+o w[orl]{3}d").unwrap();
381        assert!(pat.display_matches(&Str::hello_world()));
382
383        let pat = new_pattern("goodbye world").unwrap();
384        assert_eq!(pat.display_matches(&Str::hello_world()), false);
385    }
386
387    fn test_reader_matches(new_pattern: impl Fn(&str) -> Result<Pattern, BuildError>) {
388        let pat = new_pattern("hello world").unwrap();
389        assert!(pat
390            .read_matches(Str::hello_world().to_reader())
391            .expect("no io error should occur"));
392
393        let pat = new_pattern("hel+o w[orl]{3}d").unwrap();
394        assert!(pat
395            .read_matches(Str::hello_world().to_reader())
396            .expect("no io error should occur"));
397
398        let pat = new_pattern("goodbye world").unwrap();
399        assert_eq!(
400            pat.read_matches(Str::hello_world().to_reader())
401                .expect("no io error should occur"),
402            false
403        );
404    }
405
406    fn test_debug_rep_patterns(new_pattern: impl Fn(&str) -> Result<Pattern, BuildError>) {
407        let pat = new_pattern("a+b").unwrap();
408        assert!(pat.debug_matches(&Str::new("ab")));
409        assert!(pat.debug_matches(&Str::new("aaaab")));
410        assert!(pat.debug_matches(&Str::new("aaaaaaaaaab")));
411        assert_eq!(pat.debug_matches(&Str::new("b")), false);
412        assert_eq!(pat.debug_matches(&Str::new("abb")), false);
413        assert_eq!(pat.debug_matches(&Str::new("aaaaabb")), false);
414    }
415
416    mod anchored {
417        use super::*;
418        #[test]
419        fn debug_matches() {
420            test_debug_matches(Pattern::new_anchored)
421        }
422
423        #[test]
424        fn display_matches() {
425            test_display_matches(Pattern::new_anchored)
426        }
427
428        #[test]
429        fn reader_matches() {
430            test_reader_matches(Pattern::new_anchored)
431        }
432
433        #[test]
434        fn debug_rep_patterns() {
435            test_debug_rep_patterns(Pattern::new_anchored)
436        }
437
438        // === anchored behavior =============================================
439        // Tests that anchored patterns match each input type only beginning at
440        // the first character.
441        fn test_is_anchored(f: impl Fn(&Pattern, Str) -> bool) {
442            let pat = Pattern::new_anchored("a+b").unwrap();
443            assert!(f(&pat, Str::new("ab")));
444            assert!(f(&pat, Str::new("aaaab")));
445            assert!(f(&pat, Str::new("aaaaaaaaaab")));
446            assert!(!f(&pat, Str::new("bab")));
447            assert!(!f(&pat, Str::new("ffab")));
448            assert!(!f(&pat, Str::new("qqqqqqqaaaaab")));
449        }
450
451        #[test]
452        fn debug_is_anchored() {
453            test_is_anchored(|pat, input| pat.debug_matches(&input))
454        }
455
456        #[test]
457        fn display_is_anchored() {
458            test_is_anchored(|pat, input| pat.display_matches(&input));
459        }
460
461        #[test]
462        fn reader_is_anchored() {
463            test_is_anchored(|pat, input| {
464                pat.read_matches(input.to_reader())
465                    .expect("no io error occurs")
466            });
467        }
468
469        // === explicitly unanchored =========================================
470        // Tests that if an "anchored" pattern begins with `.*?`, it matches as
471        // though it was unanchored.
472        fn test_explicitly_unanchored(f: impl Fn(&Pattern, Str) -> bool) {
473            let pat = Pattern::new_anchored(".*?a+b").unwrap();
474            assert!(f(&pat, Str::new("ab")));
475            assert!(f(&pat, Str::new("aaaab")));
476            assert!(f(&pat, Str::new("aaaaaaaaaab")));
477            assert!(f(&pat, Str::new("bab")));
478            assert!(f(&pat, Str::new("ffab")));
479            assert!(f(&pat, Str::new("qqqqqqqaaaaab")));
480        }
481
482        #[test]
483        fn debug_explicitly_unanchored() {
484            test_explicitly_unanchored(|pat, input| pat.debug_matches(&input))
485        }
486
487        #[test]
488        fn display_explicitly_unanchored() {
489            test_explicitly_unanchored(|pat, input| pat.display_matches(&input));
490        }
491
492        #[test]
493        fn reader_explicitly_unanchored() {
494            test_explicitly_unanchored(|pat, input| {
495                pat.read_matches(input.to_reader())
496                    .expect("no io error occurs")
497            });
498        }
499    }
500
501    mod unanchored {
502        use super::*;
503        #[test]
504        fn debug_matches() {
505            test_debug_matches(Pattern::new)
506        }
507
508        #[test]
509        fn display_matches() {
510            test_display_matches(Pattern::new)
511        }
512
513        #[test]
514        fn reader_matches() {
515            test_reader_matches(Pattern::new)
516        }
517
518        #[test]
519        fn debug_rep_patterns() {
520            test_debug_rep_patterns(Pattern::new)
521        }
522
523        // === anchored behavior =============================================
524        // Tests that unanchored patterns match anywhere in the input stream.
525        fn test_is_unanchored(f: impl Fn(&Pattern, Str) -> bool) {
526            let pat = Pattern::new("a+b").unwrap();
527            assert!(f(&pat, Str::new("ab")));
528            assert!(f(&pat, Str::new("aaaab")));
529            assert!(f(&pat, Str::new("aaaaaaaaaab")));
530            assert!(f(&pat, Str::new("bab")));
531            assert!(f(&pat, Str::new("ffab")));
532            assert!(f(&pat, Str::new("qqqfqqqqaaaaab")));
533        }
534
535        #[test]
536        fn debug_is_unanchored() {
537            test_is_unanchored(|pat, input| pat.debug_matches(&input))
538        }
539
540        #[test]
541        fn display_is_unanchored() {
542            test_is_unanchored(|pat, input| pat.display_matches(&input));
543        }
544
545        #[test]
546        fn reader_is_unanchored() {
547            test_is_unanchored(|pat, input| {
548                pat.read_matches(input.to_reader())
549                    .expect("no io error occurs")
550            });
551        }
552    }
553}