matchers/lib.rs
1//! Regex matchers on character and byte streams.
2//!
3//! ## Overview
4//!
//! The [`regex`] crate implements regular expression matching on strings and byte
//! arrays. However, in order to match the output of implementations of `fmt::Debug`
//! and `fmt::Display`, or of any code which writes to an instance of `fmt::Write`
//! or `io::Write`, it is necessary to first allocate a buffer, write to that
//! buffer, and then match the buffer against a regex.
10//!
11//! In cases where it is not necessary to extract substrings, but only to test whether
12//! or not output matches a regex, it is not strictly necessary to allocate and
13//! write this output to a buffer. This crate provides a simple interface on top of
14//! the lower-level [`regex-automata`] library that implements `fmt::Write` and
15//! `io::Write` for regex patterns. This may be used to test whether streaming
16//! output matches a pattern without buffering that output.
17//!
18//! Users who need to extract substrings based on a pattern or who already have
19//! buffered data should probably use the [`regex`] crate instead.
20//!
21//! ## Syntax
22//!
//! This crate uses the same [regex syntax][syntax] as the `regex-automata` crate.
24//!
25//! [`regex`]: https://crates.io/crates/regex
26//! [`regex-automata`]: https://crates.io/crates/regex-automata
27//! [syntax]: https://docs.rs/regex-automata/0.4.3/regex_automata/#syntax
28
29use std::{fmt, io, str::FromStr};
30
31pub use regex_automata::dfa::dense::BuildError;
32use regex_automata::dfa::dense::DFA;
33use regex_automata::dfa::Automaton;
34use regex_automata::util::primitives::StateID;
35use regex_automata::Anchored;
36
/// A compiled match pattern that can match multiple inputs, or return a
/// [`Matcher`] that matches a single input.
///
/// The type parameter `A` is the automaton stored by the pattern; it defaults
/// to a dense DFA backed by a `Vec<u32>`.
#[derive(Debug, Clone)]
pub struct Pattern<A = DFA<Vec<u32>>> {
    /// The compiled automaton that drives matching.
    automaton: A,
    /// The anchoring mode used to pick the start state of every matcher
    /// created from this pattern.
    anchored: Anchored,
}
46
/// A reference to a [`Pattern`] that matches a single input.
///
/// A `Matcher` records the automaton state reached by the input it has been
/// fed so far.
#[derive(Debug, Clone)]
pub struct Matcher<A = DFA<Vec<u32>>> {
    /// The automaton being driven (a borrow of the pattern's automaton when
    /// the matcher is created through `Pattern::matcher`).
    automaton: A,
    /// The automaton state reached after the input consumed so far.
    state: StateID,
}
55
56// === impl Pattern ===
57
58impl Pattern {
59 /// Returns a new `Pattern` for the given regex, or an error if the regex
60 /// was invalid.
61 ///
62 /// The returned `Pattern` will match occurances of the pattern which start
63 /// at *any* in a byte or character stream — the pattern may be preceded by
64 /// any number of non-matching characters. Essentially, it will behave as
65 /// though the regular expression started with a `.*?`, which enables a
66 /// match to appear anywhere. If this is not the desired behavior, use
67 /// [`Pattern::new_anchored`] instead.
68 ///
69 /// For example:
70 /// ```
71 /// use matchers::Pattern;
72 ///
73 /// // This pattern matches any number of `a`s followed by a `b`.
74 /// let pattern = Pattern::new("a+b").expect("regex is not invalid");
75 ///
76 /// // Of course, the pattern matches an input where the entire sequence of
77 /// // characters matches the pattern:
78 /// assert!(pattern.display_matches(&"aaaaab"));
79 ///
80 /// // And, since the pattern is unanchored, it will also match the
81 /// // sequence when it's followed by non-matching characters:
82 /// assert!(pattern.display_matches(&"hello world! aaaaab"));
83 /// ```
84 pub fn new(pattern: &str) -> Result<Self, BuildError> {
85 let automaton = DFA::new(pattern)?;
86 Ok(Pattern {
87 automaton,
88 anchored: Anchored::No,
89 })
90 }
91
92 /// Returns a new `Pattern` anchored at the beginning of the input stream,
93 /// or an error if the regex was invalid.
94 ///
95 /// The returned `Pattern` will *only* match an occurence of the pattern in
96 /// an input sequence if the first character or byte in the input matches
97 /// the pattern. If this is not the desired behavior, use [`Pattern::new`]
98 /// instead.
99 ///
100 /// For example:
101 /// ```
102 /// use matchers::Pattern;
103 ///
104 /// // This pattern matches any number of `a`s followed by a `b`.
105 /// let pattern = Pattern::new_anchored("a+b")
106 /// .expect("regex is not invalid");
107 ///
108 /// // The pattern matches an input where the entire sequence of
109 /// // characters matches the pattern:
110 /// assert!(pattern.display_matches(&"aaaaab"));
111 ///
112 /// // Since the pattern is anchored, it will *not* match an input that
113 /// // begins with non-matching characters:
114 /// assert!(!pattern.display_matches(&"hello world! aaaaab"));
115 ///
116 /// // ...however, if we create a pattern beginning with `.*?`, it will:
117 /// let pattern2 = Pattern::new_anchored(".*?a+b")
118 /// .expect("regex is not invalid");
119 /// assert!(pattern2.display_matches(&"hello world! aaaaab"));
120 /// ```
121 pub fn new_anchored(pattern: &str) -> Result<Self, BuildError> {
122 let automaton = DFA::new(pattern)?;
123 Ok(Pattern {
124 automaton,
125 anchored: Anchored::Yes,
126 })
127 }
128}
129
130impl FromStr for Pattern {
131 type Err = BuildError;
132 fn from_str(s: &str) -> Result<Self, Self::Err> {
133 Self::new(s)
134 }
135}
136
137impl<A: Automaton> Pattern<A> {
138 /// Obtains a `matcher` for this pattern.
139 ///
140 /// This conversion is useful when wanting to incrementally feed input (via
141 /// `io::Write`/`fmt::Write` to a matcher). Otherwise, the convenience methods on Pattern
142 /// suffice.
143 pub fn matcher(&self) -> Matcher<&'_ A> {
144 let config = regex_automata::util::start::Config::new().anchored(self.anchored);
145 Matcher {
146 automaton: &self.automaton,
147 state: self.automaton.start_state(&config).unwrap(),
148 }
149 }
150
151 /// Returns `true` if this pattern matches the given string.
152 #[inline]
153 pub fn matches(&self, s: &impl AsRef<str>) -> bool {
154 self.matcher().matches(s)
155 }
156
157 /// Returns `true` if this pattern matches the formatted output of the given
158 /// type implementing `fmt::Debug`.
159 ///
160 /// For example:
161 /// ```rust
162 /// use matchers::Pattern;
163 ///
164 /// #[derive(Debug)]
165 /// pub struct Hello {
166 /// to: &'static str,
167 /// }
168 ///
169 /// let pattern = Pattern::new(r#"Hello \{ to: "W[^"]*" \}"#).unwrap();
170 ///
171 /// let hello_world = Hello { to: "World" };
172 /// assert!(pattern.debug_matches(&hello_world));
173 ///
174 /// let hello_sf = Hello { to: "San Francisco" };
175 /// assert_eq!(pattern.debug_matches(&hello_sf), false);
176 ///
177 /// let hello_washington = Hello { to: "Washington" };
178 /// assert!(pattern.debug_matches(&hello_washington));
179 /// ```
180 #[inline]
181 pub fn debug_matches(&self, d: &impl fmt::Debug) -> bool {
182 self.matcher().debug_matches(d)
183 }
184
185 /// Returns `true` if this pattern matches the formatted output of the given
186 /// type implementing `fmt::Display`.
187 ///
188 /// For example:
189 /// ```rust
190 /// # use std::fmt;
191 /// use matchers::Pattern;
192 ///
193 /// #[derive(Debug)]
194 /// pub struct Hello {
195 /// to: &'static str,
196 /// }
197 ///
198 /// impl fmt::Display for Hello {
199 /// fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
200 /// write!(f, "Hello {}", self.to)
201 /// }
202 /// }
203 ///
204 /// let pattern = Pattern::new("Hello [Ww].+").unwrap();
205 ///
206 /// let hello_world = Hello { to: "world" };
207 /// assert!(pattern.display_matches(&hello_world));
208 /// assert_eq!(pattern.debug_matches(&hello_world), false);
209 ///
210 /// let hello_sf = Hello { to: "San Francisco" };
211 /// assert_eq!(pattern.display_matches(&hello_sf), false);
212 ///
213 /// let hello_washington = Hello { to: "Washington" };
214 /// assert!(pattern.display_matches(&hello_washington));
215 /// ```
216 #[inline]
217 pub fn display_matches(&self, d: &impl fmt::Display) -> bool {
218 self.matcher().display_matches(d)
219 }
220
221 /// Returns either a `bool` indicating whether or not this pattern matches the
222 /// data read from the provided `io::Read` stream, or an `io::Error` if an
223 /// error occurred reading from the stream.
224 #[inline]
225 pub fn read_matches(&self, io: impl io::Read) -> io::Result<bool> {
226 self.matcher().read_matches(io)
227 }
228}
229
230// === impl Matcher ===
231
232impl<A> Matcher<A>
233where
234 A: Automaton,
235{
236 #[inline]
237 fn advance(&mut self, input: u8) {
238 // It's safe to call `next_state_unchecked` since the matcher may
239 // only be constructed by a `Pattern`, which, in turn, can only be
240 // constructed with a valid DFA.
241 self.state = unsafe { self.automaton.next_state_unchecked(self.state, input) };
242 }
243
244 /// Returns `true` if this `Matcher` has matched any input that has been
245 /// provided.
246 #[inline]
247 pub fn is_matched(&self) -> bool {
248 let eoi_state = self.automaton.next_eoi_state(self.state);
249 self.automaton.is_match_state(eoi_state)
250 }
251
252 /// Returns `true` if this pattern matches the formatted output of the given
253 /// type implementing `fmt::Debug`.
254 pub fn matches(mut self, s: &impl AsRef<str>) -> bool {
255 for &byte in s.as_ref().as_bytes() {
256 self.advance(byte);
257 if self.automaton.is_dead_state(self.state) {
258 return false;
259 }
260 }
261 self.is_matched()
262 }
263
264 /// Returns `true` if this pattern matches the formatted output of the given
265 /// type implementing `fmt::Debug`.
266 pub fn debug_matches(mut self, d: &impl fmt::Debug) -> bool {
267 use std::fmt::Write;
268 write!(&mut self, "{:?}", d).expect("matcher write impl should not fail");
269 self.is_matched()
270 }
271
272 /// Returns `true` if this pattern matches the formatted output of the given
273 /// type implementing `fmt::Display`.
274 pub fn display_matches(mut self, d: &impl fmt::Display) -> bool {
275 use std::fmt::Write;
276 write!(&mut self, "{}", d).expect("matcher write impl should not fail");
277 self.is_matched()
278 }
279
280 /// Returns either a `bool` indicating whether or not this pattern matches the
281 /// data read from the provided `io::Read` stream, or an `io::Error` if an
282 /// error occurred reading from the stream.
283 pub fn read_matches(mut self, io: impl io::Read + Sized) -> io::Result<bool> {
284 for r in io.bytes() {
285 self.advance(r?);
286 if self.automaton.is_dead_state(self.state) {
287 return Ok(false);
288 }
289 }
290 Ok(self.is_matched())
291 }
292}
293
294impl<A: Automaton> fmt::Write for Matcher<A> {
295 fn write_str(&mut self, s: &str) -> fmt::Result {
296 for &byte in s.as_bytes() {
297 self.advance(byte);
298 if self.automaton.is_dead_state(self.state) {
299 break;
300 }
301 }
302 Ok(())
303 }
304}
305
306impl<A: Automaton> io::Write for Matcher<A> {
307 fn write(&mut self, bytes: &[u8]) -> Result<usize, io::Error> {
308 let mut i = 0;
309 for &byte in bytes {
310 self.advance(byte);
311 i += 1;
312 if self.automaton.is_dead_state(self.state) {
313 break;
314 }
315 }
316 Ok(i)
317 }
318
319 fn flush(&mut self) -> Result<(), io::Error> {
320 Ok(())
321 }
322}
323
#[cfg(test)]
mod test {
    use super::*;

    /// A string wrapper whose `Debug` and `Display` output is the raw string,
    /// so the same inputs can drive every matching entry point.
    struct Str<'a>(&'a str);

    /// An `io::Read` source over the bytes of a [`Str`].
    struct ReadStr<'a>(io::Cursor<&'a [u8]>);

    impl fmt::Debug for Str<'_> {
        fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
            write!(f, "{}", self.0)
        }
    }

    impl fmt::Display for Str<'_> {
        fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
            write!(f, "{}", self.0)
        }
    }

    impl io::Read for ReadStr<'_> {
        fn read(&mut self, buf: &mut [u8]) -> io::Result<usize> {
            self.0.read(buf)
        }
    }

    impl Str<'static> {
        fn hello_world() -> Self {
            Self::new("hello world")
        }
    }

    impl<'a> Str<'a> {
        fn new(s: &'a str) -> Self {
            Str(s)
        }

        /// Consumes the wrapper and exposes its bytes as a reader.
        fn into_reader(self) -> ReadStr<'a> {
            ReadStr(io::Cursor::new(self.0.as_bytes()))
        }
    }

    // === entry points under test ============================================
    // One adapter per way of feeding input to a pattern, so the behavioral
    // checks below can be shared between them.

    fn via_debug(pat: &Pattern, input: Str<'_>) -> bool {
        pat.debug_matches(&input)
    }

    fn via_display(pat: &Pattern, input: Str<'_>) -> bool {
        pat.display_matches(&input)
    }

    fn via_reader(pat: &Pattern, input: Str<'_>) -> bool {
        pat.read_matches(input.into_reader())
            .expect("no io error should occur")
    }

    /// Asserts that `is_match` yields `expected` for every input, naming the
    /// offending input on failure.
    fn assert_all(
        pat: &Pattern,
        is_match: impl Fn(&Pattern, Str<'_>) -> bool,
        inputs: &[&str],
        expected: bool,
    ) {
        for &input in inputs {
            assert_eq!(
                is_match(pat, Str::new(input)),
                expected,
                "unexpected match result for input {:?}",
                input
            );
        }
    }

    /// Checks two regexes that match "hello world" and one that does not,
    /// using the given constructor and entry point.
    fn check_hello_world(
        new_pattern: impl Fn(&str) -> Result<Pattern, BuildError>,
        is_match: impl Fn(&Pattern, Str<'_>) -> bool,
    ) {
        for &regex in &["hello world", "hel+o w[orl]{3}d"] {
            let pat = new_pattern(regex).unwrap();
            assert!(
                is_match(&pat, Str::hello_world()),
                "regex {:?} should match",
                regex
            );
        }

        let pat = new_pattern("goodbye world").unwrap();
        assert!(!is_match(&pat, Str::hello_world()));
    }

    fn test_debug_matches(new_pattern: impl Fn(&str) -> Result<Pattern, BuildError>) {
        check_hello_world(new_pattern, via_debug);
    }

    fn test_display_matches(new_pattern: impl Fn(&str) -> Result<Pattern, BuildError>) {
        check_hello_world(new_pattern, via_display);
    }

    fn test_reader_matches(new_pattern: impl Fn(&str) -> Result<Pattern, BuildError>) {
        check_hello_world(new_pattern, via_reader);
    }

    fn test_debug_rep_patterns(new_pattern: impl Fn(&str) -> Result<Pattern, BuildError>) {
        let pat = new_pattern("a+b").unwrap();
        assert_all(&pat, via_debug, &["ab", "aaaab", "aaaaaaaaaab"], true);
        // Input that continues past the match must not be accepted.
        assert_all(&pat, via_debug, &["b", "abb", "aaaaabb"], false);
    }

    mod anchored {
        use super::*;

        #[test]
        fn debug_matches() {
            test_debug_matches(Pattern::new_anchored)
        }

        #[test]
        fn display_matches() {
            test_display_matches(Pattern::new_anchored)
        }

        #[test]
        fn reader_matches() {
            test_reader_matches(Pattern::new_anchored)
        }

        #[test]
        fn debug_rep_patterns() {
            test_debug_rep_patterns(Pattern::new_anchored)
        }

        // === anchored behavior =============================================
        // Tests that anchored patterns match each input type only beginning at
        // the first character.
        fn test_is_anchored(f: impl Fn(&Pattern, Str<'_>) -> bool) {
            let pat = Pattern::new_anchored("a+b").unwrap();
            assert_all(&pat, &f, &["ab", "aaaab", "aaaaaaaaaab"], true);
            assert_all(&pat, &f, &["bab", "ffab", "qqqqqqqaaaaab"], false);
        }

        #[test]
        fn debug_is_anchored() {
            test_is_anchored(via_debug)
        }

        #[test]
        fn display_is_anchored() {
            test_is_anchored(via_display)
        }

        #[test]
        fn reader_is_anchored() {
            test_is_anchored(via_reader)
        }

        // === explicitly unanchored =========================================
        // Tests that if an "anchored" pattern begins with `.*?`, it matches as
        // though it was unanchored.
        fn test_explicitly_unanchored(f: impl Fn(&Pattern, Str<'_>) -> bool) {
            let pat = Pattern::new_anchored(".*?a+b").unwrap();
            assert_all(
                &pat,
                &f,
                &["ab", "aaaab", "aaaaaaaaaab", "bab", "ffab", "qqqqqqqaaaaab"],
                true,
            );
        }

        #[test]
        fn debug_explicitly_unanchored() {
            test_explicitly_unanchored(via_debug)
        }

        #[test]
        fn display_explicitly_unanchored() {
            test_explicitly_unanchored(via_display)
        }

        #[test]
        fn reader_explicitly_unanchored() {
            test_explicitly_unanchored(via_reader)
        }
    }

    mod unanchored {
        use super::*;

        #[test]
        fn debug_matches() {
            test_debug_matches(Pattern::new)
        }

        #[test]
        fn display_matches() {
            test_display_matches(Pattern::new)
        }

        #[test]
        fn reader_matches() {
            test_reader_matches(Pattern::new)
        }

        #[test]
        fn debug_rep_patterns() {
            test_debug_rep_patterns(Pattern::new)
        }

        // === unanchored behavior ===========================================
        // Tests that unanchored patterns match anywhere in the input stream.
        fn test_is_unanchored(f: impl Fn(&Pattern, Str<'_>) -> bool) {
            let pat = Pattern::new("a+b").unwrap();
            assert_all(
                &pat,
                &f,
                &["ab", "aaaab", "aaaaaaaaaab", "bab", "ffab", "qqqfqqqqaaaaab"],
                true,
            );
        }

        #[test]
        fn debug_is_unanchored() {
            test_is_unanchored(via_debug)
        }

        #[test]
        fn display_is_unanchored() {
            test_is_unanchored(via_display)
        }

        #[test]
        fn reader_is_unanchored() {
            test_is_unanchored(via_reader)
        }
    }
}