mime/
parse.rs

1#[allow(unused, deprecated)]
2use std::ascii::AsciiExt;
3use std::error::Error;
4use std::fmt;
5use std::iter::Enumerate;
6use std::str::Bytes;
7
8use super::{Mime, MimeIter, Source, ParamSource, Indexed, CHARSET, UTF_8};
9
/// The ways in which parsing a media type string can fail.
#[derive(Debug)]
pub enum ParseError {
    /// No slash (`/`) was found between the type and the subtype.
    MissingSlash,
    /// No equals sign (`=`) was found between a parameter name and its value.
    MissingEqual,
    /// A quoted parameter value was not closed with a quote (`"`).
    MissingQuote,
    /// A byte that is not allowed at its position was encountered.
    ///
    /// `pos` is the byte index inside the parsed string and `byte` is the
    /// offending byte itself.
    InvalidToken { pos: usize, byte: u8 },
}
20
21impl ParseError {
22    fn s(&self) -> &str {
23        use self::ParseError::*;
24
25        match *self {
26            MissingSlash => "a slash (/) was missing between the type and subtype",
27            MissingEqual => "an equals sign (=) was missing between a parameter and its value",
28            MissingQuote => "a quote (\") was missing from a parameter value",
29            InvalidToken { .. } => "an invalid token was encountered",
30        }
31    }
32}
33
34impl fmt::Display for ParseError {
35    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
36        if let ParseError::InvalidToken { pos, byte } = *self {
37            write!(f, "{}, {:X} at position {}", self.s(), byte, pos)
38        } else {
39            f.write_str(self.s())
40        }
41    }
42}
43
44impl Error for ParseError {
45    // Minimum Rust is 1.15, Error::description was still required then
46    #[allow(deprecated)]
47    fn description(&self) -> &str {
48        self.s()
49    }
50}
51
52impl<'a> MimeIter<'a> {
53    /// A new iterator over mimes or media types
54    pub fn new(s: &'a str) -> Self {
55        Self {
56            pos: 0,
57            source: s,
58        }
59    }
60}
61
62impl<'a> Iterator for MimeIter<'a> {
63    type Item = Result<Mime, &'a str>;
64
65    fn next(&mut self) -> Option<Self::Item> {
66        let start = self.pos;
67        let len = self.source.bytes().len();
68
69        if start >= len {
70            return None
71        }
72
73        // Try parsing the whole remaining slice, until the end
74        match parse(&self.source[start ..len]) {
75            Ok(value) => {
76                self.pos = len;
77                Some(Ok(value))
78            }
79            Err(ParseError::InvalidToken { pos, .. }) => {
80                // The first token is immediately found to be wrong by `parse`. Skip it
81                if pos == 0 {
82                    self.pos += 1;
83                    return self.next()
84                }
85                let slice = &self.source[start .. start + pos];
86                // Try parsing the longest slice (until the first invalid token)
87                return match parse(slice) {
88                    Ok(mime) => {
89                        self.pos = start + pos + 1;
90                        Some(Ok(mime))
91                    }
92                    Err(_) => {
93                        if start + pos < len {
94                            // Skip this invalid slice,
95                            // try parsing the remaining slice in the next iteration
96                            self.pos = start + pos;
97                            Some(Err(slice))
98                        } else {
99                            None
100                        }
101                    }
102                }
103            }
104            // Do not process any other error condition: the slice is malformed and
105            // no character is found to be invalid: a character is missing
106            Err(_) => None,
107        }
108    }
109}
110
111pub fn parse(s: &str) -> Result<Mime, ParseError> {
112    if s == "*/*" {
113        return Ok(::STAR_STAR);
114    }
115
116    let mut iter = s.bytes().enumerate();
117    // toplevel
118    let mut start;
119    let slash;
120    loop {
121        match iter.next() {
122            Some((_, c)) if is_token(c) => (),
123            Some((i, b'/')) if i > 0 => {
124                slash = i;
125                start = i + 1;
126                break;
127            },
128            None => return Err(ParseError::MissingSlash), // EOF and no toplevel is no Mime
129            Some((pos, byte)) => return Err(ParseError::InvalidToken {
130                pos: pos,
131                byte: byte,
132            })
133        };
134
135    }
136
137    // sublevel
138    let mut plus = None;
139    loop {
140        match iter.next() {
141            Some((i, b'+')) if i > start => {
142                plus = Some(i);
143            },
144            Some((i, b';')) if i > start => {
145                start = i;
146                break;
147            },
148            Some((_, c)) if is_token(c) => (),
149            None => {
150                return Ok(Mime {
151                    source: Source::Dynamic(s.to_ascii_lowercase()),
152                    slash: slash,
153                    plus: plus,
154                    params: ParamSource::None,
155                });
156            },
157            Some((pos, byte)) => return Err(ParseError::InvalidToken {
158                pos: pos,
159                byte: byte,
160            })
161        };
162    }
163
164    // params
165    let params = params_from_str(s, &mut iter, start)?;
166
167    let src = match params {
168        ParamSource::Utf8(_)  => s.to_ascii_lowercase(),
169        ParamSource::Custom(semicolon, ref indices) => lower_ascii_with_params(s, semicolon, indices),
170        ParamSource::None => {
171            // Chop off the empty list
172            s[..start].to_ascii_lowercase()
173        }
174    };
175
176    Ok(Mime {
177        source: Source::Dynamic(src),
178        slash: slash,
179        plus: plus,
180        params: params,
181    })
182}
183
184
185fn params_from_str(s: &str, iter: &mut Enumerate<Bytes>, mut start: usize) -> Result<ParamSource, ParseError> {
186    let semicolon = start;
187    start += 1;
188    let mut params = ParamSource::None;
189    'params: while start < s.len() {
190        let name;
191        // name
192        'name: loop {
193            match iter.next() {
194                Some((i, b' ')) if i == start => {
195                    start = i + 1;
196                    continue 'params;
197                },
198                Some((_, c)) if is_token(c) => (),
199                Some((i, b'=')) if i > start => {
200                    name = Indexed(start, i);
201                    start = i + 1;
202                    break 'name;
203                },
204                None => return Err(ParseError::MissingEqual),
205                Some((pos, byte)) => return Err(ParseError::InvalidToken {
206                    pos: pos,
207                    byte: byte,
208                }),
209            }
210        }
211
212        let value;
213        // values must be restrict-name-char or "anything goes"
214        let mut is_quoted = false;
215
216        'value: loop {
217            if is_quoted {
218                match iter.next() {
219                    Some((i, b'"')) if i > start => {
220                        value = Indexed(start, i);
221                        break 'value;
222                    },
223                    Some((_, c)) if is_restricted_quoted_char(c) => (),
224                    None => return Err(ParseError::MissingQuote),
225                    Some((pos, byte)) => return Err(ParseError::InvalidToken {
226                        pos: pos,
227                        byte: byte,
228                    }),
229                }
230            } else {
231                match iter.next() {
232                    Some((i, b'"')) if i == start => {
233                        is_quoted = true;
234                        start = i + 1;
235                    },
236                    Some((_, c)) if is_token(c) => (),
237                    Some((i, b';')) if i > start => {
238                        value = Indexed(start, i);
239                        start = i + 1;
240                        break 'value;
241                    }
242                    None => {
243                        value = Indexed(start, s.len());
244                        start = s.len();
245                        break 'value;
246                    },
247
248                    Some((pos, byte)) => return Err(ParseError::InvalidToken {
249                        pos: pos,
250                        byte: byte,
251                    }),
252                }
253            }
254        }
255
256        if is_quoted {
257            'ws: loop {
258                match iter.next() {
259                    Some((i, b';')) => {
260                        // next param
261                        start = i + 1;
262                        break 'ws;
263                    },
264                    Some((_, b' ')) => {
265                        // skip whitespace
266                    },
267                    None => {
268                        // eof
269                        start = s.len();
270                        break 'ws;
271                    },
272                    Some((pos, byte)) => return Err(ParseError::InvalidToken {
273                        pos: pos,
274                        byte: byte,
275                    }),
276                }
277            }
278        }
279
280        match params {
281            ParamSource::Utf8(i) => {
282                let i = i + 2;
283                let charset = Indexed(i, "charset".len() + i);
284                let utf8 = Indexed(charset.1 + 1, charset.1 + "utf-8".len() + 1);
285                params = ParamSource::Custom(semicolon, vec![
286                    (charset, utf8),
287                    (name, value),
288                ]);
289            },
290            ParamSource::Custom(_, ref mut vec) => {
291                vec.push((name, value));
292            },
293            ParamSource::None => {
294                if semicolon + 2 == name.0 && CHARSET == &s[name.0..name.1] {
295                    if UTF_8 == &s[value.0..value.1] {
296                        params = ParamSource::Utf8(semicolon);
297                        continue 'params;
298                    }
299                }
300                params = ParamSource::Custom(semicolon, vec![(name, value)]);
301            },
302        }
303    }
304    Ok(params)
305}
306
307fn lower_ascii_with_params(s: &str, semi: usize, params: &[(Indexed, Indexed)]) -> String {
308    let mut owned = s.to_owned();
309    owned[..semi].make_ascii_lowercase();
310
311    for &(ref name, ref value) in params {
312        owned[name.0..name.1].make_ascii_lowercase();
313        // Since we just converted this part of the string to lowercase,
314        // we can skip the `Name == &str` unicase check and do a faster
315        // memcmp instead.
316        if &owned[name.0..name.1] == CHARSET.source {
317            owned[value.0..value.1].make_ascii_lowercase();
318        }
319    }
320
321    owned
322}
323
324// From [RFC6838](http://tools.ietf.org/html/rfc6838#section-4.2):
325//
326// > All registered media types MUST be assigned top-level type and
327// > subtype names.  The combination of these names serves to uniquely
328// > identify the media type, and the subtype name facet (or the absence
329// > of one) identifies the registration tree.  Both top-level type and
330// > subtype names are case-insensitive.
331// >
332// > Type and subtype names MUST conform to the following ABNF:
333// >
334// >     type-name = restricted-name
335// >     subtype-name = restricted-name
336// >
337// >     restricted-name = restricted-name-first *126restricted-name-chars
338// >     restricted-name-first  = ALPHA / DIGIT
339// >     restricted-name-chars  = ALPHA / DIGIT / "!" / "#" /
340// >                              "$" / "&" / "-" / "^" / "_"
341// >     restricted-name-chars =/ "." ; Characters before first dot always
342// >                                  ; specify a facet name
343// >     restricted-name-chars =/ "+" ; Characters after last plus always
344// >                                  ; specify a structured syntax suffix
345
346// However, [HTTP](https://tools.ietf.org/html/rfc7231#section-3.1.1.1):
347//
348// >     media-type = type "/" subtype *( OWS ";" OWS parameter )
349// >     type       = token
350// >     subtype    = token
351// >     parameter  = token "=" ( token / quoted-string )
352//
353// Where token is defined as:
354//
355// >     token = 1*tchar
356// >     tchar = "!" / "#" / "$" / "%" / "&" / "'" / "*" / "+" / "-" / "." /
357// >        "^" / "_" / "`" / "|" / "~" / DIGIT / ALPHA
358//
// So, clearly, ¯\_(ツ)_/¯
360
// Builds a `[bool; N]` array literal from a comma-terminated list of integer
// flags: every non-zero flag becomes `true`, every zero becomes `false`.
macro_rules! byte_map {
    ($($bit:expr,)*) => {
        [$($bit != 0,)*]
    };
}
366
367static TOKEN_MAP: [bool; 256] = byte_map![
368    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
369    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
370    0, 1, 0, 1, 1, 1, 1, 1, 0, 0, 1, 1, 0, 1, 1, 0,
371    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0,
372    0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
373    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 1,
374    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
375    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 0,
376    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
377    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
378    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
379    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
380    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
381    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
382    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
383    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
384];
385
386fn is_token(c: u8) -> bool {
387    TOKEN_MAP[c as usize]
388}
389
/// Returns `true` when byte `c` is accepted inside a quoted parameter value:
/// anything except the ASCII control characters (0-31) and DEL (127).
fn is_restricted_quoted_char(c: u8) -> bool {
    !(c <= 31 || c == 127)
}
393
// Checks every entry of `TOKEN_MAP` against the `tchar` definition:
// ASCII letters, ASCII digits and a fixed set of punctuation bytes.
#[test]
fn test_lookup_tables() {
    // Every `tchar` that is neither a letter nor a digit.
    let symbols = b"!#$%&'*+-.^_`|~";

    for (index, &valid) in TOKEN_MAP.iter().enumerate() {
        let i = index as u8;
        let is_lower = b'a' <= i && i <= b'z';
        let is_upper = b'A' <= i && i <= b'Z';
        let is_digit = b'0' <= i && i <= b'9';
        let should = is_lower || is_upper || is_digit || symbols.contains(&i);
        assert_eq!(valid, should, "{:?} ({}) should be {}", i as char, i, should);
    }
}
423
// Well-formed inputs: a comma-separated pair, a single media type, and a
// media type followed by an empty parameter list.
#[test]
fn test_parse_iterator() {
    let json = parse("application/json").unwrap();

    let mut pair = MimeIter::new("application/json, application/json");
    assert_eq!(pair.next().unwrap().unwrap(), json);
    assert_eq!(pair.next().unwrap().unwrap(), json);
    assert_eq!(pair.next(), None);

    let mut single = MimeIter::new("application/json");
    assert_eq!(single.next().unwrap().unwrap(), json);
    assert_eq!(single.next(), None);

    let mut empty_params = MimeIter::new("application/json;  ");
    assert_eq!(empty_params.next().unwrap().unwrap(), json);
    assert_eq!(empty_params.next(), None);
}
439
// An invalid entry between two valid ones is reported as an error slice and
// does not stop the iteration.
#[test]
fn test_parse_iterator_invalid() {
    let json = parse("application/json").unwrap();
    let mut entries = MimeIter::new("application/json, invalid, application/json");

    assert_eq!(entries.next().unwrap().unwrap(), json);
    assert_eq!(entries.next().unwrap().unwrap_err(), "invalid");
    assert_eq!(entries.next().unwrap().unwrap(), json);
    assert_eq!(entries.next(), None);
}
448
// Input that contains no valid media type at all.
//
// A `;` (not a `,`) follows the subtype, so `text/html` is read as a
// parameter. The `/` inside it is an invalid byte, and the text in front of
// it (`application/json; text`) is a parameter without `=`, so that slice is
// reported as an error. What remains (`/html`, then `html`) has no usable
// slash, so the iterator ends.
//
// The comma-separated form `"application/json, text/html"` must not be used
// here: both of its entries are valid and are yielded as `Ok`, exactly like
// the pair in `test_parse_iterator`.
#[test]
fn test_parse_iterator_all_invalid() {
    let mut iter = MimeIter::new("application/json; text/html");
    assert_eq!(iter.next().unwrap().unwrap_err(), "application/json; text");
    assert_eq!(iter.next(), None);
}