// proptest/arbitrary/_std/string.rs
//-
// Copyright 2017, 2018 The proptest developers
//
// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
// http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
// option. This file may not be copied, modified, or distributed
// except according to those terms.
//! Arbitrary implementations for `std::string`.
use crate::std_facade::{Box, String, Vec};
use std::iter;
use std::rc::Rc;
use std::slice;
use std::sync::Arc;
multiplex_alloc! {
alloc::string::FromUtf8Error, ::std::string::FromUtf8Error,
alloc::string::FromUtf16Error, ::std::string::FromUtf16Error
}
use crate::arbitrary::*;
use crate::collection;
use crate::strategy::statics::static_map;
use crate::strategy::*;
use crate::string::StringParam;
impl Arbitrary for String {
    /// Parameters are supplied as a `StringParam`.
    type Parameters = StringParam;
    /// The strategy is the `&'static str` obtained from the parameters.
    type Strategy = &'static str;

    /// Converts the supplied parameters into the string strategy.
    ///
    /// ## Panics
    ///
    /// This implementation panics if the input is not a valid regex proptest
    /// can handle.
    fn arbitrary_with(args: Self::Parameters) -> Self::Strategy {
        Into::<Self::Strategy>::into(args)
    }
}
// Implements `Arbitrary` for each listed wrapper applied to `str`
// (e.g. `Box<str>`): an arbitrary `String` is generated with the given
// `StringParam` and then converted into the wrapper via `prop_map_into`.
macro_rules! dst_wrapped {
    ($($wrapper:ident),*) => {
        $(
            arbitrary!(
                $wrapper<str>, MapInto<StrategyFor<String>, Self>, StringParam;
                params => any_with::<String>(params).prop_map_into()
            );
        )*
    };
}

dst_wrapped!(Box, Rc, Arc);
// 0xD800 is a lone high surrogate, which is not valid UTF-16 on its own, so
// decoding it always fails and yields the `FromUtf16Error` we want.
lazy_just!(FromUtf16Error, || {
    let lone_surrogate = [0xD800u16];
    String::from_utf16(&lone_surrogate).unwrap_err()
});
// `ParseError` is a void-like type, so no strategy is provided for it here.
// It needs to be handled by the user of the type: for enums, by simply never
// constructing the variant that contains it; for structs, by inductively not
// generating the struct.
// The same applies to `!` and `Infallible`.
// generator!(ParseError, || panic!());
// A `FromUtf8Error` is obtained by generating bytes that are known not to be
// valid UTF-8 (NUL bytes allowed) and attempting to decode them.
arbitrary!(FromUtf8Error, SFnPtrMap<BoxedStrategy<Vec<u8>>, Self>;
    static_map(
        not_utf8_bytes(true).boxed(),
        |invalid| String::from_utf8(invalid).unwrap_err()
    )
);
/// Strategy producing byte sequences that are guaranteed not to be valid
/// UTF-8.
///
/// Each value is the UTF-8 encoding of a prefix of arbitrary `char`s (fewer
/// than `u16::MAX` of them) followed by a short suffix from `gen_el_bytes`
/// that makes the whole sequence illegal. This is used primarily to generate
/// the `Utf8Error` type and similar.
///
/// When `allow_null` is `false`, `'\u{0}'` characters are dropped from the
/// prefix and the suffix generator is told not to produce NUL bytes either.
pub(crate) fn not_utf8_bytes(
    allow_null: bool,
) -> impl Strategy<Value = Vec<u8>> {
    let max_prefix_chars = ::std::u16::MAX as usize;
    let valid_prefix = collection::vec(any::<char>(), ..max_prefix_chars);
    let invalid_tail = gen_el_bytes(allow_null);

    (valid_prefix, invalid_tail).prop_map(move |(chars, tail)| {
        // Encode the prefix, skipping NUL characters unless they are allowed.
        let text: String = chars
            .into_iter()
            .filter(|&c| allow_null || c != '\u{0}')
            .collect();

        // Append the offending bytes after the well-formed prefix.
        let mut bytes = text.into_bytes();
        bytes.extend(&tail);
        bytes
    })
}
/// "Error length" bytes: a suffix of one to four bytes that, once appended
/// to a string, makes the whole string invalid UTF-8.
/// See `gen_el_bytes` for more details.
#[derive(Debug)]
enum ELBytes {
    /// A one-byte suffix.
    B1([u8; 1]),
    /// A two-byte suffix.
    B2([u8; 2]),
    /// A three-byte suffix.
    B3([u8; 3]),
    /// A four-byte suffix.
    B4([u8; 4]),
}

/// Iterating over a borrowed `ELBytes` yields its bytes by value, in order.
impl<'a> IntoIterator for &'a ELBytes {
    type Item = u8;
    type IntoIter = iter::Cloned<slice::Iter<'a, u8>>;

    fn into_iter(self) -> Self::IntoIter {
        // View whichever fixed-size array is stored as a plain slice so that
        // every variant shares one iterator type.
        let bytes: &'a [u8] = match *self {
            ELBytes::B1(ref one) => &one[..],
            ELBytes::B2(ref two) => &two[..],
            ELBytes::B3(ref three) => &three[..],
            ELBytes::B4(ref four) => &four[..],
        };
        bytes.iter().cloned()
    }
}
/// Builds the strategy for the short invalid suffixes (`ELBytes`) that
/// `not_utf8_bytes` appends to an otherwise valid UTF-8 string.
///
/// By analysis of `run_utf8_validation` defined at:
/// https://doc.rust-lang.org/nightly/src/core/str/mod.rs.html#1429
/// we know that `.error_len()` is one of `None`, `Some(1)`, `Some(2)` or
/// `Some(3)`. One family of byte sequences is generated for each of these
/// four outcomes, and the final strategy picks among the families.
///
/// When `allow_null` is `false`, the NUL byte (0x00) is never generated.
fn gen_el_bytes(allow_null: bool) -> impl Strategy<Value = ELBytes> {
    // Adapters packing the generated byte tuples into `ELBytes`.
    fn b1(a: u8) -> ELBytes {
        ELBytes::B1([a])
    }
    fn b2(a: (u8, u8)) -> ELBytes {
        ELBytes::B2([a.0, a.1])
    }
    fn b3(a: ((u8, u8), u8)) -> ELBytes {
        ELBytes::B3([(a.0).0, (a.0).1, a.1])
    }
    fn b4(a: ((u8, u8), u8, u8)) -> ELBytes {
        ELBytes::B4([(a.0).0, (a.0).1, a.1, a.2])
    }

    /*
    // https://tools.ietf.org/html/rfc3629
    static UTF8_CHAR_WIDTH: [u8; 256] = [
    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, // 0x1F
    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, // 0x3F
    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, // 0x5F
    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, // 0x7F
    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, // 0x9F
    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, // 0xBF
    0,0,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
    2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, // 0xDF
    3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3, // 0xEF
    4,4,4,4,4,0,0,0,0,0,0,0,0,0,0,0, // 0xFF
    ];
    /// Mask of the value bits of a continuation byte.
    const CONT_MASK: u8 = 0b0011_1111;
    /// Value of the tag bits (tag mask is !CONT_MASK) of a continuation byte.
    const TAG_CONT_U8: u8 = 0b1000_0000;
    */

    // Continuation byte (0x80..=0xBF):
    let succ_byte = 0x80u8..0xC0u8;

    // Do we allow the nul byte or not?
    let start_byte = if allow_null { 0x00u8 } else { 0x01u8 };

    // Invalid continuation byte: anything outside 0x80..=0xBF. The bounds
    // match the ones used for the second-byte tables below, so that 0x7F and
    // 0xC0 are covered as well.
    let fail_byte = prop_oneof![start_byte..0x80u8, 0xC0u8..];

    // Matches zero in the UTF8_CHAR_WIDTH table above: continuation bytes,
    // the never-valid lead bytes 0xC0 and 0xC1, and everything from 0xF5 up.
    let byte0_w0 = prop_oneof![0x80u8..0xC2u8, 0xF5u8..];

    // Start of a 2 (width) byte sequence:
    // Leads here: https://doc.rust-lang.org/1.23.0/src/core/str/mod.rs.html#1479
    let byte0_w2 = 0xC2u8..0xE0u8;

    // Start of a 3 (width) byte sequence:
    // https://doc.rust-lang.org/1.23.0/src/core/str/mod.rs.html#1484
    // See the left column in the match.
    let byte0_w3 = 0xE0u8..0xF0u8;

    // Start of a 4 (width) byte sequence:
    // https://doc.rust-lang.org/1.23.0/src/core/str/mod.rs.html#1495
    // See the left column in the match.
    let byte0_w4 = 0xF0u8..0xF5u8;

    // The 2 first (valid) bytes of a 3 (width) byte sequence:
    // The first byte is byte0_w3. The second is the ones produced on the right.
    // Only 0xE0 (no overlong forms) and 0xED (no surrogates) narrow the
    // second byte; every other lead accepts the full continuation range.
    let byte01_w3 = byte0_w3.clone().prop_flat_map(|x| {
        (
            Just(x),
            match x {
                0xE0u8 => 0xA0u8..0xC0u8,
                0xE1u8..=0xECu8 | 0xEEu8..=0xEFu8 => 0x80u8..0xC0u8,
                0xEDu8 => 0x80u8..0xA0u8,
                _ => unreachable!("byte0_w3 only yields 0xE0..=0xEF"),
            },
        )
    });

    // In a 3 (width) byte sequence, an invalid second byte is chosen such that
    // it will yield an error length of Some(1). The second byte is on
    // the right of the match arms and is the exact complement of the valid
    // second bytes above (minus NUL when it is not allowed).
    let byte01_w3_e1 = byte0_w3.clone().prop_flat_map(move |x| {
        (
            Just(x),
            match x {
                0xE0u8 => prop_oneof![start_byte..0xA0u8, 0xC0u8..],
                0xE1u8..=0xECu8 | 0xEEu8..=0xEFu8 => {
                    prop_oneof![start_byte..0x80u8, 0xC0u8..]
                }
                0xEDu8 => prop_oneof![start_byte..0x80u8, 0xA0u8..],
                _ => unreachable!("byte0_w3 only yields 0xE0..=0xEF"),
            },
        )
    });

    // In a 4 (width) byte sequence, an invalid second byte is chosen such that
    // it will yield an error length of Some(1). The second byte is on
    // the right of the match arms and is the exact complement of the valid
    // second bytes below (minus NUL when it is not allowed).
    let byte01_w4_e1 = byte0_w4.clone().prop_flat_map(move |x| {
        (
            Just(x),
            match x {
                0xF0u8 => prop_oneof![start_byte..0x90u8, 0xC0u8..],
                0xF1u8..=0xF3u8 => prop_oneof![start_byte..0x80u8, 0xC0u8..],
                0xF4u8 => prop_oneof![start_byte..0x80u8, 0x90u8..],
                _ => unreachable!("byte0_w4 only yields 0xF0..=0xF4"),
            },
        )
    });

    // The 2 first (valid) bytes of a 4 (width) byte sequence:
    // The first byte is byte0_w4. The second is the ones produced on the right.
    // 0xF0 excludes overlong forms and 0xF4 excludes code points above
    // U+10FFFF; 0xF1..=0xF3 accept the full continuation range.
    let byte01_w4 = byte0_w4.clone().prop_flat_map(|x| {
        (
            Just(x),
            match x {
                0xF0u8 => 0x90u8..0xC0u8,
                0xF1u8..=0xF3u8 => 0x80u8..0xC0u8,
                0xF4u8 => 0x80u8..0x90u8,
                _ => unreachable!("byte0_w4 only yields 0xF0..=0xF4"),
            },
        )
    });

    prop_oneof![
        // error_len = None
        // These all happen when next!() fails to provide a byte, i.e. the
        // sequence is valid so far but truncated.
        prop_oneof![
            // width = 2
            // lacking 1 byte:
            static_map(byte0_w2.clone(), b1),
            // width = 3
            // lacking 2 bytes:
            static_map(byte0_w3, b1),
            // lacking 1 byte:
            static_map(byte01_w3.clone(), b2),
            // width = 4
            // lacking 3 bytes:
            static_map(byte0_w4, b1),
            // lacking 2 bytes:
            static_map(byte01_w4.clone(), b2),
            // lacking 1 byte:
            static_map((byte01_w4.clone(), succ_byte.clone()), b3),
        ],
        // error_len = Some(1)
        prop_oneof![
            // width = 1 is not represented.
            // width = 0
            // path taken:
            // https://doc.rust-lang.org/1.23.0/src/core/str/mod.rs.html#1508
            static_map(byte0_w0, b1),
            // width = 2
            // path taken:
            // https://doc.rust-lang.org/1.23.0/src/core/str/mod.rs.html#1480
            static_map((byte0_w2, fail_byte.clone()), b2),
            // width = 3
            // path taken:
            // https://doc.rust-lang.org/1.23.0/src/core/str/mod.rs.html#1488
            static_map(byte01_w3_e1, b2),
            // width = 4
            // path taken:
            // https://doc.rust-lang.org/1.23.0/src/core/str/mod.rs.html#1499
            static_map(byte01_w4_e1, b2),
        ],
        // error_len = Some(2)
        static_map(
            prop_oneof![
                // width = 3
                // path taken:
                // https://doc.rust-lang.org/1.23.0/src/core/str/mod.rs.html#1491
                (byte01_w3, fail_byte.clone()),
                // width = 4
                // path taken:
                // https://doc.rust-lang.org/1.23.0/src/core/str/mod.rs.html#1502
                (byte01_w4.clone(), fail_byte.clone())
            ],
            b3
        ),
        // error_len = Some(3), width = 4
        // path taken:
        // https://doc.rust-lang.org/1.23.0/src/core/str/mod.rs.html#1505
        static_map((byte01_w4, succ_byte, fail_byte), b4),
    ]
    .boxed()
}
// Test-only module. Each `name => Type` pair below is handed to the
// project's `no_panic_test!` macro.
// NOTE(review): presumably this generates one test per pair checking that
// the type's `Arbitrary` strategy can be used without panicking; confirm
// against the macro definition, which is not visible in this file.
#[cfg(test)]
mod test {
no_panic_test!(
string => String,
str_box => Box<str>,
str_rc => Rc<str>,
str_arc => Arc<str>,
from_utf16_error => FromUtf16Error,
from_utf8_error => FromUtf8Error
);
}