Optimize wcstod_underscores

Prior to this change, wcstod_underscores would create a "pruned" string without
underscores, and then parse that. Switch instead to a filtered Iterator, which
can ignore underscores as it traverses the string.

Add a test that completes in a reasonable time only if wcstod_underscores is not
quadratic.

(cherry picked from commit b4577c10e5cf9239bc93c605f431a56d98266a1e)
This commit is contained in:
ridiculousfish
2023-09-11 21:25:46 -07:00
committed by Fabian Boehm
parent 82c4896809
commit 49c848ed42

View File

@@ -224,63 +224,57 @@ pub fn wcstod_underscores<Chars>(s: Chars, consumed: &mut usize) -> Result<f64,
}
}
let is_sign = |c: char| "+-".contains(c);
let is_inf_or_nan_char = |c: char| "iInN".contains(c);
// Detect "infinity" and "nan" case-insensitively.
{
// Skip leading underscores.
let mut inf_nan_chars = chars.clone();
while inf_nan_chars.peek() == Some(&'_') {
inf_nan_chars.next();
}
// Skip at most one leading sign.
if matches!(inf_nan_chars.peek(), Some('+') | Some('-')) {
inf_nan_chars.next();
}
// We don't do any underscore-stripping for infinity/NaN.
let mut is_inf_nan = false;
if let Some(&c1) = chars.peek() {
if is_inf_or_nan_char(c1) {
is_inf_nan = true;
} else if is_sign(c1) {
// FIXME make this more efficient
let mut copy = chars.clone();
copy.next();
if let Some(&c2) = copy.peek() {
if is_inf_or_nan_char(c2) {
is_inf_nan = true;
}
// Look for characters leading "infinity" or "nan", case-insensitive.
if inf_nan_chars.next().map_or(false, |c| "iInN".contains(c)) {
// We think it's inf or nan, so use the parser without underscores.
// Note that leading underscores will now trigger an error.
let f = wcstod_inner(chars, '.', consumed)?;
*consumed += leading_whitespace;
return Ok(f);
}
}
// Parse with an iterator which filters out underscores.
let chars_no_uscore = chars.clone().filter(|&c| c != '_');
// No spaces allowed after underscores.
if let Some(c) = chars_no_uscore.clone().next() {
if c.is_whitespace() {
return Err(Error::Empty);
}
}
let mut non_uscores_consumed = 0;
let f = wcstod_inner(chars_no_uscore, '.', &mut non_uscores_consumed)?;
// We consumed non_underscores_consumed characters, skipping underscores.
// Count how many we skipped. Also skip trailing underscores.
let mut remaining_chars = non_uscores_consumed;
let consumed_chars_with_uscores = chars
.take_while(|&c| {
if c == '_' {
true
} else if remaining_chars > 0 {
remaining_chars -= 1;
true
} else {
false
}
}
}
if is_inf_nan {
let f = wcstod_inner(chars, '.', consumed)?;
*consumed += leading_whitespace;
return Ok(f);
}
// We build a string to pass to the system wcstod, pruned of underscores. We will take all
// leading alphanumeric characters that can appear in a strtod numeric literal, dots (.), and
// signs (+/-). In order to be more clever, for example to stop earlier in the case of strings
// like "123xxxxx", we would need to do a full parse, because sometimes 'a' is a hex digit and
// sometimes it is the end of the parse, sometimes a dot '.' is a decimal delimiter and
// sometimes it is the end of the valid parse, as in "1_2.3_4.5_6", etc.
let mut pruned = vec![];
// We keep track of the positions *in the pruned string* where there used to be underscores. We
// will pass the pruned version of the input string to the system wcstod, which in turn will
// tell us how many characters it consumed. Then we will set our own endptr based on (1) the
// number of characters consumed from the pruned string, and (2) how many underscores came
// before the last consumed character. The alternative to doing it this way (for example, "only
// deleting the correct underscores") would require actually parsing the input string, so that
// we can know when to stop grabbing characters and dropping underscores, as in "1_2.3_4.5_6".
let mut underscores = vec![];
// If we wanted to future-proof against a strtod from the future that, say, allows octal
// literals using 0o, etc., we could just use iswalnum, instead of iswxdigit and P/p/X/x checks.
for c in chars.take_while(|&c| c.is_ascii_hexdigit() || "PpXx._".contains(c) || is_sign(c)) {
if c == '_' {
underscores.push(pruned.len());
} else {
pruned.push(c)
}
}
let mut pruned_consumed = 0;
let f = wcstod_inner(pruned.into_iter(), '.', &mut pruned_consumed)?;
let underscores_consumed = underscores
.into_iter()
.take_while(|&n| n <= pruned_consumed)
})
.count();
*consumed = leading_whitespace + pruned_consumed + underscores_consumed;
*consumed = leading_whitespace + consumed_chars_with_uscores;
Ok(f)
}
@@ -711,7 +705,7 @@ fn wcstod_underscores() {
assert_eq!(test("1 "), Ok((1.0, 1)));
assert_eq!(test("infinity_"), Ok((f64::INFINITY, 8)));
assert_eq!(test(" -INFINITY"), Ok((f64::NEG_INFINITY, 10)));
assert_eq!(test("_infinity"), Err(Error::Empty));
assert_eq!(test("_infinity"), Err(Error::InvalidChar));
{
let (f, n) = test("nan(0)").unwrap();
assert!(f.is_nan());
@@ -722,7 +716,7 @@ fn wcstod_underscores() {
assert!(f.is_nan());
assert_eq!(n, 3);
}
assert_eq!(test("_nan(0)"), Err(Error::Empty));
assert_eq!(test("_nan(0)"), Err(Error::InvalidChar));
// We don't strip the underscores in this commented-out test case, and the behavior is
// implementation-defined, so we don't actually know how many characters will get consumed. On
// macOS the strtod man page only says what happens with an alphanumeric string passed to nan(),
@@ -735,4 +729,25 @@ fn wcstod_underscores() {
assert_eq!(test("Also none"), Err(Error::InvalidChar));
assert_eq!(test(" Also none"), Err(Error::InvalidChar));
}
#[test]
fn wcstod_underscores_repeated() {
// A test inspired by #9984.
// Emulate builtin_math by parsing all the numbers out of a huge string with underscores.
// This should complete quickly!
let count = 25_000;
let mut repeated_string = "12_34.56+".repeat(count);
repeated_string.pop(); // trailing +
let expected = 12_34.56 * (count as f64);
let mut sum = 0.0;
let mut offset = 0;
while offset < repeated_string.len() {
let mut consumed = 0;
sum += super::wcstod_underscores(repeated_string[offset..].chars(), &mut consumed)
.unwrap();
offset += consumed;
}
assert!((sum - expected).abs() <= 1e-4);
}
}