From 49c848ed42c37003fe6e26f19556923ff357794d Mon Sep 17 00:00:00 2001 From: ridiculousfish Date: Mon, 11 Sep 2023 21:25:46 -0700 Subject: [PATCH] Optimize wcstod_underscores Prior to this change, wcstod_underscores would create a "pruned" string without underscores, and then parse that. Switch instead to a filtered Iterator, which can ignore underscores as it traverses the string. Add a test that completes in a reasonable time only if wcstod_underscores is not quadratic. (cherry picked from commit b4577c10e5cf9239bc93c605f431a56d98266a1e) --- src/wutil/wcstod.rs | 127 +++++++++++++++++++++++++------------------- 1 file changed, 71 insertions(+), 56 deletions(-) diff --git a/src/wutil/wcstod.rs b/src/wutil/wcstod.rs index 32b15cb08..2ce71026f 100644 --- a/src/wutil/wcstod.rs +++ b/src/wutil/wcstod.rs @@ -224,63 +224,57 @@ pub fn wcstod_underscores(s: Chars, consumed: &mut usize) -> Result 0 { + remaining_chars -= 1; + true + } else { + false } - } - } - if is_inf_nan { - let f = wcstod_inner(chars, '.', consumed)?; - *consumed += leading_whitespace; - return Ok(f); - } - // We build a string to pass to the system wcstod, pruned of underscores. We will take all - // leading alphanumeric characters that can appear in a strtod numeric literal, dots (.), and - // signs (+/-). In order to be more clever, for example to stop earlier in the case of strings - // like "123xxxxx", we would need to do a full parse, because sometimes 'a' is a hex digit and - // sometimes it is the end of the parse, sometimes a dot '.' is a decimal delimiter and - // sometimes it is the end of the valid parse, as in "1_2.3_4.5_6", etc. - let mut pruned = vec![]; - // We keep track of the positions *in the pruned string* where there used to be underscores. We - // will pass the pruned version of the input string to the system wcstod, which in turn will - // tell us how many characters it consumed. Then we will set our own endptr based on (1) the - // number of characters consumed from the pruned string, and (2) how many underscores came - // before the last consumed character. The alternative to doing it this way (for example, "only - // deleting the correct underscores") would require actually parsing the input string, so that - // we can know when to stop grabbing characters and dropping underscores, as in "1_2.3_4.5_6". - let mut underscores = vec![]; - // If we wanted to future-proof against a strtod from the future that, say, allows octal - // literals using 0o, etc., we could just use iswalnum, instead of iswxdigit and P/p/X/x checks. - for c in chars.take_while(|&c| c.is_ascii_hexdigit() || "PpXx._".contains(c) || is_sign(c)) { - if c == '_' { - underscores.push(pruned.len()); - } else { - pruned.push(c) - } - } - - let mut pruned_consumed = 0; - let f = wcstod_inner(pruned.into_iter(), '.', &mut pruned_consumed)?; - let underscores_consumed = underscores - .into_iter() - .take_while(|&n| n <= pruned_consumed) + }) .count(); - - *consumed = leading_whitespace + pruned_consumed + underscores_consumed; + *consumed = leading_whitespace + consumed_chars_with_uscores; Ok(f) } @@ -711,7 +705,7 @@ fn wcstod_underscores() { assert_eq!(test("1 "), Ok((1.0, 1))); assert_eq!(test("infinity_"), Ok((f64::INFINITY, 8))); assert_eq!(test(" -INFINITY"), Ok((f64::NEG_INFINITY, 10))); - assert_eq!(test("_infinity"), Err(Error::Empty)); + assert_eq!(test("_infinity"), Err(Error::InvalidChar)); { let (f, n) = test("nan(0)").unwrap(); assert!(f.is_nan()); @@ -722,7 +716,7 @@ fn wcstod_underscores() { assert!(f.is_nan()); assert_eq!(n, 3); } - assert_eq!(test("_nan(0)"), Err(Error::Empty)); + assert_eq!(test("_nan(0)"), Err(Error::InvalidChar)); // We don't strip the underscores in this commented-out test case, and the behavior is // implementation-defined, so we don't actually know how many characters will get consumed. On // macOS the strtod man page only says what happens with an alphanumeric string passed to nan(), @@ -735,4 +729,25 @@ fn wcstod_underscores() { assert_eq!(test("Also none"), Err(Error::InvalidChar)); assert_eq!(test(" Also none"), Err(Error::InvalidChar)); } + + #[test] + fn wcstod_underscores_repeated() { + // A test inspired by #9984. + // Emulate builtin_math by parsing all the numbers out of a huge string with underscores. + // This should complete quickly! + let count = 25_000; + let mut repeated_string = "12_34.56+".repeat(count); + repeated_string.pop(); // trailing + + let expected = 12_34.56 * (count as f64); + + let mut sum = 0.0; + let mut offset = 0; + while offset < repeated_string.len() { + let mut consumed = 0; + sum += super::wcstod_underscores(repeated_string[offset..].chars(), &mut consumed) + .unwrap(); + offset += consumed; + } + assert!((sum - expected).abs() <= 1e-4); + } }