Optimize wcstod_underscores

Prior to this change, wcstod_underscores would create a "pruned" string without underscores, and then parse that. Switch instead to a filtered Iterator, which can ignore underscores as it traverses the string. Add a test that completes in a reasonable time only if wcstod_underscores is not quadratic. (cherry picked from commit b4577c10e5cf9239bc93c605f431a56d98266a1e)
2026-06-08 02:31:18 -03:00 · 2023-09-11 21:25:46 -07:00
parent 82c4896809
commit 49c848ed42
1 changed files with 71 additions and 56 deletions
--- a/src/wutil/wcstod.rs
+++ b/src/wutil/wcstod.rs
@@ -224,63 +224,57 @@ pub fn wcstod_underscores<Chars>(s: Chars, consumed: &mut usize) -> Result<f64,
        }
    }

-    let is_sign = |c: char| "+-".contains(c);
-    let is_inf_or_nan_char = |c: char| "iInN".contains(c);
+    // Detect "infinity" and "nan" case-insensitively.
+    {
+        // Skip leading underscores.
+        let mut inf_nan_chars = chars.clone();
+        while inf_nan_chars.peek() == Some(&'_') {
+            inf_nan_chars.next();
+        }
+        // Skip at most one leading sign.
+        if matches!(inf_nan_chars.peek(), Some('+') | Some('-')) {
+            inf_nan_chars.next();
+        }

-    // We don't do any underscore-stripping for infinity/NaN.
-    let mut is_inf_nan = false;
-    if let Some(&c1) = chars.peek() {
-        if is_inf_or_nan_char(c1) {
-            is_inf_nan = true;
-        } else if is_sign(c1) {
-            // FIXME make this more efficient
-            let mut copy = chars.clone();
-            copy.next();
-            if let Some(&c2) = copy.peek() {
-                if is_inf_or_nan_char(c2) {
-                    is_inf_nan = true;
-                }
+        // Look for characters leading "infinity" or "nan", case-insensitive.
+        if inf_nan_chars.next().map_or(false, |c| "iInN".contains(c)) {
+            // We think it's inf or nan, so use the parser without underscores.
+            // Note that leading underscores will now trigger an error.
+            let f = wcstod_inner(chars, '.', consumed)?;
+            *consumed += leading_whitespace;
+            return Ok(f);
+        }
+    }
+
+    // Parse with an iterator which filters out underscores.
+    let chars_no_uscore = chars.clone().filter(|&c| c != '_');
+
+    // No spaces allowed after underscores.
+    if let Some(c) = chars_no_uscore.clone().next() {
+        if c.is_whitespace() {
+            return Err(Error::Empty);
+        }
+    }
+
+    let mut non_uscores_consumed = 0;
+    let f = wcstod_inner(chars_no_uscore, '.', &mut non_uscores_consumed)?;
+
+    // We consumed non_underscores_consumed characters, skipping underscores.
+    // Count how many we skipped. Also skip trailing underscores.
+    let mut remaining_chars = non_uscores_consumed;
+    let consumed_chars_with_uscores = chars
+        .take_while(|&c| {
+            if c == '_' {
+                true
+            } else if remaining_chars > 0 {
+                remaining_chars -= 1;
+                true
+            } else {
+                false
            }
-        }
-    }
-    if is_inf_nan {
-        let f = wcstod_inner(chars, '.', consumed)?;
-        *consumed += leading_whitespace;
-        return Ok(f);
-    }
-    // We build a string to pass to the system wcstod, pruned of underscores. We will take all
-    // leading alphanumeric characters that can appear in a strtod numeric literal, dots (.), and
-    // signs (+/-). In order to be more clever, for example to stop earlier in the case of strings
-    // like "123xxxxx", we would need to do a full parse, because sometimes 'a' is a hex digit and
-    // sometimes it is the end of the parse, sometimes a dot '.' is a decimal delimiter and
-    // sometimes it is the end of the valid parse, as in "1_2.3_4.5_6", etc.
-    let mut pruned = vec![];
-    // We keep track of the positions *in the pruned string* where there used to be underscores. We
-    // will pass the pruned version of the input string to the system wcstod, which in turn will
-    // tell us how many characters it consumed. Then we will set our own endptr based on (1) the
-    // number of characters consumed from the pruned string, and (2) how many underscores came
-    // before the last consumed character. The alternative to doing it this way (for example, "only
-    // deleting the correct underscores") would require actually parsing the input string, so that
-    // we can know when to stop grabbing characters and dropping underscores, as in "1_2.3_4.5_6".
-    let mut underscores = vec![];
-    // If we wanted to future-proof against a strtod from the future that, say, allows octal
-    // literals using 0o, etc., we could just use iswalnum, instead of iswxdigit and P/p/X/x checks.
-    for c in chars.take_while(|&c| c.is_ascii_hexdigit() || "PpXx._".contains(c) || is_sign(c)) {
-        if c == '_' {
-            underscores.push(pruned.len());
-        } else {
-            pruned.push(c)
-        }
-    }
-
-    let mut pruned_consumed = 0;
-    let f = wcstod_inner(pruned.into_iter(), '.', &mut pruned_consumed)?;
-    let underscores_consumed = underscores
-        .into_iter()
-        .take_while(|&n| n <= pruned_consumed)
+        })
        .count();
-
-    *consumed = leading_whitespace + pruned_consumed + underscores_consumed;
+    *consumed = leading_whitespace + consumed_chars_with_uscores;
    Ok(f)
 }

@@ -711,7 +705,7 @@ fn wcstod_underscores() {
        assert_eq!(test("1 "), Ok((1.0, 1)));
        assert_eq!(test("infinity_"), Ok((f64::INFINITY, 8)));
        assert_eq!(test(" -INFINITY"), Ok((f64::NEG_INFINITY, 10)));
-        assert_eq!(test("_infinity"), Err(Error::Empty));
+        assert_eq!(test("_infinity"), Err(Error::InvalidChar));
        {
            let (f, n) = test("nan(0)").unwrap();
            assert!(f.is_nan());
@@ -722,7 +716,7 @@ fn wcstod_underscores() {
            assert!(f.is_nan());
            assert_eq!(n, 3);
        }
-        assert_eq!(test("_nan(0)"), Err(Error::Empty));
+        assert_eq!(test("_nan(0)"), Err(Error::InvalidChar));
        // We don't strip the underscores in this commented-out test case, and the behavior is
        // implementation-defined, so we don't actually know how many characters will get consumed. On
        // macOS the strtod man page only says what happens with an alphanumeric string passed to nan(),
@@ -735,4 +729,25 @@ fn wcstod_underscores() {
        assert_eq!(test("Also none"), Err(Error::InvalidChar));
        assert_eq!(test(" Also none"), Err(Error::InvalidChar));
    }
+
+    #[test]
+    fn wcstod_underscores_repeated() {
+        // A test inspired by #9984.
+        // Emulate builtin_math by parsing all the numbers out of a huge string with underscores.
+        // This should complete quickly!
+        let count = 25_000;
+        let mut repeated_string = "12_34.56+".repeat(count);
+        repeated_string.pop(); // trailing +
+        let expected = 12_34.56 * (count as f64);
+
+        let mut sum = 0.0;
+        let mut offset = 0;
+        while offset < repeated_string.len() {
+            let mut consumed = 0;
+            sum += super::wcstod_underscores(repeated_string[offset..].chars(), &mut consumed)
+                .unwrap();
+            offset += consumed;
+        }
+        assert!((sum - expected).abs() <= 1e-4);
+    }
 }