printf: improve Unicode escape + PUA handling

The old handling of Unicode escape sequences seems partially obsolete and unnecessarily complicated. For our purposes, Rust's `u32`-to-`char` conversion should be exactly what we want. Due to fish treating certain code points specially, we need to check if the provided character is among the special chars, and if so we need to encode it using our PUA scheme. This was not done in the old code, but is now. Add tests for this. Fixes #12081. Rework the error message for invalid code points. The old message about invalid code points not yet being supported seems odd. I don't think we should support this, so stop implying that we might do so in the future. In the new code, indicating that a Unicode character is out of range is also not ideal, since the range is not contiguous. E.g. `\uD800` is invalid, but `\U0010FFFF` is valid. Refrain from referring to a "range" and instead just state that the character is invalid. Move formatting of the escape sequence into Rust's `format!` to simplify porting to Fluent. Closes #12118.
2026-05-30 19:41:15 -03:00 · 2025-11-28 01:58:27 +01:00
parent 22a252d064
commit 2d29749eae
10 changed files with 63 additions and 71 deletions
--- a/po/de.po
+++ b/po/de.po
@@ -1187,9 +1187,6 @@ msgstr ""
 msgid "Invalid arguments"
 msgstr "Ungültige Argumente"

-msgid "Invalid code points not yet supported by printf"
-msgstr "Ungültige Codepunkte werden von printf noch nicht unterstützt"
-
 msgid "Invalid index value"
 msgstr "Ungültiger Indexwert"

@@ -1292,6 +1289,10 @@ msgstr "Keine Funktion"
 msgid "Not a number"
 msgstr "Keine Zahl"

+#, c-format
+msgid "Not a valid Unicode character: %s"
+msgstr "Kein gültiges Unicodezeichen: %s"
+
 msgid "Notifications about universal variable changes"
 msgstr ""

@@ -1638,10 +1639,6 @@ msgstr "Unerwartetes Stringende, eckige Klammern passen nicht"
 msgid "Unexpected token"
 msgstr "Unerwartetes Token"

-#, c-format
-msgid "Unicode character out of range: \\%c%0*x"
-msgstr "Unicodezeichen außerhalb des gültigen Bereichs: \\%c%0*x"
-
 msgid "Unknown"
 msgstr "Unbekannt"

--- a/po/en.po
+++ b/po/en.po
@@ -1185,9 +1185,6 @@ msgstr ""
 msgid "Invalid arguments"
 msgstr ""

-msgid "Invalid code points not yet supported by printf"
-msgstr ""
-
 msgid "Invalid index value"
 msgstr ""

@@ -1290,6 +1287,10 @@ msgstr ""
 msgid "Not a number"
 msgstr ""

+#, c-format
+msgid "Not a valid Unicode character: %s"
+msgstr "Not a valid Unicode character: %s"
+
 msgid "Notifications about universal variable changes"
 msgstr ""

@@ -1636,10 +1637,6 @@ msgstr "Unexpected end of string, square brackets do not match"
 msgid "Unexpected token"
 msgstr ""

-#, c-format
-msgid "Unicode character out of range: \\%c%0*x"
-msgstr "Unicode character out of range: \\%c%0*x"
-
 msgid "Unknown"
 msgstr "Unknown"

--- a/po/fr.po
+++ b/po/fr.po
@@ -1316,9 +1316,6 @@ msgstr ""
 msgid "Invalid arguments"
 msgstr ""

-msgid "Invalid code points not yet supported by printf"
-msgstr ""
-
 msgid "Invalid index value"
 msgstr ""

@@ -1421,6 +1418,10 @@ msgstr "Pas une fonction"
 msgid "Not a number"
 msgstr ""

+#, c-format
+msgid "Not a valid Unicode character: %s"
+msgstr ""
+
 msgid "Notifications about universal variable changes"
 msgstr ""

@@ -1767,10 +1768,6 @@ msgstr "Fin de chaîne inattendue, les crochets ne sont pas refermés"
 msgid "Unexpected token"
 msgstr ""

-#, c-format
-msgid "Unicode character out of range: \\%c%0*x"
-msgstr "Caractère Unicode hors limite: \\%c%0*x"
-
 msgid "Unknown"
 msgstr "Inconnu"

--- a/po/pl.po
+++ b/po/pl.po
@@ -1181,9 +1181,6 @@ msgstr ""
 msgid "Invalid arguments"
 msgstr ""

-msgid "Invalid code points not yet supported by printf"
-msgstr ""
-
 msgid "Invalid index value"
 msgstr ""

@@ -1286,6 +1283,10 @@ msgstr ""
 msgid "Not a number"
 msgstr ""

+#, c-format
+msgid "Not a valid Unicode character: %s"
+msgstr ""
+
 msgid "Notifications about universal variable changes"
 msgstr ""

@@ -1632,10 +1633,6 @@ msgstr ""
 msgid "Unexpected token"
 msgstr ""

-#, c-format
-msgid "Unicode character out of range: \\%c%0*x"
-msgstr ""
-
 msgid "Unknown"
 msgstr "Nieznany"

--- a/po/pt_BR.po
+++ b/po/pt_BR.po
@@ -1186,9 +1186,6 @@ msgstr ""
 msgid "Invalid arguments"
 msgstr ""

-msgid "Invalid code points not yet supported by printf"
-msgstr ""
-
 msgid "Invalid index value"
 msgstr ""

@@ -1291,6 +1288,10 @@ msgstr ""
 msgid "Not a number"
 msgstr ""

+#, c-format
+msgid "Not a valid Unicode character: %s"
+msgstr ""
+
 msgid "Notifications about universal variable changes"
 msgstr ""

@@ -1637,10 +1638,6 @@ msgstr "Final inesperado de string, colchetes não batem"
 msgid "Unexpected token"
 msgstr ""

-#, c-format
-msgid "Unicode character out of range: \\%c%0*x"
-msgstr "Caracter Unicode fora dos limites: \\%c%0*x"
-
 msgid "Unknown"
 msgstr "Desconhecido"

--- a/po/sv.po
+++ b/po/sv.po
@@ -1182,9 +1182,6 @@ msgstr ""
 msgid "Invalid arguments"
 msgstr ""

-msgid "Invalid code points not yet supported by printf"
-msgstr ""
-
 msgid "Invalid index value"
 msgstr ""

@@ -1287,6 +1284,10 @@ msgstr ""
 msgid "Not a number"
 msgstr ""

+#, c-format
+msgid "Not a valid Unicode character: %s"
+msgstr ""
+
 msgid "Notifications about universal variable changes"
 msgstr ""

@@ -1633,10 +1634,6 @@ msgstr ""
 msgid "Unexpected token"
 msgstr ""

-#, c-format
-msgid "Unicode character out of range: \\%c%0*x"
-msgstr "Unicodetecken utanför giltigt spann: \\%c%0*x"
-
 msgid "Unknown"
 msgstr "Okänd"

--- a/po/zh_CN.po
+++ b/po/zh_CN.po
@@ -1211,9 +1211,6 @@ msgstr "主题监视器的内部细节"
 msgid "Invalid arguments"
 msgstr "无效参数"

-msgid "Invalid code points not yet supported by printf"
-msgstr "尚未由 printf 支持的无效码点"
-
 msgid "Invalid index value"
 msgstr "无效索引值"

@@ -1316,6 +1313,10 @@ msgstr "不是一个函数"
 msgid "Not a number"
 msgstr "非数字"

+#, c-format
+msgid "Not a valid Unicode character: %s"
+msgstr ""
+
 msgid "Notifications about universal variable changes"
 msgstr "通用变量变更通知"

@@ -1662,10 +1663,6 @@ msgstr "字符串意外结束，方括号不匹配"
 msgid "Unexpected token"
 msgstr "意外的记号"

-#, c-format
-msgid "Unicode character out of range: \\%c%0*x"
-msgstr "Unicode 字符超出范围：\\%c%0*x"
-
 msgid "Unknown"
 msgstr "未知"

--- a/po/zh_TW.po
+++ b/po/zh_TW.po
@@ -1185,9 +1185,6 @@ msgstr "主題監聽的內部資訊"
 msgid "Invalid arguments"
 msgstr "引數無效"

-msgid "Invalid code points not yet supported by printf"
-msgstr "printf 尚不支援無效的碼位"
-
 msgid "Invalid index value"
 msgstr "索引值無效"

@@ -1290,6 +1287,10 @@ msgstr "不是函式"
 msgid "Not a number"
 msgstr "不是數字"

+#, c-format
+msgid "Not a valid Unicode character: %s"
+msgstr ""
+
 msgid "Notifications about universal variable changes"
 msgstr "通域變數變更通知"

@@ -1637,10 +1638,6 @@ msgstr "非預期的字串結尾，中括弧不對稱"
 msgid "Unexpected token"
 msgstr "非預期的詞元"

-#, c-format
-msgid "Unicode character out of range: \\%c%0*x"
-msgstr "Unicode 字元超出範圍：\\%c%0*x"
-
 msgid "Unknown"
 msgstr "未知"

--- a/src/builtins/printf.rs
+++ b/src/builtins/printf.rs
@@ -50,7 +50,7 @@

 use super::prelude::*;
 use crate::locale::{Locale, get_numeric_locale};
-use crate::wchar::encode_byte_to_char;
+use crate::wchar::{decode_byte_from_char, encode_byte_to_char};
 use crate::wutil::{
    errors::Error,
    wcstod::wcstod,
@@ -665,20 +665,26 @@ fn print_esc(&mut self, escstart: &wstr, octal_0: bool) -> usize {
                uni_value = uni_value * 16 + p.char_at(0).to_digit(16).unwrap();
                p = &p[1..];
            }
-            // N.B. we assume __STDC_ISO_10646__.
-            if uni_value > 0x10FFFF {
-                self.fatal_error(wgettext_fmt!(
-                    "Unicode character out of range: \\%c%0*x",
-                    esc_char,
-                    exp_esc_length,
-                    uni_value
-                ));
-            } else {
-                // TODO-RUST: if uni_value is a surrogate, we need to encode it using our PUA scheme.
-                if let Some(c) = char::from_u32(uni_value) {
-                    self.append_output(c);
-                } else {
-                    self.fatal_error(wgettext!("Invalid code points not yet supported by printf"));
+            match char::from_u32(uni_value) {
+                Some(c) => {
+                    // Test if this character would be treated specially when decoding.
+                    // If so, PUA-encode it.
+                    if decode_byte_from_char(c).is_some() {
+                        // A `char` represents a Unicode scalar value, which takes up at most 4 bytes when encoded in UTF-8.
+                        let mut converted = [0_u8; 4];
+                        for byte in c.encode_utf8(&mut converted).as_bytes() {
+                            self.append_output(encode_byte_to_char(*byte));
+                        }
+                    } else {
+                        self.append_output(c);
+                    }
+                }
+                None => {
+                    let escaped_char_string = format!("\\{esc_char}{uni_value:0exp_esc_length$x}");
+                    self.fatal_error(wgettext_fmt!(
+                        "Not a valid Unicode character: %s",
+                        escaped_char_string
+                    ));
                }
            }
        } else {
--- a/tests/checks/printf.fish
+++ b/tests/checks/printf.fish
@@ -172,3 +172,13 @@ printf '|%.1s|\n' '𒈙a'
 #CHECK: |𒈙|
 printf '|%3.3s|\n' '👨‍👨‍👧‍👧'
 #CHECK: | 👨‍👨‍👧‍👧|
+
+# Check handling of chars we use in our internal PUA encoding.
+printf '\uf641' | display_bytes
+# CHECK: 0000000 357 231 201
+# CHECK: 0000003
+
+# UTF-8 representation of \uf641
+printf '%s' \xef\x99\x81 | display_bytes
+# CHECK: 0000000 357 231 201
+# CHECK: 0000003