From 2d29749eaec18510abad02f290e9366af0736aed Mon Sep 17 00:00:00 2001 From: Daniel Rainer Date: Fri, 28 Nov 2025 01:58:27 +0100 Subject: [PATCH] printf: improve Unicode escape + PUA handling The old handling of Unicode escape sequences seems partially obsolete and unnecessarily complicated. For our purposes, Rust's u32 to char parsing should be exactly what we want. Due to fish treating certain code points specially, we need to check if the provided character is among the special chars, and if so we need to encode it using our PUA scheme. This was not done in the old code, but is now. Add tests for this. Fixes #12081 Rework the error message for invalid code points. The old message about invalid code points not yet being supported seems odd. I don't think we should support this, so stop implying that we might do so in the future. In the new code, indicating that a Unicode character is out of range is also not ideal, since the range is not contiguous. E.g. `\uD800` is invalid, but `\U0010FFFF` is valid. Refrain from referring to a "range" and instead just state that the character is invalid. Move formatting of the escape sequence into Rust's `format!` to simplify porting to Fluent. 
Closes #12118 --- po/de.po | 11 ++++------- po/en.po | 11 ++++------- po/fr.po | 11 ++++------- po/pl.po | 11 ++++------- po/pt_BR.po | 11 ++++------- po/sv.po | 11 ++++------- po/zh_CN.po | 11 ++++------- po/zh_TW.po | 11 ++++------- src/builtins/printf.rs | 36 +++++++++++++++++++++--------------- tests/checks/printf.fish | 10 ++++++++++ 10 files changed, 63 insertions(+), 71 deletions(-) diff --git a/po/de.po b/po/de.po index c89ddb22f..5f1176ccf 100644 --- a/po/de.po +++ b/po/de.po @@ -1187,9 +1187,6 @@ msgstr "" msgid "Invalid arguments" msgstr "Ungültige Argumente" -msgid "Invalid code points not yet supported by printf" -msgstr "Ungültige Codepunkte werden von printf noch nicht unterstützt" - msgid "Invalid index value" msgstr "Ungültiger Indexwert" @@ -1292,6 +1289,10 @@ msgstr "Keine Funktion" msgid "Not a number" msgstr "Keine Zahl" +#, c-format +msgid "Not a valid Unicode character: %s" +msgstr "Kein gültiges Unicodezeichen: %s" + msgid "Notifications about universal variable changes" msgstr "" @@ -1638,10 +1639,6 @@ msgstr "Unerwartetes Stringende, eckige Klammern passen nicht" msgid "Unexpected token" msgstr "Unerwartetes Token" -#, c-format -msgid "Unicode character out of range: \\%c%0*x" -msgstr "Unicodezeichen außerhalb des gültigen Bereichs: \\%c%0*x" - msgid "Unknown" msgstr "Unbekannt" diff --git a/po/en.po b/po/en.po index 626b35b88..410bf4c46 100644 --- a/po/en.po +++ b/po/en.po @@ -1185,9 +1185,6 @@ msgstr "" msgid "Invalid arguments" msgstr "" -msgid "Invalid code points not yet supported by printf" -msgstr "" - msgid "Invalid index value" msgstr "" @@ -1290,6 +1287,10 @@ msgstr "" msgid "Not a number" msgstr "" +#, c-format +msgid "Not a valid Unicode character: %s" +msgstr "Not a valid Unicode character: %s" + msgid "Notifications about universal variable changes" msgstr "" @@ -1636,10 +1637,6 @@ msgstr "Unexpected end of string, square brackets do not match" msgid "Unexpected token" msgstr "" -#, c-format -msgid "Unicode character out of 
range: \\%c%0*x" -msgstr "Unicode character out of range: \\%c%0*x" - msgid "Unknown" msgstr "Unknown" diff --git a/po/fr.po b/po/fr.po index 1b0ef73ed..4f1b521f4 100644 --- a/po/fr.po +++ b/po/fr.po @@ -1316,9 +1316,6 @@ msgstr "" msgid "Invalid arguments" msgstr "" -msgid "Invalid code points not yet supported by printf" -msgstr "" - msgid "Invalid index value" msgstr "" @@ -1421,6 +1418,10 @@ msgstr "Pas une fonction" msgid "Not a number" msgstr "" +#, c-format +msgid "Not a valid Unicode character: %s" +msgstr "" + msgid "Notifications about universal variable changes" msgstr "" @@ -1767,10 +1768,6 @@ msgstr "Fin de chaîne inattendue, les crochets ne sont pas refermés" msgid "Unexpected token" msgstr "" -#, c-format -msgid "Unicode character out of range: \\%c%0*x" -msgstr "Caractère Unicode hors limite: \\%c%0*x" - msgid "Unknown" msgstr "Inconnu" diff --git a/po/pl.po b/po/pl.po index c7fe70ab3..8f06144fc 100644 --- a/po/pl.po +++ b/po/pl.po @@ -1181,9 +1181,6 @@ msgstr "" msgid "Invalid arguments" msgstr "" -msgid "Invalid code points not yet supported by printf" -msgstr "" - msgid "Invalid index value" msgstr "" @@ -1286,6 +1283,10 @@ msgstr "" msgid "Not a number" msgstr "" +#, c-format +msgid "Not a valid Unicode character: %s" +msgstr "" + msgid "Notifications about universal variable changes" msgstr "" @@ -1632,10 +1633,6 @@ msgstr "" msgid "Unexpected token" msgstr "" -#, c-format -msgid "Unicode character out of range: \\%c%0*x" -msgstr "" - msgid "Unknown" msgstr "Nieznany" diff --git a/po/pt_BR.po b/po/pt_BR.po index cd2e1d5e9..38cd39d36 100644 --- a/po/pt_BR.po +++ b/po/pt_BR.po @@ -1186,9 +1186,6 @@ msgstr "" msgid "Invalid arguments" msgstr "" -msgid "Invalid code points not yet supported by printf" -msgstr "" - msgid "Invalid index value" msgstr "" @@ -1291,6 +1288,10 @@ msgstr "" msgid "Not a number" msgstr "" +#, c-format +msgid "Not a valid Unicode character: %s" +msgstr "" + msgid "Notifications about universal variable changes" msgstr "" @@ 
-1637,10 +1638,6 @@ msgstr "Final inesperado de string, colchetes não batem" msgid "Unexpected token" msgstr "" -#, c-format -msgid "Unicode character out of range: \\%c%0*x" -msgstr "Caracter Unicode fora dos limites: \\%c%0*x" - msgid "Unknown" msgstr "Desconhecido" diff --git a/po/sv.po b/po/sv.po index 0599993c3..a5c3d130a 100644 --- a/po/sv.po +++ b/po/sv.po @@ -1182,9 +1182,6 @@ msgstr "" msgid "Invalid arguments" msgstr "" -msgid "Invalid code points not yet supported by printf" -msgstr "" - msgid "Invalid index value" msgstr "" @@ -1287,6 +1284,10 @@ msgstr "" msgid "Not a number" msgstr "" +#, c-format +msgid "Not a valid Unicode character: %s" +msgstr "" + msgid "Notifications about universal variable changes" msgstr "" @@ -1633,10 +1634,6 @@ msgstr "" msgid "Unexpected token" msgstr "" -#, c-format -msgid "Unicode character out of range: \\%c%0*x" -msgstr "Unicodetecken utanför giltigt spann: \\%c%0*x" - msgid "Unknown" msgstr "Okänd" diff --git a/po/zh_CN.po b/po/zh_CN.po index c15cd478f..63518c890 100644 --- a/po/zh_CN.po +++ b/po/zh_CN.po @@ -1211,9 +1211,6 @@ msgstr "主题监视器的内部细节" msgid "Invalid arguments" msgstr "无效参数" -msgid "Invalid code points not yet supported by printf" -msgstr "尚未由 printf 支持的无效码点" - msgid "Invalid index value" msgstr "无效索引值" @@ -1316,6 +1313,10 @@ msgstr "不是一个函数" msgid "Not a number" msgstr "非数字" +#, c-format +msgid "Not a valid Unicode character: %s" +msgstr "" + msgid "Notifications about universal variable changes" msgstr "通用变量变更通知" @@ -1662,10 +1663,6 @@ msgstr "字符串意外结束,方括号不匹配" msgid "Unexpected token" msgstr "意外的记号" -#, c-format -msgid "Unicode character out of range: \\%c%0*x" -msgstr "Unicode 字符超出范围:\\%c%0*x" - msgid "Unknown" msgstr "未知" diff --git a/po/zh_TW.po b/po/zh_TW.po index ee2a6bd62..6e3ae8562 100644 --- a/po/zh_TW.po +++ b/po/zh_TW.po @@ -1185,9 +1185,6 @@ msgstr "主題監聽的內部資訊" msgid "Invalid arguments" msgstr "引數無效" -msgid "Invalid code points not yet supported by printf" -msgstr "printf 尚不支援無效的碼位" - msgid 
"Invalid index value" msgstr "索引值無效" @@ -1290,6 +1287,10 @@ msgstr "不是函式" msgid "Not a number" msgstr "不是數字" +#, c-format +msgid "Not a valid Unicode character: %s" +msgstr "" + msgid "Notifications about universal variable changes" msgstr "通域變數變更通知" @@ -1637,10 +1638,6 @@ msgstr "非預期的字串結尾,中括弧不對稱" msgid "Unexpected token" msgstr "非預期的詞元" -#, c-format -msgid "Unicode character out of range: \\%c%0*x" -msgstr "Unicode 字元超出範圍:\\%c%0*x" - msgid "Unknown" msgstr "未知" diff --git a/src/builtins/printf.rs b/src/builtins/printf.rs index f01a52e02..53f6b1be3 100644 --- a/src/builtins/printf.rs +++ b/src/builtins/printf.rs @@ -50,7 +50,7 @@ use super::prelude::*; use crate::locale::{Locale, get_numeric_locale}; -use crate::wchar::encode_byte_to_char; +use crate::wchar::{decode_byte_from_char, encode_byte_to_char}; use crate::wutil::{ errors::Error, wcstod::wcstod, @@ -665,20 +665,26 @@ fn print_esc(&mut self, escstart: &wstr, octal_0: bool) -> usize { uni_value = uni_value * 16 + p.char_at(0).to_digit(16).unwrap(); p = &p[1..]; } - // N.B. we assume __STDC_ISO_10646__. - if uni_value > 0x10FFFF { - self.fatal_error(wgettext_fmt!( - "Unicode character out of range: \\%c%0*x", - esc_char, - exp_esc_length, - uni_value - )); - } else { - // TODO-RUST: if uni_value is a surrogate, we need to encode it using our PUA scheme. - if let Some(c) = char::from_u32(uni_value) { - self.append_output(c); - } else { - self.fatal_error(wgettext!("Invalid code points not yet supported by printf")); + match char::from_u32(uni_value) { + Some(c) => { + // Test if this character would be treated specially when decoding. + // If so, PUA-encode it. + if decode_byte_from_char(c).is_some() { + // A `char` represents an Unicode scalar value, which takes up at most 4 bytes when encoded in UTF-8. 
+ let mut converted = [0_u8; 4]; + for byte in c.encode_utf8(&mut converted).as_bytes() { + self.append_output(encode_byte_to_char(*byte)); + } + } else { + self.append_output(c); + } + } + None => { + let escaped_char_string = format!("\\{esc_char}{uni_value:0exp_esc_length$x}"); + self.fatal_error(wgettext_fmt!( + "Not a valid Unicode character: %s", + escaped_char_string + )); } } } else { diff --git a/tests/checks/printf.fish b/tests/checks/printf.fish index 06aa07a22..72c6038f1 100644 --- a/tests/checks/printf.fish +++ b/tests/checks/printf.fish @@ -172,3 +172,13 @@ printf '|%.1s|\n' '𒈙a' #CHECK: |𒈙| printf '|%3.3s|\n' '👨‍👨‍👧‍👧' #CHECK: | 👨‍👨‍👧‍👧| + +# Check handling of chars we use in our internal PUA encoding. +printf '\uf641' | display_bytes +# CHECK: 0000000 357 231 201 +# CHECK: 0000003 + +# UTF-8 representation of \uf641 +printf '%s' \xef\x99\x81 | display_bytes +# CHECK: 0000000 357 231 201 +# CHECK: 0000003