printf: improve Unicode escape + PUA handling

The old handling of Unicode escape sequences seems partially obsolete
and unnecessarily complicated. For our purposes, Rust's `u32`-to-`char`
conversion (`char::from_u32`) should be exactly what we want.

Due to fish treating certain code points specially, we need to check
if the provided character is among the special chars, and if so we need
to encode it using our PUA scheme. This was not done in the old code,
but is now.
Add tests for this.
Fixes #12081

Rework the error message for invalid code points.
The old message about invalid code points not yet being supported seems
odd. I don't think we should support this, so stop implying that we
might do so in the future.
In the new code, indicating that a Unicode character is
out of range is also not ideal, since the range is not contiguous.
E.g. `\uD800` is invalid, but `\U0010FFFF` is valid.
Refrain from referring to a "range" and instead just state that the
character is invalid.

Move formatting of the escape sequence into Rust's `format!` to simplify
porting to Fluent.

Closes #12118
This commit is contained in:
Daniel Rainer
2025-11-28 01:58:27 +01:00
committed by Johannes Altmanninger
parent 22a252d064
commit 2d29749eae
10 changed files with 63 additions and 71 deletions

View File

@@ -1187,9 +1187,6 @@ msgstr ""
msgid "Invalid arguments"
msgstr "Ungültige Argumente"
msgid "Invalid code points not yet supported by printf"
msgstr "Ungültige Codepunkte werden von printf noch nicht unterstützt"
msgid "Invalid index value"
msgstr "Ungültiger Indexwert"
@@ -1292,6 +1289,10 @@ msgstr "Keine Funktion"
msgid "Not a number"
msgstr "Keine Zahl"
#, c-format
msgid "Not a valid Unicode character: %s"
msgstr "Kein gültiges Unicodezeichen: %s"
msgid "Notifications about universal variable changes"
msgstr ""
@@ -1638,10 +1639,6 @@ msgstr "Unerwartetes Stringende, eckige Klammern passen nicht"
msgid "Unexpected token"
msgstr "Unerwartetes Token"
#, c-format
msgid "Unicode character out of range: \\%c%0*x"
msgstr "Unicodezeichen außerhalb des gültigen Bereichs: \\%c%0*x"
msgid "Unknown"
msgstr "Unbekannt"

View File

@@ -1185,9 +1185,6 @@ msgstr ""
msgid "Invalid arguments"
msgstr ""
msgid "Invalid code points not yet supported by printf"
msgstr ""
msgid "Invalid index value"
msgstr ""
@@ -1290,6 +1287,10 @@ msgstr ""
msgid "Not a number"
msgstr ""
#, c-format
msgid "Not a valid Unicode character: %s"
msgstr "Not a valid Unicode character: %s"
msgid "Notifications about universal variable changes"
msgstr ""
@@ -1636,10 +1637,6 @@ msgstr "Unexpected end of string, square brackets do not match"
msgid "Unexpected token"
msgstr ""
#, c-format
msgid "Unicode character out of range: \\%c%0*x"
msgstr "Unicode character out of range: \\%c%0*x"
msgid "Unknown"
msgstr "Unknown"

View File

@@ -1316,9 +1316,6 @@ msgstr ""
msgid "Invalid arguments"
msgstr ""
msgid "Invalid code points not yet supported by printf"
msgstr ""
msgid "Invalid index value"
msgstr ""
@@ -1421,6 +1418,10 @@ msgstr "Pas une fonction"
msgid "Not a number"
msgstr ""
#, c-format
msgid "Not a valid Unicode character: %s"
msgstr ""
msgid "Notifications about universal variable changes"
msgstr ""
@@ -1767,10 +1768,6 @@ msgstr "Fin de chaîne inattendue, les crochets ne sont pas refermés"
msgid "Unexpected token"
msgstr ""
#, c-format
msgid "Unicode character out of range: \\%c%0*x"
msgstr "Caractère Unicode hors limite: \\%c%0*x"
msgid "Unknown"
msgstr "Inconnu"

View File

@@ -1181,9 +1181,6 @@ msgstr ""
msgid "Invalid arguments"
msgstr ""
msgid "Invalid code points not yet supported by printf"
msgstr ""
msgid "Invalid index value"
msgstr ""
@@ -1286,6 +1283,10 @@ msgstr ""
msgid "Not a number"
msgstr ""
#, c-format
msgid "Not a valid Unicode character: %s"
msgstr ""
msgid "Notifications about universal variable changes"
msgstr ""
@@ -1632,10 +1633,6 @@ msgstr ""
msgid "Unexpected token"
msgstr ""
#, c-format
msgid "Unicode character out of range: \\%c%0*x"
msgstr ""
msgid "Unknown"
msgstr "Nieznany"

View File

@@ -1186,9 +1186,6 @@ msgstr ""
msgid "Invalid arguments"
msgstr ""
msgid "Invalid code points not yet supported by printf"
msgstr ""
msgid "Invalid index value"
msgstr ""
@@ -1291,6 +1288,10 @@ msgstr ""
msgid "Not a number"
msgstr ""
#, c-format
msgid "Not a valid Unicode character: %s"
msgstr ""
msgid "Notifications about universal variable changes"
msgstr ""
@@ -1637,10 +1638,6 @@ msgstr "Final inesperado de string, colchetes não batem"
msgid "Unexpected token"
msgstr ""
#, c-format
msgid "Unicode character out of range: \\%c%0*x"
msgstr "Caracter Unicode fora dos limites: \\%c%0*x"
msgid "Unknown"
msgstr "Desconhecido"

View File

@@ -1182,9 +1182,6 @@ msgstr ""
msgid "Invalid arguments"
msgstr ""
msgid "Invalid code points not yet supported by printf"
msgstr ""
msgid "Invalid index value"
msgstr ""
@@ -1287,6 +1284,10 @@ msgstr ""
msgid "Not a number"
msgstr ""
#, c-format
msgid "Not a valid Unicode character: %s"
msgstr ""
msgid "Notifications about universal variable changes"
msgstr ""
@@ -1633,10 +1634,6 @@ msgstr ""
msgid "Unexpected token"
msgstr ""
#, c-format
msgid "Unicode character out of range: \\%c%0*x"
msgstr "Unicodetecken utanför giltigt spann: \\%c%0*x"
msgid "Unknown"
msgstr "Okänd"

View File

@@ -1211,9 +1211,6 @@ msgstr "主题监视器的内部细节"
msgid "Invalid arguments"
msgstr "无效参数"
msgid "Invalid code points not yet supported by printf"
msgstr "尚未由 printf 支持的无效码点"
msgid "Invalid index value"
msgstr "无效索引值"
@@ -1316,6 +1313,10 @@ msgstr "不是一个函数"
msgid "Not a number"
msgstr "非数字"
#, c-format
msgid "Not a valid Unicode character: %s"
msgstr ""
msgid "Notifications about universal variable changes"
msgstr "通用变量变更通知"
@@ -1662,10 +1663,6 @@ msgstr "字符串意外结束,方括号不匹配"
msgid "Unexpected token"
msgstr "意外的记号"
#, c-format
msgid "Unicode character out of range: \\%c%0*x"
msgstr "Unicode 字符超出范围:\\%c%0*x"
msgid "Unknown"
msgstr "未知"

View File

@@ -1185,9 +1185,6 @@ msgstr "主題監聽的內部資訊"
msgid "Invalid arguments"
msgstr "引數無效"
msgid "Invalid code points not yet supported by printf"
msgstr "printf 尚不支援無效的碼位"
msgid "Invalid index value"
msgstr "索引值無效"
@@ -1290,6 +1287,10 @@ msgstr "不是函式"
msgid "Not a number"
msgstr "不是數字"
#, c-format
msgid "Not a valid Unicode character: %s"
msgstr ""
msgid "Notifications about universal variable changes"
msgstr "通域變數變更通知"
@@ -1637,10 +1638,6 @@ msgstr "非預期的字串結尾,中括弧不對稱"
msgid "Unexpected token"
msgstr "非預期的詞元"
#, c-format
msgid "Unicode character out of range: \\%c%0*x"
msgstr "Unicode 字元超出範圍:\\%c%0*x"
msgid "Unknown"
msgstr "未知"

View File

@@ -50,7 +50,7 @@
use super::prelude::*;
use crate::locale::{Locale, get_numeric_locale};
use crate::wchar::encode_byte_to_char;
use crate::wchar::{decode_byte_from_char, encode_byte_to_char};
use crate::wutil::{
errors::Error,
wcstod::wcstod,
@@ -665,20 +665,26 @@ fn print_esc(&mut self, escstart: &wstr, octal_0: bool) -> usize {
uni_value = uni_value * 16 + p.char_at(0).to_digit(16).unwrap();
p = &p[1..];
}
// N.B. we assume __STDC_ISO_10646__.
if uni_value > 0x10FFFF {
self.fatal_error(wgettext_fmt!(
"Unicode character out of range: \\%c%0*x",
esc_char,
exp_esc_length,
uni_value
));
} else {
// TODO-RUST: if uni_value is a surrogate, we need to encode it using our PUA scheme.
if let Some(c) = char::from_u32(uni_value) {
self.append_output(c);
} else {
self.fatal_error(wgettext!("Invalid code points not yet supported by printf"));
match char::from_u32(uni_value) {
Some(c) => {
// Test if this character would be treated specially when decoding.
// If so, PUA-encode it.
if decode_byte_from_char(c).is_some() {
// A `char` represents a Unicode scalar value, which takes up at most 4 bytes when encoded in UTF-8.
let mut converted = [0_u8; 4];
for byte in c.encode_utf8(&mut converted).as_bytes() {
self.append_output(encode_byte_to_char(*byte));
}
} else {
self.append_output(c);
}
}
None => {
let escaped_char_string = format!("\\{esc_char}{uni_value:0exp_esc_length$x}");
self.fatal_error(wgettext_fmt!(
"Not a valid Unicode character: %s",
escaped_char_string
));
}
}
} else {

View File

@@ -172,3 +172,13 @@ printf '|%.1s|\n' '𒈙a'
#CHECK: |𒈙|
printf '|%3.3s|\n' '👨‍👨‍👧‍👧'
#CHECK: | 👨‍👨‍👧‍👧|
# Check handling of chars we use in our internal PUA encoding.
printf '\uf641' | display_bytes
# CHECK: 0000000 357 231 201
# CHECK: 0000003
# UTF-8 representation of \uf641
printf '%s' \xef\x99\x81 | display_bytes
# CHECK: 0000000 357 231 201
# CHECK: 0000003