From 09eae92888602422e06bb623b91684c9d83df7b1 Mon Sep 17 00:00:00 2001 From: Daniel Rainer Date: Sat, 3 May 2025 01:43:53 +0200 Subject: [PATCH] Make printf unicode-aware Specifically, the width and precision format specifiers are interpreted as referring to the width of the grapheme clusters rather than the byte count of the string. Note that grapheme clusters can differ in width. If a precision is specified for a string, meaning its "maximum number of characters", we consider this to limit the width displayed. If there is a grapheme cluster whose width is greater than 1, it might not be possible to get precisely the desired width. In such cases, this last grapheme cluster is excluded from the output. Note that the definitions used here are not consistent with the `string length` builtin at the moment, but this has already been the case. --- Cargo.lock | 14 +++++++ printf/Cargo.toml | 2 + printf/src/lib.rs | 2 +- printf/src/printf_impl.rs | 83 +++++++++++++++++++++++++++++---------- printf/src/tests.rs | 15 ++++++- tests/checks/printf.fish | 18 +++++++++ 6 files changed, 112 insertions(+), 22 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 2d6665f8f..7a1b242f4 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -126,6 +126,8 @@ name = "fish-printf" version = "0.2.1" dependencies = [ "libc", + "unicode-segmentation", + "unicode-width", "widestring", ] @@ -546,6 +548,18 @@ version = "1.0.14" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "adb9e6ca4f869e1180728b7950e35922a7fc6397f7b641499e8f3ef06e50dc83" +[[package]] +name = "unicode-segmentation" +version = "1.12.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f6ccf251212114b54433ec949fd6a7841275f9ada20dddd2f29e9ceea4501493" + +[[package]] +name = "unicode-width" +version = "0.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1fc81956842c57dac11422a97c3b8195a1ff727f06e85c84ed2e8aa277c9a0fd" + [[package]] name = "unix_path" version = "1.0.1" diff --git a/printf/Cargo.toml b/printf/Cargo.toml index 09a6413ca..cddbb956b 100644 --- a/printf/Cargo.toml +++ b/printf/Cargo.toml @@ -10,6 +10,8 @@ license = "MIT" [dependencies] libc = "0.2.155" widestring = { version = "1.0.2", optional = true } +unicode-segmentation = "1.12.0" +unicode-width = "0.2.0" [lints] workspace = true diff --git a/printf/src/lib.rs b/printf/src/lib.rs index 733d7a193..b7d4ac5cb 100644 --- a/printf/src/lib.rs +++ b/printf/src/lib.rs @@ -71,7 +71,7 @@ macro_rules! sprintf { /// - `args`: Iterator over the arguments to format. /// /// # Returns -/// A `Result` which is `Ok` containing the number of characters written on success, or an `Error`. +/// A `Result` which is `Ok` containing the width of the string written on success, or an `Error`. /// /// # Example /// diff --git a/printf/src/printf_impl.rs b/printf/src/printf_impl.rs index 8a09137ac..1f26e9264 100644 --- a/printf/src/printf_impl.rs +++ b/printf/src/printf_impl.rs @@ -5,6 +5,8 @@ use std::fmt::{self, Write}; use std::mem; use std::result::Result; +use unicode_segmentation::UnicodeSegmentation; +use unicode_width::UnicodeWidthStr; #[cfg(feature = "widestring")] use widestring::Utf32Str as wstr; @@ -382,7 +384,7 @@ pub fn sprintf_locale( } // Read field width. We do not support $. - let width = if s.at(0) == Some('*') { + let desired_width = if s.at(0) == Some('*') { let arg_width = args.next().ok_or(Error::MissingArg)?.as_sint()?; s.advance_by(1); if arg_width < 0 { @@ -397,7 +399,7 @@ pub fn sprintf_locale( }; // Optionally read precision. We do not support $. - let mut prec: Option = if s.at(0) == Some('.') && s.at(1) == Some('*') { + let mut desired_precision: Option = if s.at(0) == Some('.') && s.at(1) == Some('*') { // "A negative precision is treated as though it were missing." // Here we assume the precision is always signed. s.advance_by(2); @@ -410,7 +412,7 @@ pub fn sprintf_locale( None }; // Disallow precisions larger than i32::MAX, in keeping with C. - if prec.unwrap_or(0) > i32::MAX as usize { + if desired_precision.unwrap_or(0) > i32::MAX as usize { return Err(Error::Overflow); } @@ -429,7 +431,7 @@ pub fn sprintf_locale( // "If a precision is given with a numeric conversion (d, i, o, u, i, x, and X), // the 0 flag is ignored." p is included here. let spec_is_numeric = matches!(conv_spec, CS::d | CS::u | CS::o | CS::p | CS::x | CS::X); - if spec_is_numeric && prec.is_some() { + if spec_is_numeric && desired_precision.is_some() { flags.zero_pad = false; } @@ -443,13 +445,22 @@ pub fn sprintf_locale( CS::e | CS::f | CS::g | CS::a | CS::E | CS::F | CS::G | CS::A => { // Floating point types handle output on their own. let float = arg.as_float()?; - let len = format_float(f, float, width, prec, flags, locale, conv_spec, buf)?; + let len = format_float( + f, + float, + desired_width, + desired_precision, + flags, + locale, + conv_spec, + buf, + )?; out_len = out_len.checked_add(len).ok_or(Error::Overflow)?; continue 'main; } CS::p => { const PTR_HEX_DIGITS: usize = 2 * mem::size_of::<*const u8>(); - prec = prec.map(|p| p.max(PTR_HEX_DIGITS)); + desired_precision = desired_precision.map(|p| p.max(PTR_HEX_DIGITS)); let uint = arg.as_uint()?; if uint != 0 { prefix = "0x"; @@ -479,8 +490,8 @@ pub fn sprintf_locale( if uint != 0 { write!(buf, "{:o}", uint)?; } - if flags.alt_form && prec.unwrap_or(0) <= buf.len() + 1 { - prec = Some(buf.len() + 1); + if flags.alt_form && desired_precision.unwrap_or(0) <= buf.len() + 1 { + desired_precision = Some(buf.len() + 1); } buf } @@ -514,10 +525,38 @@ pub fn sprintf_locale( CS::s => { // also 'S' let s = arg.as_str(buf)?; - let p = prec.unwrap_or(s.len()).min(s.len()); - prec = Some(p); flags.zero_pad = false; - &s[..p] + match desired_precision { + Some(precision) => { + // from man printf(3) + // "the maximum number of characters to be printed from a string" + // We interpret this to mean the maximum width when printed, as defined by + // Unicode grapheme cluster width. + let mut byte_len = 0; + let mut width = 0; + let mut graphemes = s.graphemes(true); + // Iteratively add single grapheme clusters as long as the fit within the + // width limited by precision. + while width < precision { + match graphemes.next() { + Some(grapheme) => { + let grapheme_width = grapheme.width(); + if width + grapheme_width <= precision { + byte_len += grapheme.len(); + width += grapheme_width; + } else { + break; + } + } + None => break, + } + } + let p = precision.min(width); + desired_precision = Some(p); + &s[..byte_len] + } + None => s, + } } }; // Numeric output should be empty iff the value is 0. @@ -528,23 +567,26 @@ pub fn sprintf_locale( // Decide if we want to apply thousands grouping to the body, and compute its size. // Note we have already errored out if grouped is set and this is non-numeric. let wants_grouping = flags.grouped && locale.thousands_sep.is_some(); - let body_len = match wants_grouping { + let body_width = match wants_grouping { + // We assume that text representing numbers is ASCII, so len == width. true => body.len() + locale.separator_count(body.len()), - false => body.len(), + false => body.width(), }; // Resolve the precision. // In the case of a non-numeric conversion, update the precision to at least the // length of the string. - let prec = if !spec_is_numeric { - prec.unwrap_or(body_len) + let desired_precision = if !spec_is_numeric { + desired_precision.unwrap_or(body_width) } else { - prec.unwrap_or(1).max(body_len) + desired_precision.unwrap_or(1).max(body_width) }; - let prefix_len = prefix.len(); - let unpadded_width = prefix_len.checked_add(prec).ok_or(Error::Overflow)?; - let width = width.max(unpadded_width); + let prefix_width = prefix.width(); + let unpadded_width = prefix_width + .checked_add(desired_precision) + .ok_or(Error::Overflow)?; + let width = desired_width.max(unpadded_width); // Pad on the left with spaces to the desired width? if !flags.left_adj && !flags.zero_pad { @@ -560,7 +602,8 @@ pub fn sprintf_locale( } // Pad on the left to the given precision? - pad(f, '0', prec, body_len)?; + // TODO: why pad with 0 here? + pad(f, '0', desired_precision, body_width)?; // Output the actual value, perhaps with grouping. if wants_grouping { diff --git a/printf/src/tests.rs b/printf/src/tests.rs index 85b966af5..036fe0d3f 100644 --- a/printf/src/tests.rs +++ b/printf/src/tests.rs @@ -13,6 +13,7 @@ macro_rules! sprintf_check { $(,)? // optional trailing comma ) => { { + use unicode_width::UnicodeWidthStr; let mut target = String::new(); let mut args = [$($arg.to_arg()),*]; let len = $crate::printf_c_locale( @@ -20,7 +21,7 @@ macro_rules! sprintf_check { $fmt.as_ref() as &str, &mut args, ).expect("printf failed"); - assert!(len == target.len(), "Wrong length returned: {} vs {}", len, target.len()); + assert_eq!(len, target.width(), "Wrong length returned"); target } }; @@ -735,6 +736,18 @@ fn test_huge_precision_g() { sprintf_err!("%.2147483648g", f => Error::Overflow); } +#[test] +fn test_non_ascii() { + assert_fmt!("%3s", "ΓΆ" => " ΓΆ"); + assert_fmt!("%3s", "πŸ‡ΊπŸ‡³" => " πŸ‡ΊπŸ‡³"); + assert_fmt!("%.3s", "πŸ‡ΊπŸ‡³πŸ‡ΊπŸ‡³" => "πŸ‡ΊπŸ‡³"); + assert_fmt!("%.3s", "aπŸ‡ΊπŸ‡³" => "aπŸ‡ΊπŸ‡³"); + assert_fmt!("%.3s", "aaπŸ‡ΊπŸ‡³" => "aa"); + assert_fmt!("%3.3s", "aaπŸ‡ΊπŸ‡³" => " aa"); + assert_fmt!("%.1s", "π’ˆ™a" => "π’ˆ™"); + assert_fmt!("%3.3s", "πŸ‘¨β€πŸ‘¨β€πŸ‘§β€πŸ‘§" => " πŸ‘¨β€πŸ‘¨β€πŸ‘§β€πŸ‘§"); +} + #[test] fn test_errors() { use Error::*; diff --git a/tests/checks/printf.fish b/tests/checks/printf.fish index c905a4ebe..99c7a3fd0 100644 --- a/tests/checks/printf.fish +++ b/tests/checks/printf.fish @@ -154,3 +154,21 @@ printf '%b\n' '\0057foo\0057bar\0057' printf %18446744073709551616s # CHECKERR: Number out of range + +# Test non-ASCII behavior +printf '|%3s|\n' 'ΓΆ' +# CHECK: | ΓΆ| +printf '|%3s|\n' 'πŸ‡ΊπŸ‡³' +#CHECK: | πŸ‡ΊπŸ‡³| +printf '|%.3s|\n' 'πŸ‡ΊπŸ‡³πŸ‡ΊπŸ‡³' +#CHECK: |πŸ‡ΊπŸ‡³| +printf '|%.3s|\n' 'aπŸ‡ΊπŸ‡³' +#CHECK: |aπŸ‡ΊπŸ‡³| +printf '|%.3s|\n' 'aaπŸ‡ΊπŸ‡³' +#CHECK: |aa| +printf '|%3.3s|\n' 'aaπŸ‡ΊπŸ‡³' +#CHECK: | aa| +printf '|%.1s|\n' 'π’ˆ™a' +#CHECK: |π’ˆ™| +printf '|%3.3s|\n' 'πŸ‘¨β€πŸ‘¨β€πŸ‘§β€πŸ‘§' +#CHECK: | πŸ‘¨β€πŸ‘¨β€πŸ‘§β€πŸ‘§|