diff --git a/Cargo.lock b/Cargo.lock index 2d6665f8f..7a1b242f4 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -126,6 +126,8 @@ name = "fish-printf" version = "0.2.1" dependencies = [ "libc", + "unicode-segmentation", + "unicode-width", "widestring", ] @@ -546,6 +548,18 @@ version = "1.0.14" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "adb9e6ca4f869e1180728b7950e35922a7fc6397f7b641499e8f3ef06e50dc83" +[[package]] +name = "unicode-segmentation" +version = "1.12.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f6ccf251212114b54433ec949fd6a7841275f9ada20dddd2f29e9ceea4501493" + +[[package]] +name = "unicode-width" +version = "0.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1fc81956842c57dac11422a97c3b8195a1ff727f06e85c84ed2e8aa277c9a0fd" + [[package]] name = "unix_path" version = "1.0.1" diff --git a/printf/Cargo.toml b/printf/Cargo.toml index 09a6413ca..cddbb956b 100644 --- a/printf/Cargo.toml +++ b/printf/Cargo.toml @@ -10,6 +10,8 @@ license = "MIT" [dependencies] libc = "0.2.155" widestring = { version = "1.0.2", optional = true } +unicode-segmentation = "1.12.0" +unicode-width = "0.2.0" [lints] workspace = true diff --git a/printf/src/lib.rs b/printf/src/lib.rs index 733d7a193..b7d4ac5cb 100644 --- a/printf/src/lib.rs +++ b/printf/src/lib.rs @@ -71,7 +71,7 @@ macro_rules! sprintf { /// - `args`: Iterator over the arguments to format. /// /// # Returns -/// A `Result` which is `Ok` containing the number of characters written on success, or an `Error`. +/// A `Result` which is `Ok` containing the width of the string written on success, or an `Error`. /// /// # Example /// diff --git a/printf/src/printf_impl.rs b/printf/src/printf_impl.rs index 8a09137ac..1f26e9264 100644 --- a/printf/src/printf_impl.rs +++ b/printf/src/printf_impl.rs @@ -5,6 +5,8 @@ use std::fmt::{self, Write}; use std::mem; use std::result::Result; +use unicode_segmentation::UnicodeSegmentation; +use unicode_width::UnicodeWidthStr; #[cfg(feature = "widestring")] use widestring::Utf32Str as wstr; @@ -382,7 +384,7 @@ pub fn sprintf_locale( } // Read field width. We do not support $. - let width = if s.at(0) == Some('*') { + let desired_width = if s.at(0) == Some('*') { let arg_width = args.next().ok_or(Error::MissingArg)?.as_sint()?; s.advance_by(1); if arg_width < 0 { @@ -397,7 +399,7 @@ pub fn sprintf_locale( }; // Optionally read precision. We do not support $. - let mut prec: Option = if s.at(0) == Some('.') && s.at(1) == Some('*') { + let mut desired_precision: Option = if s.at(0) == Some('.') && s.at(1) == Some('*') { // "A negative precision is treated as though it were missing." // Here we assume the precision is always signed. s.advance_by(2); @@ -410,7 +412,7 @@ pub fn sprintf_locale( None }; // Disallow precisions larger than i32::MAX, in keeping with C. - if prec.unwrap_or(0) > i32::MAX as usize { + if desired_precision.unwrap_or(0) > i32::MAX as usize { return Err(Error::Overflow); } @@ -429,7 +431,7 @@ pub fn sprintf_locale( // "If a precision is given with a numeric conversion (d, i, o, u, i, x, and X), // the 0 flag is ignored." p is included here. let spec_is_numeric = matches!(conv_spec, CS::d | CS::u | CS::o | CS::p | CS::x | CS::X); - if spec_is_numeric && prec.is_some() { + if spec_is_numeric && desired_precision.is_some() { flags.zero_pad = false; } @@ -443,13 +445,22 @@ pub fn sprintf_locale( CS::e | CS::f | CS::g | CS::a | CS::E | CS::F | CS::G | CS::A => { // Floating point types handle output on their own. let float = arg.as_float()?; - let len = format_float(f, float, width, prec, flags, locale, conv_spec, buf)?; + let len = format_float( + f, + float, + desired_width, + desired_precision, + flags, + locale, + conv_spec, + buf, + )?; out_len = out_len.checked_add(len).ok_or(Error::Overflow)?; continue 'main; } CS::p => { const PTR_HEX_DIGITS: usize = 2 * mem::size_of::<*const u8>(); - prec = prec.map(|p| p.max(PTR_HEX_DIGITS)); + desired_precision = desired_precision.map(|p| p.max(PTR_HEX_DIGITS)); let uint = arg.as_uint()?; if uint != 0 { prefix = "0x"; @@ -479,8 +490,8 @@ pub fn sprintf_locale( if uint != 0 { write!(buf, "{:o}", uint)?; } - if flags.alt_form && prec.unwrap_or(0) <= buf.len() + 1 { - prec = Some(buf.len() + 1); + if flags.alt_form && desired_precision.unwrap_or(0) <= buf.len() + 1 { + desired_precision = Some(buf.len() + 1); } buf } @@ -514,10 +525,38 @@ pub fn sprintf_locale( CS::s => { // also 'S' let s = arg.as_str(buf)?; - let p = prec.unwrap_or(s.len()).min(s.len()); - prec = Some(p); flags.zero_pad = false; - &s[..p] + match desired_precision { + Some(precision) => { + // from man printf(3) + // "the maximum number of characters to be printed from a string" + // We interpret this to mean the maximum width when printed, as defined by + // Unicode grapheme cluster width. + let mut byte_len = 0; + let mut width = 0; + let mut graphemes = s.graphemes(true); + // Iteratively add single grapheme clusters as long as the fit within the + // width limited by precision. + while width < precision { + match graphemes.next() { + Some(grapheme) => { + let grapheme_width = grapheme.width(); + if width + grapheme_width <= precision { + byte_len += grapheme.len(); + width += grapheme_width; + } else { + break; + } + } + None => break, + } + } + let p = precision.min(width); + desired_precision = Some(p); + &s[..byte_len] + } + None => s, + } } }; // Numeric output should be empty iff the value is 0. @@ -528,23 +567,26 @@ pub fn sprintf_locale( // Decide if we want to apply thousands grouping to the body, and compute its size. // Note we have already errored out if grouped is set and this is non-numeric. let wants_grouping = flags.grouped && locale.thousands_sep.is_some(); - let body_len = match wants_grouping { + let body_width = match wants_grouping { + // We assume that text representing numbers is ASCII, so len == width. true => body.len() + locale.separator_count(body.len()), - false => body.len(), + false => body.width(), }; // Resolve the precision. // In the case of a non-numeric conversion, update the precision to at least the // length of the string. - let prec = if !spec_is_numeric { - prec.unwrap_or(body_len) + let desired_precision = if !spec_is_numeric { + desired_precision.unwrap_or(body_width) } else { - prec.unwrap_or(1).max(body_len) + desired_precision.unwrap_or(1).max(body_width) }; - let prefix_len = prefix.len(); - let unpadded_width = prefix_len.checked_add(prec).ok_or(Error::Overflow)?; - let width = width.max(unpadded_width); + let prefix_width = prefix.width(); + let unpadded_width = prefix_width + .checked_add(desired_precision) + .ok_or(Error::Overflow)?; + let width = desired_width.max(unpadded_width); // Pad on the left with spaces to the desired width? if !flags.left_adj && !flags.zero_pad { @@ -560,7 +602,8 @@ pub fn sprintf_locale( } // Pad on the left to the given precision? - pad(f, '0', prec, body_len)?; + // TODO: why pad with 0 here? + pad(f, '0', desired_precision, body_width)?; // Output the actual value, perhaps with grouping. if wants_grouping { diff --git a/printf/src/tests.rs b/printf/src/tests.rs index 85b966af5..036fe0d3f 100644 --- a/printf/src/tests.rs +++ b/printf/src/tests.rs @@ -13,6 +13,7 @@ macro_rules! sprintf_check { $(,)? // optional trailing comma ) => { { + use unicode_width::UnicodeWidthStr; let mut target = String::new(); let mut args = [$($arg.to_arg()),*]; let len = $crate::printf_c_locale( @@ -20,7 +21,7 @@ macro_rules! sprintf_check { $fmt.as_ref() as &str, &mut args, ).expect("printf failed"); - assert!(len == target.len(), "Wrong length returned: {} vs {}", len, target.len()); + assert_eq!(len, target.width(), "Wrong length returned"); target } }; @@ -735,6 +736,18 @@ fn test_huge_precision_g() { sprintf_err!("%.2147483648g", f => Error::Overflow); } +#[test] +fn test_non_ascii() { + assert_fmt!("%3s", "ΓΆ" => " ΓΆ"); + assert_fmt!("%3s", "πŸ‡ΊπŸ‡³" => " πŸ‡ΊπŸ‡³"); + assert_fmt!("%.3s", "πŸ‡ΊπŸ‡³πŸ‡ΊπŸ‡³" => "πŸ‡ΊπŸ‡³"); + assert_fmt!("%.3s", "aπŸ‡ΊπŸ‡³" => "aπŸ‡ΊπŸ‡³"); + assert_fmt!("%.3s", "aaπŸ‡ΊπŸ‡³" => "aa"); + assert_fmt!("%3.3s", "aaπŸ‡ΊπŸ‡³" => " aa"); + assert_fmt!("%.1s", "π’ˆ™a" => "π’ˆ™"); + assert_fmt!("%3.3s", "πŸ‘¨β€πŸ‘¨β€πŸ‘§β€πŸ‘§" => " πŸ‘¨β€πŸ‘¨β€πŸ‘§β€πŸ‘§"); +} + #[test] fn test_errors() { use Error::*; diff --git a/tests/checks/printf.fish b/tests/checks/printf.fish index c905a4ebe..99c7a3fd0 100644 --- a/tests/checks/printf.fish +++ b/tests/checks/printf.fish @@ -154,3 +154,21 @@ printf '%b\n' '\0057foo\0057bar\0057' printf %18446744073709551616s # CHECKERR: Number out of range + +# Test non-ASCII behavior +printf '|%3s|\n' 'ΓΆ' +# CHECK: | ΓΆ| +printf '|%3s|\n' 'πŸ‡ΊπŸ‡³' +#CHECK: | πŸ‡ΊπŸ‡³| +printf '|%.3s|\n' 'πŸ‡ΊπŸ‡³πŸ‡ΊπŸ‡³' +#CHECK: |πŸ‡ΊπŸ‡³| +printf '|%.3s|\n' 'aπŸ‡ΊπŸ‡³' +#CHECK: |aπŸ‡ΊπŸ‡³| +printf '|%.3s|\n' 'aaπŸ‡ΊπŸ‡³' +#CHECK: |aa| +printf '|%3.3s|\n' 'aaπŸ‡ΊπŸ‡³' +#CHECK: | aa| +printf '|%.1s|\n' 'π’ˆ™a' +#CHECK: |π’ˆ™| +printf '|%3.3s|\n' 'πŸ‘¨β€πŸ‘¨β€πŸ‘§β€πŸ‘§' +#CHECK: | πŸ‘¨β€πŸ‘¨β€πŸ‘§β€πŸ‘§|