diff --git a/CHANGELOG.rst b/CHANGELOG.rst index eaf3fff07..3739725f4 100644 --- a/CHANGELOG.rst +++ b/CHANGELOG.rst @@ -9,6 +9,8 @@ Notable improvements and fixes Deprecations and removed features --------------------------------- +- Fish now assumes UTF-8 everywhere, regardless of locale settings. Input bytes which are not valid UTF-8 should still be round-tripped correctly. + Interactive improvements ------------------------ - :doc:`fish_config prompt {choose,save} ` have been taught to reset :doc:`fish_mode_prompt ` in addition to the other prompt functions (:issue:`11937`). diff --git a/src/builtins/read.rs b/src/builtins/read.rs index 840cc5b23..a4c952c57 100644 --- a/src/builtins/read.rs +++ b/src/builtins/read.rs @@ -13,7 +13,7 @@ use crate::env::{EnvVar, EnvVarFlags}; use crate::input_common::DecodeState; use crate::input_common::InvalidPolicy; -use crate::input_common::decode_input_byte; +use crate::input_common::decode_one_codepoint_utf8; use crate::nix::isatty; use crate::reader::ReaderConfig; use crate::reader::commandline_set_buffer; @@ -27,7 +27,6 @@ use crate::wcstringutil::split_about; use crate::wcstringutil::split_string_tok; use crate::wutil; -use crate::wutil::encoding::zero_mbstate; use crate::wutil::perror; use libc::SEEK_CUR; use std::num::NonZeroUsize; @@ -389,8 +388,6 @@ fn read_one_char_at_a_time( let mut unconsumed = vec![]; loop { - let mut state = zero_mbstate(); - let chars_read = buff.len(); let res = loop { let mut b = [0_u8; 1]; @@ -400,17 +397,9 @@ fn read_one_char_at_a_time( } _ => {} } - let b = b[0]; - unconsumed.push(b); + unconsumed.push(b[0]); nbytes += 1; - let mut consumed = 0; - match decode_input_byte( - buff, - InvalidPolicy::Passthrough, - &mut state, - &unconsumed, - &mut consumed, - ) { + match decode_one_codepoint_utf8(buff, InvalidPolicy::Passthrough, &unconsumed) { DecodeState::Incomplete => continue, DecodeState::Complete => { unconsumed.clear(); diff --git a/src/common.rs b/src/common.rs index 
09797fb02..a02be686a 100644 --- a/src/common.rs +++ b/src/common.rs @@ -16,9 +16,6 @@ use crate::wchar::{decode_byte_from_char, encode_byte_to_char, prelude::*}; use crate::wcstringutil::wcs2bytes_callback; use crate::wildcard::{ANY_CHAR, ANY_STRING, ANY_STRING_RECURSIVE}; -use crate::wutil::encoding::{ - AT_LEAST_MB_LEN_MAX, mbrtowc, probe_is_multibyte_locale, wcrtomb, zero_mbstate, -}; use crate::wutil::fish_iswalnum; use bitflags::bitflags; use libc::{SIG_IGN, SIGTTOU, STDIN_FILENO}; @@ -190,7 +187,7 @@ fn escape_string_script(input: &wstr, flags: EscapeFlags) -> WString { let no_quoted = flags.contains(EscapeFlags::NO_QUOTED); let no_tilde = flags.contains(EscapeFlags::NO_TILDE); let no_qmark = feature_test(FeatureFlag::qmark_noglob); - let symbolic = flags.contains(EscapeFlags::SYMBOLIC) && get_is_multibyte_locale(); + let symbolic = flags.contains(EscapeFlags::SYMBOLIC); assert!( !symbolic || !escape_printables, @@ -1035,19 +1032,15 @@ pub fn shell_modes() -> MutexGuard<'static, libc::termios> { /// The character to use where the text has been truncated. Is an ellipsis on unicode system and a $ /// on other systems. pub fn get_ellipsis_char() -> char { - char::from_u32(ELLIPSIS_CHAR.load(Ordering::Relaxed)).unwrap() + '\u{2026}' } -static ELLIPSIS_CHAR: AtomicU32 = AtomicU32::new(0); - /// The character or string to use where text has been truncated (ellipsis if possible, otherwise /// ...) pub fn get_ellipsis_str() -> &'static wstr { - ELLIPSIS_STRING.load() + L!("\u{2026}") } -static ELLIPSIS_STRING: AtomicRef = AtomicRef::new(&L!("")); - /// Character representing an omitted newline at the end of text. pub fn get_omitted_newline_str() -> &'static wstr { OMITTED_NEWLINE_STR.load() @@ -1065,13 +1058,6 @@ pub fn get_obfuscation_read_char() -> char { char::from_u32(OBFUSCATION_READ_CHAR.load(Ordering::Relaxed)).unwrap() } -static IS_MB_LOCALE: RelaxedAtomicBool = RelaxedAtomicBool::new(false); - -/// Whether we believe we are in a multibyte locale. 
-pub fn get_is_multibyte_locale() -> bool { - IS_MB_LOCALE.load() -} - /// Profiling flag. True if commands should be profiled. pub static PROFILING_ACTIVE: RelaxedAtomicBool = RelaxedAtomicBool::new(false); @@ -1102,84 +1088,54 @@ pub fn has_working_tty_timestamps() -> bool { /// todo!("Maybe remove the box? It is only needed for get_bg_context.") pub type CancelChecker = Box bool>; -/// Converts the narrow character string \c in into its wide equivalent, and return it. -/// -/// The string may contain embedded nulls. -/// -/// This function encodes illegal character sequences in a reversible way using the private use -/// area. -pub fn bytes2wcstring(inp: &[u8]) -> WString { - if inp.is_empty() { +/// Encodes the bytes in `input` into a [`WString`], encoding non-UTF-8 bytes into private-use-area +/// code-points. Bytes which would be parsed into our reserved PUA range are encoded individually, +/// to allow for correct round-tripping. +pub fn bytes2wcstring(mut input: &[u8]) -> WString { + if input.is_empty() { return WString::new(); } let mut result = WString::new(); - result.reserve(inp.len()); - let mut pos = 0; - let mut state = zero_mbstate(); - while pos < inp.len() { - // Append any initial sequence of ascii characters. - // Note we do not support character sets which are not supersets of ASCII. - let ascii_prefix_length = count_ascii_prefix(&inp[pos..]); - result.push_str(std::str::from_utf8(&inp[pos..pos + ascii_prefix_length]).unwrap()); - pos += ascii_prefix_length; - assert!(pos <= inp.len(), "Position overflowed length"); - if pos == inp.len() { - break; - } - // We have found a non-ASCII character. - let mut ret = 0; - let mut c = '\0'; - - let use_encode_direct = if inp[pos] & 0xF8 == 0xF8 { - // Protect against broken mbrtowc() implementations which attempt to encode UTF-8 - // sequences longer than four bytes (e.g., OS X Snow Leopard). - // TODO This check used to be conditionally compiled only on affected platforms. 
- true - } else { - let mut codepoint = u32::from(c); - ret = unsafe { - mbrtowc( - std::ptr::addr_of_mut!(codepoint), - std::ptr::addr_of!(inp[pos]).cast(), - inp.len() - pos, - &mut state, - ) - }; - match char::from_u32(codepoint) { - Some(codepoint) => { - c = codepoint; - // Determine whether to encode this character with our crazy scheme. - fish_reserved_codepoint(c) - || - // Incomplete sequence. - ret == 0_usize.wrapping_sub(2) - || - // Invalid data. - ret == 0_usize.wrapping_sub(1) - || - // Other error codes? Terrifying, should never happen. - ret > inp.len() - pos + fn append_escaped_str(output: &mut WString, input: &str) { + for (i, c) in input.char_indices() { + if fish_reserved_codepoint(c) { + for byte in &input.as_bytes()[i..i + c.len_utf8()] { + output.push(encode_byte_to_char(*byte)); } - None => true, + } else { + output.push(c); } - }; + } + } - if use_encode_direct { - c = encode_byte_to_char(inp[pos]); - result.push(c); - pos += 1; - state = zero_mbstate(); - } else if ret == 0 { - // embedded null byte! - result.push('\0'); - pos += 1; - state = zero_mbstate(); - } else { - // normal case - result.push(c); - pos += ret; + while !input.is_empty() { + match std::str::from_utf8(input) { + Ok(parsed_str) => { + append_escaped_str(&mut result, parsed_str); + // The entire remaining input could be parsed, so we are done. + break; + } + Err(e) => { + let (valid, after_valid) = input.split_at(e.valid_up_to()); + // SAFETY: The previous `str::from_utf8` call established that the prefix `valid` + // is valid UTF-8. This prefix may be empty. + let parsed_str = unsafe { std::str::from_utf8_unchecked(valid) }; + append_escaped_str(&mut result, parsed_str); + // The length of the prefix of `after_valid` which is invalid UTF-8. + // The remaining bytes of `input` (if any) will be parsed in subsequent iterations + // of the loop, starting from the first byte that starts a valid UTF-8-encoded codepoint. 
+ // `error_len` can return `None`, if it sees a byte sequence that could be the + // prefix of a valid code-point encoding at the end of the byte slice. + // This is useful when the input is chunked, but we don't do that, so in this case + // we use our custom encoding for all remaining bytes (at most 3). + let error_len = e.error_len().unwrap_or(after_valid.len()); + for byte in &after_valid[..error_len] { + result.push(encode_byte_to_char(*byte)); + } + input = &after_valid[error_len..]; + } } } result @@ -1265,12 +1221,6 @@ pub fn wcs2bytes_appending(output: &mut Vec, input: &wstr) { }); } -/// Return the count of initial characters in `in` which are ASCII. -fn count_ascii_prefix(inp: &[u8]) -> usize { - // The C++ version had manual vectorization. - inp.iter().take_while(|c| c.is_ascii()).count() -} - // Check if we are running in the test mode, where we should suppress error output pub const TESTS_PROGRAM_NAME: &wstr = L!("(ignore)"); @@ -1302,22 +1252,6 @@ macro_rules! LL { }}; } - // Mark if we are a multibyte locale. - IS_MB_LOCALE.store(probe_is_multibyte_locale()); - - // Use various Unicode symbols if they can be encoded using the current locale, else a simple - // ASCII char alternative. All of the can_be_encoded() invocations should return the same - // true/false value since the code points are in the BMP but we're going to be paranoid. This - // is also technically wrong if we're not in a Unicode locale but we expect (or hope) - // can_be_encoded() will return false in that case. - if can_be_encoded('\u{2026}') { - ELLIPSIS_CHAR.store(u32::from('\u{2026}'), Ordering::Relaxed); - ELLIPSIS_STRING.store(LL!("\u{2026}")); - } else { - ELLIPSIS_CHAR.store(u32::from('$'), Ordering::Relaxed); // "horizontal ellipsis" - ELLIPSIS_STRING.store(LL!("...")); - } - if is_windows_subsystem_for_linux(WSL::Any) { // neither of \u23CE and \u25CF can be displayed in the default fonts on Windows, though // they can be *encoded* just fine. Use alternative glyphs. 
@@ -1327,29 +1261,16 @@ macro_rules! LL { OMITTED_NEWLINE_STR.store(LL!("^J")); OBFUSCATION_READ_CHAR.store(u32::from('*'), Ordering::Relaxed); } else { - if can_be_encoded('\u{23CE}') { - OMITTED_NEWLINE_STR.store(LL!("\u{23CE}")); // "return symbol" (⏎) - } else { - OMITTED_NEWLINE_STR.store(LL!("^J")); - } + OMITTED_NEWLINE_STR.store(LL!("\u{23CE}")); // "return symbol" (⏎) OBFUSCATION_READ_CHAR.store( - u32::from(if can_be_encoded('\u{25CF}') { - '\u{25CF}' // "black circle" - } else { - '#' - }), + u32::from( + '\u{25CF}', // "black circle" + ), Ordering::Relaxed, ); } } -/// Test if the character can be encoded using the current locale. -fn can_be_encoded(wc: char) -> bool { - let mut converted = [0 as libc::c_char; AT_LEAST_MB_LEN_MAX]; - let mut state = zero_mbstate(); - unsafe { wcrtomb(converted.as_mut_ptr(), wc as u32, &mut state) != 0_usize.wrapping_sub(1) } -} - /// Call read, blocking and repeating on EINTR. Exits on EAGAIN. /// Return the number of bytes read, or 0 on EOF, or an error. pub fn read_blocked(fd: RawFd, buf: &mut [u8]) -> nix::Result { diff --git a/src/env_dispatch.rs b/src/env_dispatch.rs index 4c282bf2a..e4782ead7 100644 --- a/src/env_dispatch.rs +++ b/src/env_dispatch.rs @@ -14,7 +14,6 @@ use crate::terminal::use_terminfo; use crate::tty_handoff::xtversion; use crate::wchar::prelude::*; -use crate::wutil::encoding::probe_is_multibyte_locale; use crate::wutil::fish_wcstoi; use crate::{function, terminal}; use std::borrow::Cow; @@ -25,12 +24,16 @@ /// List of all locale environment variable names that might trigger (re)initializing of the locale /// subsystem. These are only the variables we're possibly interested in. 
-#[rustfmt::skip] -const LOCALE_VARIABLES: [&wstr; 10] = [ - L!("LANG"), L!("LANGUAGE"), L!("LC_ALL"), - L!("LC_COLLATE"), L!("LC_CTYPE"), L!("LC_MESSAGES"), - L!("LC_NUMERIC"), L!("LC_TIME"), L!("LOCPATH"), - L!("fish_allow_singlebyte_locale"), +const LOCALE_VARIABLES: [&wstr; 9] = [ + L!("LANG"), + L!("LANGUAGE"), + L!("LC_ALL"), + L!("LC_COLLATE"), + L!("LC_CTYPE"), + L!("LC_MESSAGES"), + L!("LC_NUMERIC"), + L!("LC_TIME"), + L!("LOCPATH"), ]; #[rustfmt::skip] @@ -299,8 +302,6 @@ fn handle_tz_change(var_name: &wstr, vars: &EnvStack) { fn handle_locale_change(vars: &EnvStack) { init_locale(vars); - // We need to re-guess emoji width because the locale might have changed to a multibyte one. - guess_emoji_width(vars); } fn handle_term_change(vars: &EnvStack) { @@ -502,11 +503,6 @@ pub fn read_terminfo_database(vars: &EnvStack) { fn init_locale(vars: &EnvStack) { let _guard = crate::locale::LOCALE_LOCK.lock().unwrap(); - #[rustfmt::skip] - const UTF8_LOCALES: &[&str] = &[ - "C.UTF-8", "en_US.UTF-8", "en_GB.UTF-8", "de_DE.UTF-8", "C.utf8", "UTF-8", - ]; - let old_msg_locale: CString = { let old = unsafe { libc::setlocale(libc::LC_MESSAGES, ptr::null()) }; assert_ne!(old, ptr::null_mut()); @@ -541,33 +537,6 @@ fn init_locale(vars: &EnvStack) { } }; - // Try to get a multibyte-capable encoding. - // A "C" locale is broken for our purposes: any wchar function will break on it. So we try - // *really, really, really hard* to not have one. 
- let fix_locale = vars - .get_unless_empty(L!("fish_allow_singlebyte_locale")) - .map(|v| v.as_string()) - .map(|allow_c| !crate::wcstringutil::bool_from_string(&allow_c)) - .unwrap_or(true); - - if fix_locale && !probe_is_multibyte_locale() { - FLOG!(env_locale, "Have single byte locale, trying to fix."); - let mut fixed = false; - for locale in UTF8_LOCALES { - let locale_cstr = CString::new(*locale).unwrap(); - // this can fail, that is fine - unsafe { libc::setlocale(libc::LC_CTYPE, locale_cstr.as_ptr()) }; - if probe_is_multibyte_locale() { - FLOG!(env_locale, "Fixed locale:", locale); - fixed = true; - break; - } - } - if !fixed { - FLOG!(env_locale, "Failed to fix locale."); - } - } - // We *always* use a C-locale for numbers because we want '.' (except for in printf). let loc_ptr = unsafe { libc::setlocale(libc::LC_NUMERIC, c"C".as_ptr().cast()) }; // should never fail, the C locale should always be defined diff --git a/src/input_common.rs b/src/input_common.rs index f437920b5..3ab1cf1ec 100644 --- a/src/input_common.rs +++ b/src/input_common.rs @@ -1,6 +1,6 @@ use crate::common::{ - WSL, bytes2wcstring, fish_reserved_codepoint, get_is_multibyte_locale, - is_windows_subsystem_for_linux, read_blocked, shell_modes, + WSL, bytes2wcstring, fish_reserved_codepoint, is_windows_subsystem_for_linux, read_blocked, + shell_modes, }; use crate::env::{EnvStack, Environment}; use crate::fd_readable_set::{FdReadableSet, Timeout}; @@ -17,7 +17,6 @@ }; use crate::universal_notifier::default_notifier; use crate::wchar::{encode_byte_to_char, prelude::*}; -use crate::wutil::encoding::{mbrtowc, mbstate_t, zero_mbstate}; use crate::wutil::{fish_is_pua, fish_wcstol}; use std::cell::{RefCell, RefMut}; use std::collections::VecDeque; @@ -826,9 +825,7 @@ fn try_pop(&mut self) -> Option { self.get_input_data_mut().queue.pop_front() } - /// Function used by [`readch`](Self::readch) to read bytes from stdin until enough bytes have been read to - /// convert them to a wchar_t. 
Conversion is done using mbrtowc. If a character has previously - /// been read and then 'unread' using \c input_common_unreadch, that character is returned. + /// Read the next event, such as a UTF-8-encoded codepoint. fn readch(&mut self) -> CharEvent { loop { // Do we have something enqueued already? @@ -871,7 +868,7 @@ fn readch(&mut self) -> CharEvent { InputEventTrigger::Byte(read_byte) => { let mut have_escape_prefix = false; let mut buffer = vec![read_byte]; - let key_with_escape = if read_byte == 0x1b { + let mut key = if read_byte == 0x1b { self.parse_escape_sequence(&mut buffer, &mut have_escape_prefix) } else { canonicalize_control_char(read_byte).map(KeyEvent::from) @@ -883,47 +880,35 @@ fn readch(&mut self) -> CharEvent { continue; } let mut seq = WString::new(); - let mut key = key_with_escape; if key.is_some_and(|key| key.key == Key::from_raw(key::Invalid)) { continue; } assert!(key.is_none_or(|key| key.codepoint != key::Invalid)); - let mut consumed = 0; - let mut state = zero_mbstate(); - let mut i = 0; + // At this point, the bytes in `buffer` should be parsed as a UTF-8 sequence, + // or, if they are not valid UTF-8, ignored. On incomplete sequences, another + // byte is read and decoding is tried again in the next iteration. 
let ok = loop { - if i == buffer.len() { - buffer.push( - match next_input_event(self.get_in_fd(), Timeout::Forever) { - InputEventTrigger::Byte(b) => b, - _ => 0, - }, - ); - } - match decode_input_byte( - &mut seq, - InvalidPolicy::Error, - &mut state, - &buffer[..i + 1], - &mut consumed, - ) { - DecodeState::Incomplete => (), + match decode_one_codepoint_utf8(&mut seq, InvalidPolicy::Error, &buffer) { + DecodeState::Incomplete => { + buffer.push( + match next_input_event(self.get_in_fd(), Timeout::Forever) { + InputEventTrigger::Byte(b) => b, + _ => 0, + }, + ); + } DecodeState::Complete => { - if have_escape_prefix && i != 0 { - have_escape_prefix = false; + if have_escape_prefix { let c = seq.as_char_slice().last().unwrap(); key = Some(KeyEvent::from(alt(*c))); } - if i + 1 == buffer.len() { - break true; - } + break true; } DecodeState::Error => { self.push_front(CharEvent::from_check_exit()); break false; } } - i += 1; }; if !ok { continue; @@ -1686,63 +1671,37 @@ pub(crate) enum InvalidPolicy { Passthrough, } -pub(crate) fn decode_input_byte( +pub(crate) fn decode_one_codepoint_utf8( out_seq: &mut WString, invalid_policy: InvalidPolicy, - state: &mut mbstate_t, buffer: &[u8], - consumed: &mut usize, ) -> DecodeState { use DecodeState::*; - let mut res: char = '\0'; - let read_byte = *buffer.last().unwrap(); - if !get_is_multibyte_locale() { - // single-byte locale, all values are legal - res = read_byte.into(); - out_seq.push(res); - return Complete; - } - let mut invalid = |out_seq: &mut WString, log_error: fn()| match invalid_policy { - InvalidPolicy::Error => { - (log_error)(); - Error - } - InvalidPolicy::Passthrough => { - for &b in &buffer[*consumed..] 
{ - out_seq.push(encode_byte_to_char(b)); + match std::str::from_utf8(buffer) { + Ok(parsed_str) => { + for c in parsed_str.chars() { + if !fish_reserved_codepoint(c) { + out_seq.push(c); + } } - *consumed = buffer.len(); Complete } - }; - let mut codepoint = u32::from(res); - match unsafe { - mbrtowc( - std::ptr::addr_of_mut!(codepoint), - std::ptr::addr_of!(read_byte).cast(), - 1, - state, - ) - } as isize - { - -1 => { - return invalid(out_seq, || FLOG!(reader, "Illegal input encoding")); - } - -2 => { - // Sequence not yet complete. - return Incomplete; - } - _ => (), + Err(e) => match e.error_len() { + Some(_) => match invalid_policy { + InvalidPolicy::Error => { + FLOG!(reader, "Illegal input encoding"); + Error + } + InvalidPolicy::Passthrough => { + for &b in buffer { + out_seq.push(encode_byte_to_char(b)); + } + Complete + } + }, + None => Incomplete, + }, } - if let Some(res) = char::from_u32(codepoint) { - // Sequence complete. - if !fish_reserved_codepoint(res) { - *consumed += 1; - out_seq.push(res); - return Complete; - } - } - invalid(out_seq, || FLOG!(reader, "Illegal codepoint")) } pub(crate) fn stop_query(mut query: RefMut<'_, Option>) -> bool { diff --git a/src/pager.rs b/src/pager.rs index 0407986e4..9cee13eff 100644 --- a/src/pager.rs +++ b/src/pager.rs @@ -5,7 +5,6 @@ use crate::common::{ EscapeFlags, EscapeStringStyle, escape_string, get_ellipsis_char, get_ellipsis_str, - get_is_multibyte_locale, }; use crate::complete::Completion; use crate::editable_line::EditableLine; @@ -1231,19 +1230,16 @@ fn process_completions_into_infos(lst: &[Completion]) -> Vec { EscapeFlags::NO_PRINTABLES | EscapeFlags::NO_QUOTED | EscapeFlags::SYMBOLIC, ), )); - if comp.replaces_line() - // HACK We want to render a full shell command, with syntax highlighting. Above we - // escape nonprintables, which might make the rendered command longer than the original - // completion. In that case we get wrong colors. 
However this should only happen in - // contrived cases, since our symbolic escaping uses a single character to represent - // newline and tab characters; other nonprintables are extremely rare in a command - // line. It will only be common for single-byte locales where we don't - // use Unicode characters for escaping, so just disable those here. - // We should probably fix this by first highlighting the original completion, and - // then writing a variant of escape_string() that adjusts highlighting according - // so it matches the escaped string. - && get_is_multibyte_locale() - { + // HACK We want to render a full shell command, with syntax highlighting. Above we + // escape nonprintables, which might make the rendered command longer than the original + // completion. In that case we get wrong colors. However this should only happen in + // contrived cases, since our symbolic escaping uses a single character to represent + // newline and tab characters; other nonprintables are extremely rare in a command + // line. + // We should probably fix this by first highlighting the original completion, and + // then writing a variant of escape_string() that adjusts highlighting according + // so it matches the escaped string. 
+ if comp.replaces_line() { highlight_shell( &comp.completion, &mut comp_info.colors, diff --git a/src/reader.rs b/src/reader.rs index 409906b82..d311123cf 100644 --- a/src/reader.rs +++ b/src/reader.rs @@ -55,9 +55,8 @@ use crate::common::ScopeGuarding; use crate::common::{ EscapeFlags, EscapeStringStyle, PROGRAM_NAME, ScopeGuard, UTF8_BOM_WCHAR, bytes2wcstring, - escape, escape_string, exit_without_destructors, get_ellipsis_char, get_is_multibyte_locale, - get_obfuscation_read_char, restore_term_foreground_process_group_for_exit, shell_modes, - write_loop, + escape, escape_string, exit_without_destructors, get_ellipsis_char, get_obfuscation_read_char, + restore_term_foreground_process_group_for_exit, shell_modes, write_loop, }; use crate::complete::{ CompleteFlags, Completion, CompletionList, CompletionRequestOptions, complete, complete_load, @@ -3220,14 +3219,7 @@ fn handle_readline_command(&mut self, c: ReadlineCmd) { self.history_pager = Some(0..1); // Update the pager data. self.pager.set_search_field_shown(true); - self.pager.set_prefix( - if get_is_multibyte_locale() { - L!("► ") - } else { - L!("> ") - }, - /*highlight=*/ false, - ); + self.pager.set_prefix(L!("► "), false); // Update the search field, which triggers the actual history search. 
let search_string = if !self.history_search.active() || self.history_search.search_string().is_empty() diff --git a/src/tests/string_escape.rs b/src/tests/string_escape.rs index 812efdb97..939761849 100644 --- a/src/tests/string_escape.rs +++ b/src/tests/string_escape.rs @@ -1,40 +1,11 @@ -use std::sync::MutexGuard; - use crate::common::{ ENCODE_DIRECT_BASE, ENCODE_DIRECT_END, EscapeFlags, EscapeStringStyle, UnescapeStringStyle, - bytes2wcstring, escape_string, fish_setlocale, unescape_string, wcs2bytes, + bytes2wcstring, escape_string, unescape_string, wcs2bytes, }; -use crate::locale::LOCALE_LOCK; use crate::util::{get_rng_seed, get_seeded_rng}; use crate::wchar::{L, WString, wstr}; -use crate::wutil::encoding::{ - AT_LEAST_MB_LEN_MAX, probe_is_multibyte_locale, wcrtomb, zero_mbstate, -}; use rand::{Rng, RngCore}; -/// wcs2bytes is locale-dependent, so ensure we have a multibyte locale -/// before using it in a test. -fn setlocale() -> MutexGuard<'static, ()> { - let guard = LOCALE_LOCK.lock().unwrap(); - - #[rustfmt::skip] - const UTF8_LOCALES: &[&str] = &[ - "C.UTF-8", "en_US.UTF-8", "en_GB.UTF-8", "de_DE.UTF-8", "C.utf8", "UTF-8", - ]; - if probe_is_multibyte_locale() { - return guard; - } - for locale in UTF8_LOCALES { - let locale = std::ffi::CString::new(locale.to_owned()).unwrap(); - unsafe { libc::setlocale(libc::LC_CTYPE, locale.as_ptr()) }; - if probe_is_multibyte_locale() { - fish_setlocale(); // Update cached locale information. - return guard; - } - } - panic!("No UTF-8 locale found"); -} - #[test] fn test_escape_string() { let regex = |input| escape_string(input, EscapeStringStyle::Regex); @@ -105,7 +76,6 @@ fn test_escape_var() { } fn escape_test(escape_style: EscapeStringStyle, unescape_style: UnescapeStringStyle) { - let _locale_guard = setlocale(); let seed: u128 = 92348567983274852905629743984572; let mut rng = get_seeded_rng(seed); @@ -185,7 +155,6 @@ fn bytes2hex(input: &[u8]) -> String { /// string comes back through double conversion. 
#[test] fn test_convert() { - let _locale_guard = setlocale(); let seed = get_rng_seed(); let mut rng = get_seeded_rng(seed); let mut origin = Vec::new(); @@ -241,30 +210,18 @@ fn test_convert_ascii() { } } -/// fish uses the private-use range to encode bytes that could not be decoded using the -/// user's locale. If the input could be decoded, but decoded to private-use codepoints, -/// then fish should also use the direct encoding for those bytes. Verify that characters -/// in the private use area are correctly round-tripped. See #7723. +/// fish uses the private-use range to encode bytes that are not valid UTF-8. +/// If the input decodes to these private-use codepoints, +/// then fish should also use the direct encoding for those bytes. +/// Verify that characters in the private use area are correctly round-tripped. See #7723. #[test] fn test_convert_private_use() { for c in ENCODE_DIRECT_BASE..ENCODE_DIRECT_END { - // Encode the char via the locale. Do not use fish functions which interpret these - // specially. - let mut converted = [0_u8; AT_LEAST_MB_LEN_MAX]; - let mut state = zero_mbstate(); - let len = unsafe { - wcrtomb( - std::ptr::addr_of_mut!(converted[0]).cast(), - c as u32, - &mut state, - ) - }; - if len == 0_usize.wrapping_sub(1) { - // Could not be encoded in this locale. - continue; - } - let s = &converted[..len]; - + // A `char` represents a Unicode scalar value, which takes up at most 4 bytes when encoded in UTF-8. + // TODO MSRV(1.92?) replace 4 by `char::MAX_LEN_UTF8` once that's available in our MSRV. + // https://doc.rust-lang.org/std/primitive.char.html#associatedconstant.MAX_LEN_UTF8 + let mut converted = [0_u8; 4]; + let s = c.encode_utf8(&mut converted).as_bytes(); // Ask fish to decode this via bytes2wcstring. // bytes2wcstring should notice that the decoded form collides with its private use // and encode it directly.
diff --git a/src/wcstringutil.rs b/src/wcstringutil.rs index 808b8abd2..587deda93 100644 --- a/src/wcstringutil.rs +++ b/src/wcstringutil.rs @@ -1,10 +1,8 @@ //! Helper functions for working with wcstring. -use crate::common::{get_ellipsis_char, get_ellipsis_str, get_is_multibyte_locale}; +use crate::common::{get_ellipsis_char, get_ellipsis_str}; use crate::fallback::{fish_wcwidth, wcscasecmp, wcscasecmp_fuzzy}; -use crate::flog::FLOGF; use crate::wchar::{decode_byte_from_char, prelude::*}; -use crate::wutil::encoding::{AT_LEAST_MB_LEN_MAX, wcrtomb, zero_mbstate}; /// Return the number of newlines in a string. pub fn count_newlines(s: &wstr) -> usize { @@ -299,56 +297,28 @@ pub fn string_fuzzy_match_string( } /// Implementation of wcs2bytes that accepts a callback. -/// This invokes `func` with (const char*, size_t) pairs. +/// This invokes `func` with byte slices containing the UTF-8 encoding of the characters in the +/// input, doing one invocation per character. /// If `func` returns false, it stops; otherwise it continues. /// Return false if the callback returned false, otherwise true. pub fn wcs2bytes_callback(input: &wstr, mut func: impl FnMut(&[u8]) -> bool) -> bool { - let mut state = zero_mbstate(); - let mut converted = [0_u8; AT_LEAST_MB_LEN_MAX]; - - let is_singlebyte_locale = !get_is_multibyte_locale(); + // A `char` represents a Unicode scalar value, which takes up at most 4 bytes when encoded in UTF-8. + let mut converted = [0_u8; 4]; for c in input.chars() { - if let Some(byte) = decode_byte_from_char(c) { + let bytes = if let Some(byte) = decode_byte_from_char(c) { converted[0] = byte; - if !func(&converted[..1]) { - return false; - } - } else if is_singlebyte_locale { - // single-byte locale (C/POSIX/ISO-8859) - // If `c` contains a wide character we emit a question-mark.
- converted[0] = u8::try_from(u32::from(c)).unwrap_or(b'?'); - if !func(&converted[..1]) { - return false; - } + &converted[..=0] } else { - converted = [0; AT_LEAST_MB_LEN_MAX]; - let len = unsafe { - wcrtomb( - std::ptr::addr_of_mut!(converted[0]).cast(), - c as u32, - &mut state, - ) - }; - if len == 0_usize.wrapping_sub(1) { - wcs2bytes_bad_char(c); - state = zero_mbstate(); - } else if !func(&converted[..len]) { - return false; - } + c.encode_utf8(&mut converted).as_bytes() + }; + if !func(bytes) { + return false; } } true } -fn wcs2bytes_bad_char(c: char) { - FLOGF!( - char_encoding, - L!("Wide character U+%4X has no narrow representation"), - c - ); -} - /// Split a string by runs of any of the separator characters provided in `seps`. /// Note the delimiters are the characters in `seps`, not `seps` itself. /// `seps` may contain the NUL character. diff --git a/src/wutil/encoding.rs b/src/wutil/encoding.rs deleted file mode 100644 index eb6db56a6..000000000 --- a/src/wutil/encoding.rs +++ /dev/null @@ -1,50 +0,0 @@ -extern "C" { - #[cfg_attr(cygwin, link_name = "c32rtomb")] - pub fn wcrtomb(s: *mut libc::c_char, wc: u32, ps: *mut mbstate_t) -> usize; - #[cfg_attr(cygwin, link_name = "mbrtoc32")] - pub fn mbrtowc(pwc: *mut u32, s: *const libc::c_char, n: usize, p: *mut mbstate_t) -> usize; -} - -// HACK This should be mbstate_t from libc but that's not exposed. Since it's only written by -// libc, we define it as opaque type that should be large enough for all implementations. -pub type mbstate_t = [u64; 16]; - -#[inline] -pub fn zero_mbstate() -> mbstate_t { - [0; 16] -} - -// HACK This should be the MB_LEN_MAX macro from libc but that's not easy to get. -pub const AT_LEAST_MB_LEN_MAX: usize = 32; - -/// Return true if we believe we are in a multibyte locale. -/// Note this reads the current locale and is modestly expensive - prefer the cached -/// values in `common.rs` which is set by `fish_setlocale`. 
-pub fn probe_is_multibyte_locale() -> bool { - // In general we would like to read MB_CUR_MAX, but that is not exposed by Rust libc. - // Instead, check if mbrtowc for any byte in the range 0-255 returns (size_t)(-2) which indicates - // the presence of a multibyte locale. - #[inline] - fn is_mb_lead(b: u8) -> bool { - let mut st = zero_mbstate(); - let mut wc: libc::wchar_t = 0; - let c = b as libc::c_char; - let n = unsafe { - mbrtowc( - std::ptr::addr_of_mut!(wc).cast::(), - std::ptr::addr_of!(c), - 1, - std::ptr::addr_of_mut!(st), - ) - }; - n == (-2_i64 as libc::size_t) - } - - // Fast path: check common lead bytes. - if is_mb_lead(0xE0) || is_mb_lead(0xC2) { - return true; - } - - // Scan non-ASCII high bytes. - (0x80_u8..=0xFF).any(is_mb_lead) -} diff --git a/src/wutil/mod.rs b/src/wutil/mod.rs index 565268f63..85b4706bb 100644 --- a/src/wutil/mod.rs +++ b/src/wutil/mod.rs @@ -1,5 +1,4 @@ pub mod dir_iter; -pub mod encoding; pub mod errors; pub mod fileid; pub mod gettext; diff --git a/tests/checks/locale.fish b/tests/checks/locale.fish index 7b8d0d526..e8f233c03 100644 --- a/tests/checks/locale.fish +++ b/tests/checks/locale.fish @@ -3,10 +3,6 @@ # see #7934. #REQUIRES: test -z "$GITHUB_WORKFLOW" -# We typically try to force a utf8-capable locale, -# this turns that off. -set -gx fish_allow_singlebyte_locale 1 - # A function to display bytes, necessary because GNU and BSD implementations of `od` have different output. # We used to use xxd, but it's not available everywhere. See #3797. # @@ -22,73 +18,12 @@ echo -n A\u00FCA | display_bytes #CHECK: 0000000 101 303 274 101 #CHECK: 0000004 -# Verify that exporting a change to the C locale produces the expected output. -# The output should include the literal byte \xFC rather than the UTF-8 sequence for \u00FC. 
-begin - set -lx LC_ALL C - echo -n B\u00FCB | display_bytes -end -#CHECK: 0000000 102 374 102 -#CHECK: 0000003 - # Since the previous change was localized to a block it should no # longer be in effect and we should be back to a UTF-8 locale. echo -n C\u00FCC | display_bytes #CHECK: 0000000 103 303 274 103 #CHECK: 0000004 -# Verify that setting a non-exported locale var doesn't affect the behavior. -# The output should include the UTF-8 sequence for \u00FC rather than that literal byte. -# Just like the previous test. -begin - set -l LC_ALL C - echo -n D\u00FCD | display_bytes -end -#CHECK: 0000000 104 303 274 104 -#CHECK: 0000004 - -# Verify that fish can pass through non-ASCII characters in the C/POSIX -# locale. This is to prevent regression of -# https://github.com/fish-shell/fish-shell/issues/2802. -# -# These tests are needed because the relevant standards allow the functions -# mbrtowc() and wcrtomb() to treat bytes with the high bit set as either valid -# or invalid in the C/POSIX locales. GNU libc treats those bytes as invalid. -# Other libc implementations (e.g., BSD) treat them as valid. We want fish to -# always treat those bytes as valid. - -# The fish in the middle of the pipeline should be receiving a UTF-8 encoded -# version of the unicode from the echo. It should pass those bytes thru -# literally since it is in the C locale. We verify this by first passing the -# echo output directly to the `xxd` program then via a fish instance. The -# output should be "58c3bb58" for the first statement and "58c3bc58" for the -# second. -echo -n X\u00FBX | display_bytes -echo X\u00FCX | env LC_ALL=C $fish -c 'read foo; echo -n $foo' | display_bytes -#CHECK: 0000000 130 303 273 130 -#CHECK: 0000004 -#CHECK: 0000000 130 303 274 130 -#CHECK: 0000004 - -# The next tests deliberately spawn another fish instance to test inheritance of env vars. - -# This test is subtle. 
Despite the presence of the \u00fc unicode char (a "u" -# with an umlaut) the fact the locale is C/POSIX will cause the \xfc byte to -# be emitted rather than the usual UTF-8 sequence \xc3\xbc. That's because the -# few single-byte unicode chars (that are not ASCII) are generally in the -# ISO 8859-x char sets which are encompassed by the C locale. The output should -# be "59fc59". -env LC_ALL=C $fish -c 'echo -n Y\u00FCY' | display_bytes -#CHECK: 0000000 131 374 131 -#CHECK: 0000003 - -# The user can specify a wide unicode character (one requiring more than a -# single byte). In the C/POSIX locales we substitute a question-mark for the -# unencodable wide char. The output should be "543f54". -env LC_ALL=C $fish -c 'echo -n T\u01FDT' | display_bytes -#CHECK: 0000000 124 077 124 -#CHECK: 0000003 - string match ö \Xc3\Xb6 #CHECK: ö