encoding: use UTF-8 everywhere

Assume that UTF-8 is used everywhere. This allows for significant simplification of encoding-related functionality. We no longer need branching on single-byte vs. multi-byte locales, and we can get rid of all the libc calls for encoding and decoding, replacing them with Rust's built-in functionality, or removing them without replacement in cases where their functionality is no longer needed. Several tests are removed from `tests/checks/locale.fish`, since setting the locale no longer impacts encoding behavior. We might want more rigorous testing of UTF-8 handling instead. Closes #11975
2026-06-03 23:11:14 -03:00 · 2025-10-18 23:09:10 +02:00
parent 755d5ae222
commit c8001b5023
12 changed files with 136 additions and 497 deletions
--- a/CHANGELOG.rst
+++ b/CHANGELOG.rst
@@ -9,6 +9,8 @@ Notable improvements and fixes
 Deprecations and removed features
 ---------------------------------

+- Fish now assumes UTF-8 everywhere, regardless of locale settings. Input bytes which are not valid UTF-8 should still be round-tripped correctly.
+
 Interactive improvements
 ------------------------
 - :doc:`fish_config prompt {choose,save} <cmds/fish_config>` have been taught to reset :doc:`fish_mode_prompt <cmds/fish_mode_prompt>` in addition to the other prompt functions (:issue:`11937`).
--- a/src/builtins/read.rs
+++ b/src/builtins/read.rs
@@ -13,7 +13,7 @@
 use crate::env::{EnvVar, EnvVarFlags};
 use crate::input_common::DecodeState;
 use crate::input_common::InvalidPolicy;
-use crate::input_common::decode_input_byte;
+use crate::input_common::decode_one_codepoint_utf8;
 use crate::nix::isatty;
 use crate::reader::ReaderConfig;
 use crate::reader::commandline_set_buffer;
@@ -27,7 +27,6 @@
 use crate::wcstringutil::split_about;
 use crate::wcstringutil::split_string_tok;
 use crate::wutil;
-use crate::wutil::encoding::zero_mbstate;
 use crate::wutil::perror;
 use libc::SEEK_CUR;
 use std::num::NonZeroUsize;
@@ -389,8 +388,6 @@ fn read_one_char_at_a_time(
    let mut unconsumed = vec![];

    loop {
-        let mut state = zero_mbstate();
-
        let chars_read = buff.len();
        let res = loop {
            let mut b = [0_u8; 1];
@@ -400,17 +397,9 @@ fn read_one_char_at_a_time(
                }
                _ => {}
            }
-            let b = b[0];
-            unconsumed.push(b);
+            unconsumed.push(b[0]);
            nbytes += 1;
-            let mut consumed = 0;
-            match decode_input_byte(
-                buff,
-                InvalidPolicy::Passthrough,
-                &mut state,
-                &unconsumed,
-                &mut consumed,
-            ) {
+            match decode_one_codepoint_utf8(buff, InvalidPolicy::Passthrough, &unconsumed) {
                DecodeState::Incomplete => continue,
                DecodeState::Complete => {
                    unconsumed.clear();
--- a/src/common.rs
+++ b/src/common.rs
@@ -16,9 +16,6 @@
 use crate::wchar::{decode_byte_from_char, encode_byte_to_char, prelude::*};
 use crate::wcstringutil::wcs2bytes_callback;
 use crate::wildcard::{ANY_CHAR, ANY_STRING, ANY_STRING_RECURSIVE};
-use crate::wutil::encoding::{
-    AT_LEAST_MB_LEN_MAX, mbrtowc, probe_is_multibyte_locale, wcrtomb, zero_mbstate,
-};
 use crate::wutil::fish_iswalnum;
 use bitflags::bitflags;
 use libc::{SIG_IGN, SIGTTOU, STDIN_FILENO};
@@ -190,7 +187,7 @@ fn escape_string_script(input: &wstr, flags: EscapeFlags) -> WString {
    let no_quoted = flags.contains(EscapeFlags::NO_QUOTED);
    let no_tilde = flags.contains(EscapeFlags::NO_TILDE);
    let no_qmark = feature_test(FeatureFlag::qmark_noglob);
-    let symbolic = flags.contains(EscapeFlags::SYMBOLIC) && get_is_multibyte_locale();
+    let symbolic = flags.contains(EscapeFlags::SYMBOLIC);

    assert!(
        !symbolic || !escape_printables,
@@ -1035,19 +1032,15 @@ pub fn shell_modes() -> MutexGuard<'static, libc::termios> {
 /// The character to use where the text has been truncated. Is an ellipsis on unicode system and a $
 /// on other systems.
 pub fn get_ellipsis_char() -> char {
-    char::from_u32(ELLIPSIS_CHAR.load(Ordering::Relaxed)).unwrap()
+    '\u{2026}'
 }

-static ELLIPSIS_CHAR: AtomicU32 = AtomicU32::new(0);
-
 /// The character or string to use where text has been truncated (ellipsis if possible, otherwise
 /// ...)
 pub fn get_ellipsis_str() -> &'static wstr {
-    ELLIPSIS_STRING.load()
+    L!("\u{2026}")
 }

-static ELLIPSIS_STRING: AtomicRef<wstr> = AtomicRef::new(&L!(""));
-
 /// Character representing an omitted newline at the end of text.
 pub fn get_omitted_newline_str() -> &'static wstr {
    OMITTED_NEWLINE_STR.load()
@@ -1065,13 +1058,6 @@ pub fn get_obfuscation_read_char() -> char {
    char::from_u32(OBFUSCATION_READ_CHAR.load(Ordering::Relaxed)).unwrap()
 }

-static IS_MB_LOCALE: RelaxedAtomicBool = RelaxedAtomicBool::new(false);
-
-/// Whether we believe we are in a multibyte locale.
-pub fn get_is_multibyte_locale() -> bool {
-    IS_MB_LOCALE.load()
-}
-
 /// Profiling flag. True if commands should be profiled.
 pub static PROFILING_ACTIVE: RelaxedAtomicBool = RelaxedAtomicBool::new(false);

@@ -1102,84 +1088,54 @@ pub fn has_working_tty_timestamps() -> bool {
 /// todo!("Maybe remove the box? It is only needed for get_bg_context.")
 pub type CancelChecker = Box<dyn Fn() -> bool>;

-/// Converts the narrow character string \c in into its wide equivalent, and return it.
-///
-/// The string may contain embedded nulls.
-///
-/// This function encodes illegal character sequences in a reversible way using the private use
-/// area.
-pub fn bytes2wcstring(inp: &[u8]) -> WString {
-    if inp.is_empty() {
+/// Encodes the bytes in `input` into a [`WString`], encoding non-UTF-8 bytes into private-use-area
+/// code-points. Bytes which would be parsed into our reserved PUA range are encoded individually,
+/// to allow for correct round-tripping.
+pub fn bytes2wcstring(mut input: &[u8]) -> WString {
+    if input.is_empty() {
        return WString::new();
    }

    let mut result = WString::new();
-    result.reserve(inp.len());
-    let mut pos = 0;
-    let mut state = zero_mbstate();
-    while pos < inp.len() {
-        // Append any initial sequence of ascii characters.
-        // Note we do not support character sets which are not supersets of ASCII.
-        let ascii_prefix_length = count_ascii_prefix(&inp[pos..]);
-        result.push_str(std::str::from_utf8(&inp[pos..pos + ascii_prefix_length]).unwrap());
-        pos += ascii_prefix_length;
-        assert!(pos <= inp.len(), "Position overflowed length");
-        if pos == inp.len() {
-            break;
-        }

-        // We have found a non-ASCII character.
-        let mut ret = 0;
-        let mut c = '\0';
-
-        let use_encode_direct = if inp[pos] & 0xF8 == 0xF8 {
-            // Protect against broken mbrtowc() implementations which attempt to encode UTF-8
-            // sequences longer than four bytes (e.g., OS X Snow Leopard).
-            // TODO This check used to be conditionally compiled only on affected platforms.
-            true
-        } else {
-            let mut codepoint = u32::from(c);
-            ret = unsafe {
-                mbrtowc(
-                    std::ptr::addr_of_mut!(codepoint),
-                    std::ptr::addr_of!(inp[pos]).cast(),
-                    inp.len() - pos,
-                    &mut state,
-                )
-            };
-            match char::from_u32(codepoint) {
-                Some(codepoint) => {
-                    c = codepoint;
-                    // Determine whether to encode this character with our crazy scheme.
-                    fish_reserved_codepoint(c)
-                    ||
-                    // Incomplete sequence.
-                    ret == 0_usize.wrapping_sub(2)
-                    ||
-                    // Invalid data.
-                    ret == 0_usize.wrapping_sub(1)
-                    ||
-                    // Other error codes? Terrifying, should never happen.
-                    ret > inp.len() - pos
+    fn append_escaped_str(output: &mut WString, input: &str) {
+        for (i, c) in input.char_indices() {
+            if fish_reserved_codepoint(c) {
+                for byte in &input.as_bytes()[i..i + c.len_utf8()] {
+                    output.push(encode_byte_to_char(*byte));
                }
-                None => true,
+            } else {
+                output.push(c);
            }
-        };
+        }
+    }

-        if use_encode_direct {
-            c = encode_byte_to_char(inp[pos]);
-            result.push(c);
-            pos += 1;
-            state = zero_mbstate();
-        } else if ret == 0 {
-            // embedded null byte!
-            result.push('\0');
-            pos += 1;
-            state = zero_mbstate();
-        } else {
-            // normal case
-            result.push(c);
-            pos += ret;
+    while !input.is_empty() {
+        match std::str::from_utf8(input) {
+            Ok(parsed_str) => {
+                append_escaped_str(&mut result, parsed_str);
+                // The entire remaining input could be parsed, so we are done.
+                break;
+            }
+            Err(e) => {
+                let (valid, after_valid) = input.split_at(e.valid_up_to());
+                // SAFETY: The previous `str::from_utf8` call established that the prefix `valid`
+                // is valid UTF-8. This prefix may be empty.
+                let parsed_str = unsafe { std::str::from_utf8_unchecked(valid) };
+                append_escaped_str(&mut result, parsed_str);
+                // The length of the prefix of `after_valid` which is invalid UTF-8.
+                // The remaining bytes of `input` (if any) will be parsed in subsequent iterations
+                // of the loop, starting from the first byte that starts a valid UTF-8-encoded codepoint.
+                // `error_len` can return `None`, if it sees a byte sequence that could be the
+                // prefix of a valid code-point encoding at the end of the byte slice.
+                // This is useful when the input is chunked, but we don't do that, so in this case
+                // we use our custom encoding for all remaining bytes (at most 3).
+                let error_len = e.error_len().unwrap_or(after_valid.len());
+                for byte in &after_valid[..error_len] {
+                    result.push(encode_byte_to_char(*byte));
+                }
+                input = &after_valid[error_len..];
+            }
        }
    }
    result
@@ -1265,12 +1221,6 @@ pub fn wcs2bytes_appending(output: &mut Vec<u8>, input: &wstr) {
    });
 }

-/// Return the count of initial characters in `in` which are ASCII.
-fn count_ascii_prefix(inp: &[u8]) -> usize {
-    // The C++ version had manual vectorization.
-    inp.iter().take_while(|c| c.is_ascii()).count()
-}
-
 // Check if we are running in the test mode, where we should suppress error output
 pub const TESTS_PROGRAM_NAME: &wstr = L!("(ignore)");

@@ -1302,22 +1252,6 @@ macro_rules! LL {
        }};
    }

-    // Mark if we are a multibyte locale.
-    IS_MB_LOCALE.store(probe_is_multibyte_locale());
-
-    // Use various Unicode symbols if they can be encoded using the current locale, else a simple
-    // ASCII char alternative. All of the can_be_encoded() invocations should return the same
-    // true/false value since the code points are in the BMP but we're going to be paranoid. This
-    // is also technically wrong if we're not in a Unicode locale but we expect (or hope)
-    // can_be_encoded() will return false in that case.
-    if can_be_encoded('\u{2026}') {
-        ELLIPSIS_CHAR.store(u32::from('\u{2026}'), Ordering::Relaxed);
-        ELLIPSIS_STRING.store(LL!("\u{2026}"));
-    } else {
-        ELLIPSIS_CHAR.store(u32::from('$'), Ordering::Relaxed); // "horizontal ellipsis"
-        ELLIPSIS_STRING.store(LL!("..."));
-    }
-
    if is_windows_subsystem_for_linux(WSL::Any) {
        // neither of \u23CE and \u25CF can be displayed in the default fonts on Windows, though
        // they can be *encoded* just fine. Use alternative glyphs.
@@ -1327,29 +1261,16 @@ macro_rules! LL {
        OMITTED_NEWLINE_STR.store(LL!("^J"));
        OBFUSCATION_READ_CHAR.store(u32::from('*'), Ordering::Relaxed);
    } else {
-        if can_be_encoded('\u{23CE}') {
-            OMITTED_NEWLINE_STR.store(LL!("\u{23CE}")); // "return symbol" (⏎)
-        } else {
-            OMITTED_NEWLINE_STR.store(LL!("^J"));
-        }
+        OMITTED_NEWLINE_STR.store(LL!("\u{23CE}")); // "return symbol" (⏎)
        OBFUSCATION_READ_CHAR.store(
-            u32::from(if can_be_encoded('\u{25CF}') {
-                '\u{25CF}' // "black circle"
-            } else {
-                '#'
-            }),
+            u32::from(
+                '\u{25CF}', // "black circle"
+            ),
            Ordering::Relaxed,
        );
    }
 }

-/// Test if the character can be encoded using the current locale.
-fn can_be_encoded(wc: char) -> bool {
-    let mut converted = [0 as libc::c_char; AT_LEAST_MB_LEN_MAX];
-    let mut state = zero_mbstate();
-    unsafe { wcrtomb(converted.as_mut_ptr(), wc as u32, &mut state) != 0_usize.wrapping_sub(1) }
-}
-
 /// Call read, blocking and repeating on EINTR. Exits on EAGAIN.
 /// Return the number of bytes read, or 0 on EOF, or an error.
 pub fn read_blocked(fd: RawFd, buf: &mut [u8]) -> nix::Result<usize> {
--- a/src/env_dispatch.rs
+++ b/src/env_dispatch.rs
@@ -14,7 +14,6 @@
 use crate::terminal::use_terminfo;
 use crate::tty_handoff::xtversion;
 use crate::wchar::prelude::*;
-use crate::wutil::encoding::probe_is_multibyte_locale;
 use crate::wutil::fish_wcstoi;
 use crate::{function, terminal};
 use std::borrow::Cow;
@@ -25,12 +24,16 @@

 /// List of all locale environment variable names that might trigger (re)initializing of the locale
 /// subsystem. These are only the variables we're possibly interested in.
-#[rustfmt::skip]
-const LOCALE_VARIABLES: [&wstr; 10] = [
-    L!("LANG"),       L!("LANGUAGE"), L!("LC_ALL"),
-    L!("LC_COLLATE"), L!("LC_CTYPE"), L!("LC_MESSAGES"),
-    L!("LC_NUMERIC"), L!("LC_TIME"),  L!("LOCPATH"),
-    L!("fish_allow_singlebyte_locale"),
+const LOCALE_VARIABLES: [&wstr; 9] = [
+    L!("LANG"),
+    L!("LANGUAGE"),
+    L!("LC_ALL"),
+    L!("LC_COLLATE"),
+    L!("LC_CTYPE"),
+    L!("LC_MESSAGES"),
+    L!("LC_NUMERIC"),
+    L!("LC_TIME"),
+    L!("LOCPATH"),
 ];

 #[rustfmt::skip]
@@ -299,8 +302,6 @@ fn handle_tz_change(var_name: &wstr, vars: &EnvStack) {

 fn handle_locale_change(vars: &EnvStack) {
    init_locale(vars);
-    // We need to re-guess emoji width because the locale might have changed to a multibyte one.
-    guess_emoji_width(vars);
 }

 fn handle_term_change(vars: &EnvStack) {
@@ -502,11 +503,6 @@ pub fn read_terminfo_database(vars: &EnvStack) {
 fn init_locale(vars: &EnvStack) {
    let _guard = crate::locale::LOCALE_LOCK.lock().unwrap();

-    #[rustfmt::skip]
-    const UTF8_LOCALES: &[&str] = &[
-        "C.UTF-8", "en_US.UTF-8", "en_GB.UTF-8", "de_DE.UTF-8", "C.utf8", "UTF-8",
-    ];
-
    let old_msg_locale: CString = {
        let old = unsafe { libc::setlocale(libc::LC_MESSAGES, ptr::null()) };
        assert_ne!(old, ptr::null_mut());
@@ -541,33 +537,6 @@ fn init_locale(vars: &EnvStack) {
        }
    };

-    // Try to get a multibyte-capable encoding.
-    // A "C" locale is broken for our purposes: any wchar function will break on it. So we try
-    // *really, really, really hard* to not have one.
-    let fix_locale = vars
-        .get_unless_empty(L!("fish_allow_singlebyte_locale"))
-        .map(|v| v.as_string())
-        .map(|allow_c| !crate::wcstringutil::bool_from_string(&allow_c))
-        .unwrap_or(true);
-
-    if fix_locale && !probe_is_multibyte_locale() {
-        FLOG!(env_locale, "Have single byte locale, trying to fix.");
-        let mut fixed = false;
-        for locale in UTF8_LOCALES {
-            let locale_cstr = CString::new(*locale).unwrap();
-            // this can fail, that is fine
-            unsafe { libc::setlocale(libc::LC_CTYPE, locale_cstr.as_ptr()) };
-            if probe_is_multibyte_locale() {
-                FLOG!(env_locale, "Fixed locale:", locale);
-                fixed = true;
-                break;
-            }
-        }
-        if !fixed {
-            FLOG!(env_locale, "Failed to fix locale.");
-        }
-    }
-
    // We *always* use a C-locale for numbers because we want '.' (except for in printf).
    let loc_ptr = unsafe { libc::setlocale(libc::LC_NUMERIC, c"C".as_ptr().cast()) };
    // should never fail, the C locale should always be defined
--- a/src/input_common.rs
+++ b/src/input_common.rs
@@ -1,6 +1,6 @@
 use crate::common::{
-    WSL, bytes2wcstring, fish_reserved_codepoint, get_is_multibyte_locale,
-    is_windows_subsystem_for_linux, read_blocked, shell_modes,
+    WSL, bytes2wcstring, fish_reserved_codepoint, is_windows_subsystem_for_linux, read_blocked,
+    shell_modes,
 };
 use crate::env::{EnvStack, Environment};
 use crate::fd_readable_set::{FdReadableSet, Timeout};
@@ -17,7 +17,6 @@
 };
 use crate::universal_notifier::default_notifier;
 use crate::wchar::{encode_byte_to_char, prelude::*};
-use crate::wutil::encoding::{mbrtowc, mbstate_t, zero_mbstate};
 use crate::wutil::{fish_is_pua, fish_wcstol};
 use std::cell::{RefCell, RefMut};
 use std::collections::VecDeque;
@@ -826,9 +825,7 @@ fn try_pop(&mut self) -> Option<CharEvent> {
        self.get_input_data_mut().queue.pop_front()
    }

-    /// Function used by [`readch`](Self::readch) to read bytes from stdin until enough bytes have been read to
-    /// convert them to a wchar_t. Conversion is done using mbrtowc. If a character has previously
-    /// been read and then 'unread' using \c input_common_unreadch, that character is returned.
+    /// Read the next event, such as a UTF-8-encoded codepoint.
    fn readch(&mut self) -> CharEvent {
        loop {
            // Do we have something enqueued already?
@@ -871,7 +868,7 @@ fn readch(&mut self) -> CharEvent {
                InputEventTrigger::Byte(read_byte) => {
                    let mut have_escape_prefix = false;
                    let mut buffer = vec![read_byte];
-                    let key_with_escape = if read_byte == 0x1b {
+                    let mut key = if read_byte == 0x1b {
                        self.parse_escape_sequence(&mut buffer, &mut have_escape_prefix)
                    } else {
                        canonicalize_control_char(read_byte).map(KeyEvent::from)
@@ -883,47 +880,35 @@ fn readch(&mut self) -> CharEvent {
                        continue;
                    }
                    let mut seq = WString::new();
-                    let mut key = key_with_escape;
                    if key.is_some_and(|key| key.key == Key::from_raw(key::Invalid)) {
                        continue;
                    }
                    assert!(key.is_none_or(|key| key.codepoint != key::Invalid));
-                    let mut consumed = 0;
-                    let mut state = zero_mbstate();
-                    let mut i = 0;
+                    // At this point, the bytes in `buffer` should be parsed as a UTF-8 sequence,
+                    // or, if they are not valid UTF-8, ignored. On incomplete sequences, another
+                    // byte is read and decoding is tried again in the next iteration.
                    let ok = loop {
-                        if i == buffer.len() {
-                            buffer.push(
-                                match next_input_event(self.get_in_fd(), Timeout::Forever) {
-                                    InputEventTrigger::Byte(b) => b,
-                                    _ => 0,
-                                },
-                            );
-                        }
-                        match decode_input_byte(
-                            &mut seq,
-                            InvalidPolicy::Error,
-                            &mut state,
-                            &buffer[..i + 1],
-                            &mut consumed,
-                        ) {
-                            DecodeState::Incomplete => (),
+                        match decode_one_codepoint_utf8(&mut seq, InvalidPolicy::Error, &buffer) {
+                            DecodeState::Incomplete => {
+                                buffer.push(
+                                    match next_input_event(self.get_in_fd(), Timeout::Forever) {
+                                        InputEventTrigger::Byte(b) => b,
+                                        _ => 0,
+                                    },
+                                );
+                            }
                            DecodeState::Complete => {
-                                if have_escape_prefix && i != 0 {
-                                    have_escape_prefix = false;
+                                if have_escape_prefix {
                                    let c = seq.as_char_slice().last().unwrap();
                                    key = Some(KeyEvent::from(alt(*c)));
                                }
-                                if i + 1 == buffer.len() {
-                                    break true;
-                                }
+                                break true;
                            }
                            DecodeState::Error => {
                                self.push_front(CharEvent::from_check_exit());
                                break false;
                            }
                        }
-                        i += 1;
                    };
                    if !ok {
                        continue;
@@ -1686,63 +1671,37 @@ pub(crate) enum InvalidPolicy {
    Passthrough,
 }

-pub(crate) fn decode_input_byte(
+pub(crate) fn decode_one_codepoint_utf8(
    out_seq: &mut WString,
    invalid_policy: InvalidPolicy,
-    state: &mut mbstate_t,
    buffer: &[u8],
-    consumed: &mut usize,
 ) -> DecodeState {
    use DecodeState::*;
-    let mut res: char = '\0';
-    let read_byte = *buffer.last().unwrap();
-    if !get_is_multibyte_locale() {
-        // single-byte locale, all values are legal
-        res = read_byte.into();
-        out_seq.push(res);
-        return Complete;
-    }
-    let mut invalid = |out_seq: &mut WString, log_error: fn()| match invalid_policy {
-        InvalidPolicy::Error => {
-            (log_error)();
-            Error
-        }
-        InvalidPolicy::Passthrough => {
-            for &b in &buffer[*consumed..] {
-                out_seq.push(encode_byte_to_char(b));
+    match std::str::from_utf8(buffer) {
+        Ok(parsed_str) => {
+            for c in parsed_str.chars() {
+                if !fish_reserved_codepoint(c) {
+                    out_seq.push(c);
+                }
            }
-            *consumed = buffer.len();
            Complete
        }
-    };
-    let mut codepoint = u32::from(res);
-    match unsafe {
-        mbrtowc(
-            std::ptr::addr_of_mut!(codepoint),
-            std::ptr::addr_of!(read_byte).cast(),
-            1,
-            state,
-        )
-    } as isize
-    {
-        -1 => {
-            return invalid(out_seq, || FLOG!(reader, "Illegal input encoding"));
-        }
-        -2 => {
-            // Sequence not yet complete.
-            return Incomplete;
-        }
-        _ => (),
+        Err(e) => match e.error_len() {
+            Some(_) => match invalid_policy {
+                InvalidPolicy::Error => {
+                    FLOG!(reader, "Illegal input encoding");
+                    Error
+                }
+                InvalidPolicy::Passthrough => {
+                    for &b in buffer {
+                        out_seq.push(encode_byte_to_char(b));
+                    }
+                    Complete
+                }
+            },
+            None => Incomplete,
+        },
    }
-    if let Some(res) = char::from_u32(codepoint) {
-        // Sequence complete.
-        if !fish_reserved_codepoint(res) {
-            *consumed += 1;
-            out_seq.push(res);
-            return Complete;
-        }
-    }
-    invalid(out_seq, || FLOG!(reader, "Illegal codepoint"))
 }

 pub(crate) fn stop_query(mut query: RefMut<'_, Option<TerminalQuery>>) -> bool {
--- a/src/pager.rs
+++ b/src/pager.rs
@@ -5,7 +5,6 @@

 use crate::common::{
    EscapeFlags, EscapeStringStyle, escape_string, get_ellipsis_char, get_ellipsis_str,
-    get_is_multibyte_locale,
 };
 use crate::complete::Completion;
 use crate::editable_line::EditableLine;
@@ -1231,19 +1230,16 @@ fn process_completions_into_infos(lst: &[Completion]) -> Vec<PagerComp> {
                EscapeFlags::NO_PRINTABLES | EscapeFlags::NO_QUOTED | EscapeFlags::SYMBOLIC,
            ),
        ));
-        if comp.replaces_line()
-            // HACK We want to render a full shell command, with syntax highlighting.  Above we
-            // escape nonprintables, which might make the rendered command longer than the original
-            // completion. In that case we get wrong colors.  However this should only happen in
-            // contrived cases, since our symbolic escaping uses a single character to represent
-            // newline and tab characters; other nonprintables are extremely rare in a command
-            // line. It will only be common for single-byte locales where we don't
-            // use Unicode characters for escaping, so just disable those here.
-            // We should probably fix this by first highlighting the original completion, and
-            // then writing a variant of escape_string() that adjusts highlighting according
-            // so it matches the escaped string.
-            && get_is_multibyte_locale()
-        {
+        // HACK We want to render a full shell command, with syntax highlighting.  Above we
+        // escape nonprintables, which might make the rendered command longer than the original
+        // completion. In that case we get wrong colors.  However this should only happen in
+        // contrived cases, since our symbolic escaping uses a single character to represent
+        // newline and tab characters; other nonprintables are extremely rare in a command
+        // line.
+        // We should probably fix this by first highlighting the original completion, and
+        // then writing a variant of escape_string() that adjusts highlighting according
+        // so it matches the escaped string.
+        if comp.replaces_line() {
            highlight_shell(
                &comp.completion,
                &mut comp_info.colors,
--- a/src/reader.rs
+++ b/src/reader.rs
@@ -55,9 +55,8 @@
 use crate::common::ScopeGuarding;
 use crate::common::{
    EscapeFlags, EscapeStringStyle, PROGRAM_NAME, ScopeGuard, UTF8_BOM_WCHAR, bytes2wcstring,
-    escape, escape_string, exit_without_destructors, get_ellipsis_char, get_is_multibyte_locale,
-    get_obfuscation_read_char, restore_term_foreground_process_group_for_exit, shell_modes,
-    write_loop,
+    escape, escape_string, exit_without_destructors, get_ellipsis_char, get_obfuscation_read_char,
+    restore_term_foreground_process_group_for_exit, shell_modes, write_loop,
 };
 use crate::complete::{
    CompleteFlags, Completion, CompletionList, CompletionRequestOptions, complete, complete_load,
@@ -3220,14 +3219,7 @@ fn handle_readline_command(&mut self, c: ReadlineCmd) {
                self.history_pager = Some(0..1);
                // Update the pager data.
                self.pager.set_search_field_shown(true);
-                self.pager.set_prefix(
-                    if get_is_multibyte_locale() {
-                        L!("► ")
-                    } else {
-                        L!("> ")
-                    },
-                    /*highlight=*/ false,
-                );
+                self.pager.set_prefix(L!("► "), false);
                // Update the search field, which triggers the actual history search.
                let search_string = if !self.history_search.active()
                    || self.history_search.search_string().is_empty()
--- a/src/tests/string_escape.rs
+++ b/src/tests/string_escape.rs
@@ -1,40 +1,11 @@
-use std::sync::MutexGuard;
-
 use crate::common::{
    ENCODE_DIRECT_BASE, ENCODE_DIRECT_END, EscapeFlags, EscapeStringStyle, UnescapeStringStyle,
-    bytes2wcstring, escape_string, fish_setlocale, unescape_string, wcs2bytes,
+    bytes2wcstring, escape_string, unescape_string, wcs2bytes,
 };
-use crate::locale::LOCALE_LOCK;
 use crate::util::{get_rng_seed, get_seeded_rng};
 use crate::wchar::{L, WString, wstr};
-use crate::wutil::encoding::{
-    AT_LEAST_MB_LEN_MAX, probe_is_multibyte_locale, wcrtomb, zero_mbstate,
-};
 use rand::{Rng, RngCore};

-/// wcs2bytes is locale-dependent, so ensure we have a multibyte locale
-/// before using it in a test.
-fn setlocale() -> MutexGuard<'static, ()> {
-    let guard = LOCALE_LOCK.lock().unwrap();
-
-    #[rustfmt::skip]
-    const UTF8_LOCALES: &[&str] = &[
-        "C.UTF-8", "en_US.UTF-8", "en_GB.UTF-8", "de_DE.UTF-8", "C.utf8", "UTF-8",
-    ];
-    if probe_is_multibyte_locale() {
-        return guard;
-    }
-    for locale in UTF8_LOCALES {
-        let locale = std::ffi::CString::new(locale.to_owned()).unwrap();
-        unsafe { libc::setlocale(libc::LC_CTYPE, locale.as_ptr()) };
-        if probe_is_multibyte_locale() {
-            fish_setlocale(); // Update cached locale information.
-            return guard;
-        }
-    }
-    panic!("No UTF-8 locale found");
-}
-
 #[test]
 fn test_escape_string() {
    let regex = |input| escape_string(input, EscapeStringStyle::Regex);
@@ -105,7 +76,6 @@ fn test_escape_var() {
 }

 fn escape_test(escape_style: EscapeStringStyle, unescape_style: UnescapeStringStyle) {
-    let _locale_guard = setlocale();
    let seed: u128 = 92348567983274852905629743984572;
    let mut rng = get_seeded_rng(seed);

@@ -185,7 +155,6 @@ fn bytes2hex(input: &[u8]) -> String {
 /// string comes back through double conversion.
 #[test]
 fn test_convert() {
-    let _locale_guard = setlocale();
    let seed = get_rng_seed();
    let mut rng = get_seeded_rng(seed);
    let mut origin = Vec::new();
@@ -241,30 +210,18 @@ fn test_convert_ascii() {
    }
 }

-/// fish uses the private-use range to encode bytes that could not be decoded using the
-/// user's locale. If the input could be decoded, but decoded to private-use codepoints,
-/// then fish should also use the direct encoding for those bytes. Verify that characters
-/// in the private use area are correctly round-tripped. See #7723.
+/// fish uses the private-use range to encode bytes that are not valid UTF-8.
+/// If the input decodes to these private-use codepoints,
+/// then fish should also use the direct encoding for those bytes.
+/// Verify that characters in the private use area are correctly round-tripped. See #7723.
 #[test]
 fn test_convert_private_use() {
    for c in ENCODE_DIRECT_BASE..ENCODE_DIRECT_END {
-        // Encode the char via the locale. Do not use fish functions which interpret these
-        // specially.
-        let mut converted = [0_u8; AT_LEAST_MB_LEN_MAX];
-        let mut state = zero_mbstate();
-        let len = unsafe {
-            wcrtomb(
-                std::ptr::addr_of_mut!(converted[0]).cast(),
-                c as u32,
-                &mut state,
-            )
-        };
-        if len == 0_usize.wrapping_sub(1) {
-            // Could not be encoded in this locale.
-            continue;
-        }
-        let s = &converted[..len];
-
+        // A `char` represents a Unicode scalar value, which takes up at most 4 bytes when encoded in UTF-8.
+        // TODO MSRV(1.92?) replace 4 by `char::MAX_LEN_UTF8` once that's available in our MSRV.
+        // https://doc.rust-lang.org/std/primitive.char.html#associatedconstant.MAX_LEN_UTF8
+        let mut converted = [0_u8; 4];
+        let s = c.encode_utf8(&mut converted).as_bytes();
        // Ask fish to decode this via bytes2wcstring.
        // bytes2wcstring should notice that the decoded form collides with its private use
        // and encode it directly.
--- a/src/wcstringutil.rs
+++ b/src/wcstringutil.rs
@@ -1,10 +1,8 @@
 //! Helper functions for working with wcstring.

-use crate::common::{get_ellipsis_char, get_ellipsis_str, get_is_multibyte_locale};
+use crate::common::{get_ellipsis_char, get_ellipsis_str};
 use crate::fallback::{fish_wcwidth, wcscasecmp, wcscasecmp_fuzzy};
-use crate::flog::FLOGF;
 use crate::wchar::{decode_byte_from_char, prelude::*};
-use crate::wutil::encoding::{AT_LEAST_MB_LEN_MAX, wcrtomb, zero_mbstate};

 /// Return the number of newlines in a string.
 pub fn count_newlines(s: &wstr) -> usize {
@@ -299,56 +297,28 @@ pub fn string_fuzzy_match_string(
 }

 /// Implementation of wcs2bytes that accepts a callback.
-/// This invokes `func` with (const char*, size_t) pairs.
+/// This invokes `func` with byte slices containing the UTF-8 encoding of the characters in the
+/// input, doing one invocation per character.
 /// If `func` returns false, it stops; otherwise it continues.
 /// Return false if the callback returned false, otherwise true.
 pub fn wcs2bytes_callback(input: &wstr, mut func: impl FnMut(&[u8]) -> bool) -> bool {
-    let mut state = zero_mbstate();
-    let mut converted = [0_u8; AT_LEAST_MB_LEN_MAX];
-
-    let is_singlebyte_locale = !get_is_multibyte_locale();
+    // A `char` represents a Unicode scalar value, which takes up at most 4 bytes when encoded in UTF-8.
+    let mut converted = [0_u8; 4];

    for c in input.chars() {
-        if let Some(byte) = decode_byte_from_char(c) {
+        let bytes = if let Some(byte) = decode_byte_from_char(c) {
            converted[0] = byte;
-            if !func(&converted[..1]) {
-                return false;
-            }
-        } else if is_singlebyte_locale {
-            // single-byte locale (C/POSIX/ISO-8859)
-            // If `c` contains a wide character we emit a question-mark.
-            converted[0] = u8::try_from(u32::from(c)).unwrap_or(b'?');
-            if !func(&converted[..1]) {
-                return false;
-            }
+            &converted[..=0]
        } else {
-            converted = [0; AT_LEAST_MB_LEN_MAX];
-            let len = unsafe {
-                wcrtomb(
-                    std::ptr::addr_of_mut!(converted[0]).cast(),
-                    c as u32,
-                    &mut state,
-                )
-            };
-            if len == 0_usize.wrapping_sub(1) {
-                wcs2bytes_bad_char(c);
-                state = zero_mbstate();
-            } else if !func(&converted[..len]) {
-                return false;
-            }
+            c.encode_utf8(&mut converted).as_bytes()
+        };
+        if !func(bytes) {
+            return false;
        }
    }
    true
 }

-fn wcs2bytes_bad_char(c: char) {
-    FLOGF!(
-        char_encoding,
-        L!("Wide character U+%4X has no narrow representation"),
-        c
-    );
-}
-
 /// Split a string by runs of any of the separator characters provided in `seps`.
 /// Note the delimiters are the characters in `seps`, not `seps` itself.
 /// `seps` may contain the NUL character.
--- a/src/wutil/encoding.rs
+++ b/src/wutil/encoding.rs
@@ -1,50 +0,0 @@
-extern "C" {
-    #[cfg_attr(cygwin, link_name = "c32rtomb")]
-    pub fn wcrtomb(s: *mut libc::c_char, wc: u32, ps: *mut mbstate_t) -> usize;
-    #[cfg_attr(cygwin, link_name = "mbrtoc32")]
-    pub fn mbrtowc(pwc: *mut u32, s: *const libc::c_char, n: usize, p: *mut mbstate_t) -> usize;
-}
-
-// HACK This should be mbstate_t from libc but that's not exposed.  Since it's only written by
-// libc, we define it as opaque type that should be large enough for all implementations.
-pub type mbstate_t = [u64; 16];
-
-#[inline]
-pub fn zero_mbstate() -> mbstate_t {
-    [0; 16]
-}
-
-// HACK This should be the MB_LEN_MAX macro from libc but that's not easy to get.
-pub const AT_LEAST_MB_LEN_MAX: usize = 32;
-
-/// Return true if we believe we are in a multibyte locale.
-/// Note this reads the current locale and is modestly expensive - prefer the cached
-/// values in `common.rs` which is set by `fish_setlocale`.
-pub fn probe_is_multibyte_locale() -> bool {
-    // In general we would like to read MB_CUR_MAX, but that is not exposed by Rust libc.
-    // Instead, check if mbrtowc for any byte in the range 0-255 returns (size_t)(-2) which indicates
-    // the presence of a multibyte locale.
-    #[inline]
-    fn is_mb_lead(b: u8) -> bool {
-        let mut st = zero_mbstate();
-        let mut wc: libc::wchar_t = 0;
-        let c = b as libc::c_char;
-        let n = unsafe {
-            mbrtowc(
-                std::ptr::addr_of_mut!(wc).cast::<u32>(),
-                std::ptr::addr_of!(c),
-                1,
-                std::ptr::addr_of_mut!(st),
-            )
-        };
-        n == (-2_i64 as libc::size_t)
-    }
-
-    // Fast path: check common lead bytes.
-    if is_mb_lead(0xE0) || is_mb_lead(0xC2) {
-        return true;
-    }
-
-    // Scan non-ASCII high bytes.
-    (0x80_u8..=0xFF).any(is_mb_lead)
-}
--- a/src/wutil/mod.rs
+++ b/src/wutil/mod.rs
@@ -1,5 +1,4 @@
 pub mod dir_iter;
-pub mod encoding;
 pub mod errors;
 pub mod fileid;
 pub mod gettext;
--- a/tests/checks/locale.fish
+++ b/tests/checks/locale.fish
@@ -3,10 +3,6 @@
 # see #7934.
 #REQUIRES: test -z "$GITHUB_WORKFLOW"

-# We typically try to force a utf8-capable locale,
-# this turns that off.
-set -gx fish_allow_singlebyte_locale 1
-
 # A function to display bytes, necessary because GNU and BSD implementations of `od` have different output.
 # We used to use xxd, but it's not available everywhere. See #3797.
 #
@@ -22,73 +18,12 @@ echo -n A\u00FCA | display_bytes
 #CHECK: 0000000 101 303 274 101
 #CHECK: 0000004

-# Verify that exporting a change to the C locale produces the expected output.
-# The output should include the literal byte \xFC rather than the UTF-8 sequence for \u00FC.
-begin
-    set -lx LC_ALL C
-    echo -n B\u00FCB | display_bytes
-end
-#CHECK: 0000000 102 374 102
-#CHECK: 0000003
-
 # Since the previous change was localized to a block it should no
 # longer be in effect and we should be back to a UTF-8 locale.
 echo -n C\u00FCC | display_bytes
 #CHECK: 0000000 103 303 274 103
 #CHECK: 0000004

-# Verify that setting a non-exported locale var doesn't affect the behavior.
-# The output should include the UTF-8 sequence for \u00FC rather than that literal byte.
-# Just like the previous test.
-begin
-    set -l LC_ALL C
-    echo -n D\u00FCD | display_bytes
-end
-#CHECK: 0000000 104 303 274 104
-#CHECK: 0000004
-
-# Verify that fish can pass through non-ASCII characters in the C/POSIX
-# locale. This is to prevent regression of
-# https://github.com/fish-shell/fish-shell/issues/2802.
-#
-# These tests are needed because the relevant standards allow the functions
-# mbrtowc() and wcrtomb() to treat bytes with the high bit set as either valid
-# or invalid in the C/POSIX locales. GNU libc treats those bytes as invalid.
-# Other libc implementations (e.g., BSD) treat them as valid. We want fish to
-# always treat those bytes as valid.
-
-# The fish in the middle of the pipeline should be receiving a UTF-8 encoded
-# version of the unicode from the echo. It should pass those bytes thru
-# literally since it is in the C locale. We verify this by first passing the
-# echo output directly to the `xxd` program then via a fish instance. The
-# output should be "58c3bb58" for the first statement and "58c3bc58" for the
-# second.
-echo -n X\u00FBX | display_bytes
-echo X\u00FCX | env LC_ALL=C $fish -c 'read foo; echo -n $foo' | display_bytes
-#CHECK: 0000000 130 303 273 130
-#CHECK: 0000004
-#CHECK: 0000000 130 303 274 130
-#CHECK: 0000004
-
-# The next tests deliberately spawn another fish instance to test inheritance of env vars.
-
-# This test is subtle. Despite the presence of the \u00fc unicode char (a "u"
-# with an umlaut) the fact the locale is C/POSIX will cause the \xfc byte to
-# be emitted rather than the usual UTF-8 sequence \xc3\xbc. That's because the
-# few single-byte unicode chars (that are not ASCII) are generally in the
-# ISO 8859-x char sets which are encompassed by the C locale. The output should
-# be "59fc59".
-env LC_ALL=C $fish -c 'echo -n Y\u00FCY' | display_bytes
-#CHECK: 0000000 131 374 131
-#CHECK: 0000003
-
-# The user can specify a wide unicode character (one requiring more than a
-# single byte). In the C/POSIX locales we substitute a question-mark for the
-# unencodable wide char. The output should be "543f54".
-env LC_ALL=C $fish -c 'echo -n T\u01FDT' | display_bytes
-#CHECK: 0000000 124 077 124
-#CHECK: 0000003
-
 string match ö \Xc3\Xb6
 #CHECK: ö