encoding: use UTF-8 everywhere

Assume that UTF-8 is used everywhere. This allows for significant
simplification of encoding-related functionality. We no longer need
branching on single-byte vs. multi-byte locales, and we can get rid of
all the libc calls for encoding and decoding, replacing them with Rust's
built-in functionality, or removing them without replacement in cases
where their functionality is no longer needed.

Several tests are removed from `tests/checks/locale.fish`, since setting
the locale no longer impacts encoding behavior. We might want more
rigorous testing of UTF-8 handling instead.

Closes #11975
This commit is contained in:
Daniel Rainer
2025-10-18 23:09:10 +02:00
committed by Johannes Altmanninger
parent 755d5ae222
commit c8001b5023
12 changed files with 136 additions and 497 deletions

View File

@@ -9,6 +9,8 @@ Notable improvements and fixes
Deprecations and removed features
---------------------------------
- Fish now assumes UTF-8 everywhere, regardless of locale settings. Input bytes which are not valid UTF-8 should still be round-tripped correctly.
Interactive improvements
------------------------
- :doc:`fish_config prompt {choose,save} <cmds/fish_config>` have been taught to reset :doc:`fish_mode_prompt <cmds/fish_mode_prompt>` in addition to the other prompt functions (:issue:`11937`).

View File

@@ -13,7 +13,7 @@
use crate::env::{EnvVar, EnvVarFlags};
use crate::input_common::DecodeState;
use crate::input_common::InvalidPolicy;
use crate::input_common::decode_input_byte;
use crate::input_common::decode_one_codepoint_utf8;
use crate::nix::isatty;
use crate::reader::ReaderConfig;
use crate::reader::commandline_set_buffer;
@@ -27,7 +27,6 @@
use crate::wcstringutil::split_about;
use crate::wcstringutil::split_string_tok;
use crate::wutil;
use crate::wutil::encoding::zero_mbstate;
use crate::wutil::perror;
use libc::SEEK_CUR;
use std::num::NonZeroUsize;
@@ -389,8 +388,6 @@ fn read_one_char_at_a_time(
let mut unconsumed = vec![];
loop {
let mut state = zero_mbstate();
let chars_read = buff.len();
let res = loop {
let mut b = [0_u8; 1];
@@ -400,17 +397,9 @@ fn read_one_char_at_a_time(
}
_ => {}
}
let b = b[0];
unconsumed.push(b);
unconsumed.push(b[0]);
nbytes += 1;
let mut consumed = 0;
match decode_input_byte(
buff,
InvalidPolicy::Passthrough,
&mut state,
&unconsumed,
&mut consumed,
) {
match decode_one_codepoint_utf8(buff, InvalidPolicy::Passthrough, &unconsumed) {
DecodeState::Incomplete => continue,
DecodeState::Complete => {
unconsumed.clear();

View File

@@ -16,9 +16,6 @@
use crate::wchar::{decode_byte_from_char, encode_byte_to_char, prelude::*};
use crate::wcstringutil::wcs2bytes_callback;
use crate::wildcard::{ANY_CHAR, ANY_STRING, ANY_STRING_RECURSIVE};
use crate::wutil::encoding::{
AT_LEAST_MB_LEN_MAX, mbrtowc, probe_is_multibyte_locale, wcrtomb, zero_mbstate,
};
use crate::wutil::fish_iswalnum;
use bitflags::bitflags;
use libc::{SIG_IGN, SIGTTOU, STDIN_FILENO};
@@ -190,7 +187,7 @@ fn escape_string_script(input: &wstr, flags: EscapeFlags) -> WString {
let no_quoted = flags.contains(EscapeFlags::NO_QUOTED);
let no_tilde = flags.contains(EscapeFlags::NO_TILDE);
let no_qmark = feature_test(FeatureFlag::qmark_noglob);
let symbolic = flags.contains(EscapeFlags::SYMBOLIC) && get_is_multibyte_locale();
let symbolic = flags.contains(EscapeFlags::SYMBOLIC);
assert!(
!symbolic || !escape_printables,
@@ -1035,19 +1032,15 @@ pub fn shell_modes() -> MutexGuard<'static, libc::termios> {
/// The character to use where the text has been truncated. This is always the horizontal
/// ellipsis (U+2026), since UTF-8 is assumed everywhere.
pub fn get_ellipsis_char() -> char {
char::from_u32(ELLIPSIS_CHAR.load(Ordering::Relaxed)).unwrap()
'\u{2026}'
}
static ELLIPSIS_CHAR: AtomicU32 = AtomicU32::new(0);
/// The string to use where text has been truncated. This is always the horizontal ellipsis
/// (U+2026), since UTF-8 is assumed everywhere.
pub fn get_ellipsis_str() -> &'static wstr {
ELLIPSIS_STRING.load()
L!("\u{2026}")
}
static ELLIPSIS_STRING: AtomicRef<wstr> = AtomicRef::new(&L!(""));
/// Character representing an omitted newline at the end of text.
pub fn get_omitted_newline_str() -> &'static wstr {
OMITTED_NEWLINE_STR.load()
@@ -1065,13 +1058,6 @@ pub fn get_obfuscation_read_char() -> char {
char::from_u32(OBFUSCATION_READ_CHAR.load(Ordering::Relaxed)).unwrap()
}
static IS_MB_LOCALE: RelaxedAtomicBool = RelaxedAtomicBool::new(false);
/// Whether we believe we are in a multibyte locale.
pub fn get_is_multibyte_locale() -> bool {
IS_MB_LOCALE.load()
}
/// Profiling flag. True if commands should be profiled.
pub static PROFILING_ACTIVE: RelaxedAtomicBool = RelaxedAtomicBool::new(false);
@@ -1102,84 +1088,54 @@ pub fn has_working_tty_timestamps() -> bool {
/// todo!("Maybe remove the box? It is only needed for get_bg_context.")
pub type CancelChecker = Box<dyn Fn() -> bool>;
/// Converts the narrow character string \c in into its wide equivalent, and return it.
///
/// The string may contain embedded nulls.
///
/// This function encodes illegal character sequences in a reversible way using the private use
/// area.
pub fn bytes2wcstring(inp: &[u8]) -> WString {
if inp.is_empty() {
/// Encodes the bytes in `input` into a [`WString`], encoding non-UTF-8 bytes into private-use-area
/// code-points. Bytes which would be parsed into our reserved PUA range are encoded individually,
/// to allow for correct round-tripping.
pub fn bytes2wcstring(mut input: &[u8]) -> WString {
if input.is_empty() {
return WString::new();
}
let mut result = WString::new();
result.reserve(inp.len());
let mut pos = 0;
let mut state = zero_mbstate();
while pos < inp.len() {
// Append any initial sequence of ascii characters.
// Note we do not support character sets which are not supersets of ASCII.
let ascii_prefix_length = count_ascii_prefix(&inp[pos..]);
result.push_str(std::str::from_utf8(&inp[pos..pos + ascii_prefix_length]).unwrap());
pos += ascii_prefix_length;
assert!(pos <= inp.len(), "Position overflowed length");
if pos == inp.len() {
break;
}
// We have found a non-ASCII character.
let mut ret = 0;
let mut c = '\0';
let use_encode_direct = if inp[pos] & 0xF8 == 0xF8 {
// Protect against broken mbrtowc() implementations which attempt to encode UTF-8
// sequences longer than four bytes (e.g., OS X Snow Leopard).
// TODO This check used to be conditionally compiled only on affected platforms.
true
} else {
let mut codepoint = u32::from(c);
ret = unsafe {
mbrtowc(
std::ptr::addr_of_mut!(codepoint),
std::ptr::addr_of!(inp[pos]).cast(),
inp.len() - pos,
&mut state,
)
};
match char::from_u32(codepoint) {
Some(codepoint) => {
c = codepoint;
// Determine whether to encode this character with our crazy scheme.
fish_reserved_codepoint(c)
||
// Incomplete sequence.
ret == 0_usize.wrapping_sub(2)
||
// Invalid data.
ret == 0_usize.wrapping_sub(1)
||
// Other error codes? Terrifying, should never happen.
ret > inp.len() - pos
fn append_escaped_str(output: &mut WString, input: &str) {
for (i, c) in input.char_indices() {
if fish_reserved_codepoint(c) {
for byte in &input.as_bytes()[i..i + c.len_utf8()] {
output.push(encode_byte_to_char(*byte));
}
None => true,
} else {
output.push(c);
}
};
}
}
if use_encode_direct {
c = encode_byte_to_char(inp[pos]);
result.push(c);
pos += 1;
state = zero_mbstate();
} else if ret == 0 {
// embedded null byte!
result.push('\0');
pos += 1;
state = zero_mbstate();
} else {
// normal case
result.push(c);
pos += ret;
while !input.is_empty() {
match std::str::from_utf8(input) {
Ok(parsed_str) => {
append_escaped_str(&mut result, parsed_str);
// The entire remaining input could be parsed, so we are done.
break;
}
Err(e) => {
let (valid, after_valid) = input.split_at(e.valid_up_to());
// SAFETY: The previous `str::from_utf8` call established that the prefix `valid`
// is valid UTF-8. This prefix may be empty.
let parsed_str = unsafe { std::str::from_utf8_unchecked(valid) };
append_escaped_str(&mut result, parsed_str);
// The length of the prefix of `after_valid` which is invalid UTF-8.
// The remaining bytes of `input` (if any) will be parsed in subsequent iterations
// of the loop, starting from the first byte that starts a valid UTF-8-encoded codepoint.
// `error_len` can return `None`, if it sees a byte sequence that could be the
// prefix of a valid code-point encoding at the end of the byte slice.
// This is useful when the input is chunked, but we don't do that, so in this case
// we use our custom encoding for all remaining bytes (at most 3).
let error_len = e.error_len().unwrap_or(after_valid.len());
for byte in &after_valid[..error_len] {
result.push(encode_byte_to_char(*byte));
}
input = &after_valid[error_len..];
}
}
}
result
@@ -1265,12 +1221,6 @@ pub fn wcs2bytes_appending(output: &mut Vec<u8>, input: &wstr) {
});
}
/// Return the count of initial characters in `in` which are ASCII.
fn count_ascii_prefix(inp: &[u8]) -> usize {
// The C++ version had manual vectorization.
inp.iter().take_while(|c| c.is_ascii()).count()
}
// Check if we are running in the test mode, where we should suppress error output
pub const TESTS_PROGRAM_NAME: &wstr = L!("(ignore)");
@@ -1302,22 +1252,6 @@ macro_rules! LL {
}};
}
// Mark if we are a multibyte locale.
IS_MB_LOCALE.store(probe_is_multibyte_locale());
// Use various Unicode symbols if they can be encoded using the current locale, else a simple
// ASCII char alternative. All of the can_be_encoded() invocations should return the same
// true/false value since the code points are in the BMP but we're going to be paranoid. This
// is also technically wrong if we're not in a Unicode locale but we expect (or hope)
// can_be_encoded() will return false in that case.
if can_be_encoded('\u{2026}') {
ELLIPSIS_CHAR.store(u32::from('\u{2026}'), Ordering::Relaxed);
ELLIPSIS_STRING.store(LL!("\u{2026}"));
} else {
ELLIPSIS_CHAR.store(u32::from('$'), Ordering::Relaxed); // "horizontal ellipsis"
ELLIPSIS_STRING.store(LL!("..."));
}
if is_windows_subsystem_for_linux(WSL::Any) {
// neither of \u23CE and \u25CF can be displayed in the default fonts on Windows, though
// they can be *encoded* just fine. Use alternative glyphs.
@@ -1327,29 +1261,16 @@ macro_rules! LL {
OMITTED_NEWLINE_STR.store(LL!("^J"));
OBFUSCATION_READ_CHAR.store(u32::from('*'), Ordering::Relaxed);
} else {
if can_be_encoded('\u{23CE}') {
OMITTED_NEWLINE_STR.store(LL!("\u{23CE}")); // "return symbol" (⏎)
} else {
OMITTED_NEWLINE_STR.store(LL!("^J"));
}
OMITTED_NEWLINE_STR.store(LL!("\u{23CE}")); // "return symbol" (⏎)
OBFUSCATION_READ_CHAR.store(
u32::from(if can_be_encoded('\u{25CF}') {
'\u{25CF}' // "black circle"
} else {
'#'
}),
u32::from(
'\u{25CF}', // "black circle"
),
Ordering::Relaxed,
);
}
}
/// Test if the character can be encoded using the current locale.
fn can_be_encoded(wc: char) -> bool {
let mut converted = [0 as libc::c_char; AT_LEAST_MB_LEN_MAX];
let mut state = zero_mbstate();
unsafe { wcrtomb(converted.as_mut_ptr(), wc as u32, &mut state) != 0_usize.wrapping_sub(1) }
}
/// Call read, blocking and repeating on EINTR. Exits on EAGAIN.
/// Return the number of bytes read, or 0 on EOF, or an error.
pub fn read_blocked(fd: RawFd, buf: &mut [u8]) -> nix::Result<usize> {

View File

@@ -14,7 +14,6 @@
use crate::terminal::use_terminfo;
use crate::tty_handoff::xtversion;
use crate::wchar::prelude::*;
use crate::wutil::encoding::probe_is_multibyte_locale;
use crate::wutil::fish_wcstoi;
use crate::{function, terminal};
use std::borrow::Cow;
@@ -25,12 +24,16 @@
/// List of all locale environment variable names that might trigger (re)initializing of the locale
/// subsystem. These are only the variables we're possibly interested in.
#[rustfmt::skip]
const LOCALE_VARIABLES: [&wstr; 10] = [
L!("LANG"), L!("LANGUAGE"), L!("LC_ALL"),
L!("LC_COLLATE"), L!("LC_CTYPE"), L!("LC_MESSAGES"),
L!("LC_NUMERIC"), L!("LC_TIME"), L!("LOCPATH"),
L!("fish_allow_singlebyte_locale"),
const LOCALE_VARIABLES: [&wstr; 9] = [
L!("LANG"),
L!("LANGUAGE"),
L!("LC_ALL"),
L!("LC_COLLATE"),
L!("LC_CTYPE"),
L!("LC_MESSAGES"),
L!("LC_NUMERIC"),
L!("LC_TIME"),
L!("LOCPATH"),
];
#[rustfmt::skip]
@@ -299,8 +302,6 @@ fn handle_tz_change(var_name: &wstr, vars: &EnvStack) {
fn handle_locale_change(vars: &EnvStack) {
init_locale(vars);
// We need to re-guess emoji width because the locale might have changed to a multibyte one.
guess_emoji_width(vars);
}
fn handle_term_change(vars: &EnvStack) {
@@ -502,11 +503,6 @@ pub fn read_terminfo_database(vars: &EnvStack) {
fn init_locale(vars: &EnvStack) {
let _guard = crate::locale::LOCALE_LOCK.lock().unwrap();
#[rustfmt::skip]
const UTF8_LOCALES: &[&str] = &[
"C.UTF-8", "en_US.UTF-8", "en_GB.UTF-8", "de_DE.UTF-8", "C.utf8", "UTF-8",
];
let old_msg_locale: CString = {
let old = unsafe { libc::setlocale(libc::LC_MESSAGES, ptr::null()) };
assert_ne!(old, ptr::null_mut());
@@ -541,33 +537,6 @@ fn init_locale(vars: &EnvStack) {
}
};
// Try to get a multibyte-capable encoding.
// A "C" locale is broken for our purposes: any wchar function will break on it. So we try
// *really, really, really hard* to not have one.
let fix_locale = vars
.get_unless_empty(L!("fish_allow_singlebyte_locale"))
.map(|v| v.as_string())
.map(|allow_c| !crate::wcstringutil::bool_from_string(&allow_c))
.unwrap_or(true);
if fix_locale && !probe_is_multibyte_locale() {
FLOG!(env_locale, "Have single byte locale, trying to fix.");
let mut fixed = false;
for locale in UTF8_LOCALES {
let locale_cstr = CString::new(*locale).unwrap();
// this can fail, that is fine
unsafe { libc::setlocale(libc::LC_CTYPE, locale_cstr.as_ptr()) };
if probe_is_multibyte_locale() {
FLOG!(env_locale, "Fixed locale:", locale);
fixed = true;
break;
}
}
if !fixed {
FLOG!(env_locale, "Failed to fix locale.");
}
}
// We *always* use a C-locale for numbers because we want '.' (except for in printf).
let loc_ptr = unsafe { libc::setlocale(libc::LC_NUMERIC, c"C".as_ptr().cast()) };
// should never fail, the C locale should always be defined

View File

@@ -1,6 +1,6 @@
use crate::common::{
WSL, bytes2wcstring, fish_reserved_codepoint, get_is_multibyte_locale,
is_windows_subsystem_for_linux, read_blocked, shell_modes,
WSL, bytes2wcstring, fish_reserved_codepoint, is_windows_subsystem_for_linux, read_blocked,
shell_modes,
};
use crate::env::{EnvStack, Environment};
use crate::fd_readable_set::{FdReadableSet, Timeout};
@@ -17,7 +17,6 @@
};
use crate::universal_notifier::default_notifier;
use crate::wchar::{encode_byte_to_char, prelude::*};
use crate::wutil::encoding::{mbrtowc, mbstate_t, zero_mbstate};
use crate::wutil::{fish_is_pua, fish_wcstol};
use std::cell::{RefCell, RefMut};
use std::collections::VecDeque;
@@ -826,9 +825,7 @@ fn try_pop(&mut self) -> Option<CharEvent> {
self.get_input_data_mut().queue.pop_front()
}
/// Function used by [`readch`](Self::readch) to read bytes from stdin until enough bytes have been read to
/// convert them to a wchar_t. Conversion is done using mbrtowc. If a character has previously
/// been read and then 'unread' using \c input_common_unreadch, that character is returned.
/// Read the next event, such as a UTF-8-encoded codepoint.
fn readch(&mut self) -> CharEvent {
loop {
// Do we have something enqueued already?
@@ -871,7 +868,7 @@ fn readch(&mut self) -> CharEvent {
InputEventTrigger::Byte(read_byte) => {
let mut have_escape_prefix = false;
let mut buffer = vec![read_byte];
let key_with_escape = if read_byte == 0x1b {
let mut key = if read_byte == 0x1b {
self.parse_escape_sequence(&mut buffer, &mut have_escape_prefix)
} else {
canonicalize_control_char(read_byte).map(KeyEvent::from)
@@ -883,47 +880,35 @@ fn readch(&mut self) -> CharEvent {
continue;
}
let mut seq = WString::new();
let mut key = key_with_escape;
if key.is_some_and(|key| key.key == Key::from_raw(key::Invalid)) {
continue;
}
assert!(key.is_none_or(|key| key.codepoint != key::Invalid));
let mut consumed = 0;
let mut state = zero_mbstate();
let mut i = 0;
// At this point, the bytes in `buffer` should be parsed as a UTF-8 sequence,
// or, if they are not valid UTF-8, ignored. On incomplete sequences, another
// byte is read and decoding is tried again in the next iteration.
let ok = loop {
if i == buffer.len() {
buffer.push(
match next_input_event(self.get_in_fd(), Timeout::Forever) {
InputEventTrigger::Byte(b) => b,
_ => 0,
},
);
}
match decode_input_byte(
&mut seq,
InvalidPolicy::Error,
&mut state,
&buffer[..i + 1],
&mut consumed,
) {
DecodeState::Incomplete => (),
match decode_one_codepoint_utf8(&mut seq, InvalidPolicy::Error, &buffer) {
DecodeState::Incomplete => {
buffer.push(
match next_input_event(self.get_in_fd(), Timeout::Forever) {
InputEventTrigger::Byte(b) => b,
_ => 0,
},
);
}
DecodeState::Complete => {
if have_escape_prefix && i != 0 {
have_escape_prefix = false;
if have_escape_prefix {
let c = seq.as_char_slice().last().unwrap();
key = Some(KeyEvent::from(alt(*c)));
}
if i + 1 == buffer.len() {
break true;
}
break true;
}
DecodeState::Error => {
self.push_front(CharEvent::from_check_exit());
break false;
}
}
i += 1;
};
if !ok {
continue;
@@ -1686,63 +1671,37 @@ pub(crate) enum InvalidPolicy {
Passthrough,
}
pub(crate) fn decode_input_byte(
pub(crate) fn decode_one_codepoint_utf8(
out_seq: &mut WString,
invalid_policy: InvalidPolicy,
state: &mut mbstate_t,
buffer: &[u8],
consumed: &mut usize,
) -> DecodeState {
use DecodeState::*;
let mut res: char = '\0';
let read_byte = *buffer.last().unwrap();
if !get_is_multibyte_locale() {
// single-byte locale, all values are legal
res = read_byte.into();
out_seq.push(res);
return Complete;
}
let mut invalid = |out_seq: &mut WString, log_error: fn()| match invalid_policy {
InvalidPolicy::Error => {
(log_error)();
Error
}
InvalidPolicy::Passthrough => {
for &b in &buffer[*consumed..] {
out_seq.push(encode_byte_to_char(b));
match std::str::from_utf8(buffer) {
Ok(parsed_str) => {
for c in parsed_str.chars() {
if !fish_reserved_codepoint(c) {
out_seq.push(c);
}
}
*consumed = buffer.len();
Complete
}
};
let mut codepoint = u32::from(res);
match unsafe {
mbrtowc(
std::ptr::addr_of_mut!(codepoint),
std::ptr::addr_of!(read_byte).cast(),
1,
state,
)
} as isize
{
-1 => {
return invalid(out_seq, || FLOG!(reader, "Illegal input encoding"));
}
-2 => {
// Sequence not yet complete.
return Incomplete;
}
_ => (),
Err(e) => match e.error_len() {
Some(_) => match invalid_policy {
InvalidPolicy::Error => {
FLOG!(reader, "Illegal input encoding");
Error
}
InvalidPolicy::Passthrough => {
for &b in buffer {
out_seq.push(encode_byte_to_char(b));
}
Complete
}
},
None => Incomplete,
},
}
if let Some(res) = char::from_u32(codepoint) {
// Sequence complete.
if !fish_reserved_codepoint(res) {
*consumed += 1;
out_seq.push(res);
return Complete;
}
}
invalid(out_seq, || FLOG!(reader, "Illegal codepoint"))
}
pub(crate) fn stop_query(mut query: RefMut<'_, Option<TerminalQuery>>) -> bool {

View File

@@ -5,7 +5,6 @@
use crate::common::{
EscapeFlags, EscapeStringStyle, escape_string, get_ellipsis_char, get_ellipsis_str,
get_is_multibyte_locale,
};
use crate::complete::Completion;
use crate::editable_line::EditableLine;
@@ -1231,19 +1230,16 @@ fn process_completions_into_infos(lst: &[Completion]) -> Vec<PagerComp> {
EscapeFlags::NO_PRINTABLES | EscapeFlags::NO_QUOTED | EscapeFlags::SYMBOLIC,
),
));
if comp.replaces_line()
// HACK We want to render a full shell command, with syntax highlighting. Above we
// escape nonprintables, which might make the rendered command longer than the original
// completion. In that case we get wrong colors. However this should only happen in
// contrived cases, since our symbolic escaping uses a single character to represent
// newline and tab characters; other nonprintables are extremely rare in a command
// line. It will only be common for single-byte locales where we don't
// use Unicode characters for escaping, so just disable those here.
// We should probably fix this by first highlighting the original completion, and
// then writing a variant of escape_string() that adjusts highlighting according
// so it matches the escaped string.
&& get_is_multibyte_locale()
{
// HACK We want to render a full shell command, with syntax highlighting. Above we
// escape nonprintables, which might make the rendered command longer than the original
// completion. In that case we get wrong colors. However this should only happen in
// contrived cases, since our symbolic escaping uses a single character to represent
// newline and tab characters; other nonprintables are extremely rare in a command
// line.
// We should probably fix this by first highlighting the original completion, and
// then writing a variant of escape_string() that adjusts highlighting accordingly,
// so that it matches the escaped string.
if comp.replaces_line() {
highlight_shell(
&comp.completion,
&mut comp_info.colors,

View File

@@ -55,9 +55,8 @@
use crate::common::ScopeGuarding;
use crate::common::{
EscapeFlags, EscapeStringStyle, PROGRAM_NAME, ScopeGuard, UTF8_BOM_WCHAR, bytes2wcstring,
escape, escape_string, exit_without_destructors, get_ellipsis_char, get_is_multibyte_locale,
get_obfuscation_read_char, restore_term_foreground_process_group_for_exit, shell_modes,
write_loop,
escape, escape_string, exit_without_destructors, get_ellipsis_char, get_obfuscation_read_char,
restore_term_foreground_process_group_for_exit, shell_modes, write_loop,
};
use crate::complete::{
CompleteFlags, Completion, CompletionList, CompletionRequestOptions, complete, complete_load,
@@ -3220,14 +3219,7 @@ fn handle_readline_command(&mut self, c: ReadlineCmd) {
self.history_pager = Some(0..1);
// Update the pager data.
self.pager.set_search_field_shown(true);
self.pager.set_prefix(
if get_is_multibyte_locale() {
L!("")
} else {
L!("> ")
},
/*highlight=*/ false,
);
self.pager.set_prefix(L!(""), false);
// Update the search field, which triggers the actual history search.
let search_string = if !self.history_search.active()
|| self.history_search.search_string().is_empty()

View File

@@ -1,40 +1,11 @@
use std::sync::MutexGuard;
use crate::common::{
ENCODE_DIRECT_BASE, ENCODE_DIRECT_END, EscapeFlags, EscapeStringStyle, UnescapeStringStyle,
bytes2wcstring, escape_string, fish_setlocale, unescape_string, wcs2bytes,
bytes2wcstring, escape_string, unescape_string, wcs2bytes,
};
use crate::locale::LOCALE_LOCK;
use crate::util::{get_rng_seed, get_seeded_rng};
use crate::wchar::{L, WString, wstr};
use crate::wutil::encoding::{
AT_LEAST_MB_LEN_MAX, probe_is_multibyte_locale, wcrtomb, zero_mbstate,
};
use rand::{Rng, RngCore};
/// wcs2bytes is locale-dependent, so ensure we have a multibyte locale
/// before using it in a test.
fn setlocale() -> MutexGuard<'static, ()> {
let guard = LOCALE_LOCK.lock().unwrap();
#[rustfmt::skip]
const UTF8_LOCALES: &[&str] = &[
"C.UTF-8", "en_US.UTF-8", "en_GB.UTF-8", "de_DE.UTF-8", "C.utf8", "UTF-8",
];
if probe_is_multibyte_locale() {
return guard;
}
for locale in UTF8_LOCALES {
let locale = std::ffi::CString::new(locale.to_owned()).unwrap();
unsafe { libc::setlocale(libc::LC_CTYPE, locale.as_ptr()) };
if probe_is_multibyte_locale() {
fish_setlocale(); // Update cached locale information.
return guard;
}
}
panic!("No UTF-8 locale found");
}
#[test]
fn test_escape_string() {
let regex = |input| escape_string(input, EscapeStringStyle::Regex);
@@ -105,7 +76,6 @@ fn test_escape_var() {
}
fn escape_test(escape_style: EscapeStringStyle, unescape_style: UnescapeStringStyle) {
let _locale_guard = setlocale();
let seed: u128 = 92348567983274852905629743984572;
let mut rng = get_seeded_rng(seed);
@@ -185,7 +155,6 @@ fn bytes2hex(input: &[u8]) -> String {
/// string comes back through double conversion.
#[test]
fn test_convert() {
let _locale_guard = setlocale();
let seed = get_rng_seed();
let mut rng = get_seeded_rng(seed);
let mut origin = Vec::new();
@@ -241,30 +210,18 @@ fn test_convert_ascii() {
}
}
/// fish uses the private-use range to encode bytes that could not be decoded using the
/// user's locale. If the input could be decoded, but decoded to private-use codepoints,
/// then fish should also use the direct encoding for those bytes. Verify that characters
/// in the private use area are correctly round-tripped. See #7723.
/// fish uses the private-use range to encode bytes that are not valid UTF-8.
/// If the input decodes to these private-use codepoints,
/// then fish should also use the direct encoding for those bytes.
/// Verify that characters in the private use area are correctly round-tripped. See #7723.
#[test]
fn test_convert_private_use() {
for c in ENCODE_DIRECT_BASE..ENCODE_DIRECT_END {
// Encode the char via the locale. Do not use fish functions which interpret these
// specially.
let mut converted = [0_u8; AT_LEAST_MB_LEN_MAX];
let mut state = zero_mbstate();
let len = unsafe {
wcrtomb(
std::ptr::addr_of_mut!(converted[0]).cast(),
c as u32,
&mut state,
)
};
if len == 0_usize.wrapping_sub(1) {
// Could not be encoded in this locale.
continue;
}
let s = &converted[..len];
// A `char` represents a Unicode scalar value, which takes up at most 4 bytes when encoded in UTF-8.
// TODO MSRV(1.92?) replace 4 by `char::MAX_LEN_UTF8` once that's available in our MSRV.
// https://doc.rust-lang.org/std/primitive.char.html#associatedconstant.MAX_LEN_UTF8
let mut converted = [0_u8; 4];
let s = c.encode_utf8(&mut converted).as_bytes();
// Ask fish to decode this via bytes2wcstring.
// bytes2wcstring should notice that the decoded form collides with its private use
// and encode it directly.

View File

@@ -1,10 +1,8 @@
//! Helper functions for working with wcstring.
use crate::common::{get_ellipsis_char, get_ellipsis_str, get_is_multibyte_locale};
use crate::common::{get_ellipsis_char, get_ellipsis_str};
use crate::fallback::{fish_wcwidth, wcscasecmp, wcscasecmp_fuzzy};
use crate::flog::FLOGF;
use crate::wchar::{decode_byte_from_char, prelude::*};
use crate::wutil::encoding::{AT_LEAST_MB_LEN_MAX, wcrtomb, zero_mbstate};
/// Return the number of newlines in a string.
pub fn count_newlines(s: &wstr) -> usize {
@@ -299,56 +297,28 @@ pub fn string_fuzzy_match_string(
}
/// Implementation of wcs2bytes that accepts a callback.
/// This invokes `func` with (const char*, size_t) pairs.
/// This invokes `func` with byte slices containing the UTF-8 encoding of the characters in the
/// input, doing one invocation per character.
/// If `func` returns false, it stops; otherwise it continues.
/// Return false if the callback returned false, otherwise true.
pub fn wcs2bytes_callback(input: &wstr, mut func: impl FnMut(&[u8]) -> bool) -> bool {
let mut state = zero_mbstate();
let mut converted = [0_u8; AT_LEAST_MB_LEN_MAX];
let is_singlebyte_locale = !get_is_multibyte_locale();
// A `char` represents a Unicode scalar value, which takes up at most 4 bytes when encoded in UTF-8.
let mut converted = [0_u8; 4];
for c in input.chars() {
if let Some(byte) = decode_byte_from_char(c) {
let bytes = if let Some(byte) = decode_byte_from_char(c) {
converted[0] = byte;
if !func(&converted[..1]) {
return false;
}
} else if is_singlebyte_locale {
// single-byte locale (C/POSIX/ISO-8859)
// If `c` contains a wide character we emit a question-mark.
converted[0] = u8::try_from(u32::from(c)).unwrap_or(b'?');
if !func(&converted[..1]) {
return false;
}
&converted[..=0]
} else {
converted = [0; AT_LEAST_MB_LEN_MAX];
let len = unsafe {
wcrtomb(
std::ptr::addr_of_mut!(converted[0]).cast(),
c as u32,
&mut state,
)
};
if len == 0_usize.wrapping_sub(1) {
wcs2bytes_bad_char(c);
state = zero_mbstate();
} else if !func(&converted[..len]) {
return false;
}
c.encode_utf8(&mut converted).as_bytes()
};
if !func(bytes) {
return false;
}
}
true
}
fn wcs2bytes_bad_char(c: char) {
FLOGF!(
char_encoding,
L!("Wide character U+%4X has no narrow representation"),
c
);
}
/// Split a string by runs of any of the separator characters provided in `seps`.
/// Note the delimiters are the characters in `seps`, not `seps` itself.
/// `seps` may contain the NUL character.

View File

@@ -1,50 +0,0 @@
extern "C" {
#[cfg_attr(cygwin, link_name = "c32rtomb")]
pub fn wcrtomb(s: *mut libc::c_char, wc: u32, ps: *mut mbstate_t) -> usize;
#[cfg_attr(cygwin, link_name = "mbrtoc32")]
pub fn mbrtowc(pwc: *mut u32, s: *const libc::c_char, n: usize, p: *mut mbstate_t) -> usize;
}
// HACK This should be mbstate_t from libc but that's not exposed. Since it's only written by
// libc, we define it as opaque type that should be large enough for all implementations.
pub type mbstate_t = [u64; 16];
#[inline]
pub fn zero_mbstate() -> mbstate_t {
[0; 16]
}
// HACK This should be the MB_LEN_MAX macro from libc but that's not easy to get.
pub const AT_LEAST_MB_LEN_MAX: usize = 32;
/// Return true if we believe we are in a multibyte locale.
/// Note this reads the current locale and is modestly expensive - prefer the cached
/// values in `common.rs` which is set by `fish_setlocale`.
pub fn probe_is_multibyte_locale() -> bool {
// In general we would like to read MB_CUR_MAX, but that is not exposed by Rust libc.
// Instead, check if mbrtowc for any byte in the range 0-255 returns (size_t)(-2) which indicates
// the presence of a multibyte locale.
#[inline]
fn is_mb_lead(b: u8) -> bool {
let mut st = zero_mbstate();
let mut wc: libc::wchar_t = 0;
let c = b as libc::c_char;
let n = unsafe {
mbrtowc(
std::ptr::addr_of_mut!(wc).cast::<u32>(),
std::ptr::addr_of!(c),
1,
std::ptr::addr_of_mut!(st),
)
};
n == (-2_i64 as libc::size_t)
}
// Fast path: check common lead bytes.
if is_mb_lead(0xE0) || is_mb_lead(0xC2) {
return true;
}
// Scan non-ASCII high bytes.
(0x80_u8..=0xFF).any(is_mb_lead)
}

View File

@@ -1,5 +1,4 @@
pub mod dir_iter;
pub mod encoding;
pub mod errors;
pub mod fileid;
pub mod gettext;

View File

@@ -3,10 +3,6 @@
# see #7934.
#REQUIRES: test -z "$GITHUB_WORKFLOW"
# We typically try to force a utf8-capable locale,
# this turns that off.
set -gx fish_allow_singlebyte_locale 1
# A function to display bytes, necessary because GNU and BSD implementations of `od` have different output.
# We used to use xxd, but it's not available everywhere. See #3797.
#
@@ -22,73 +18,12 @@ echo -n A\u00FCA | display_bytes
#CHECK: 0000000 101 303 274 101
#CHECK: 0000004
# Verify that exporting a change to the C locale produces the expected output.
# The output should include the literal byte \xFC rather than the UTF-8 sequence for \u00FC.
begin
set -lx LC_ALL C
echo -n B\u00FCB | display_bytes
end
#CHECK: 0000000 102 374 102
#CHECK: 0000003
# Since the previous change was localized to a block it should no
# longer be in effect and we should be back to a UTF-8 locale.
echo -n C\u00FCC | display_bytes
#CHECK: 0000000 103 303 274 103
#CHECK: 0000004
# Verify that setting a non-exported locale var doesn't affect the behavior.
# The output should include the UTF-8 sequence for \u00FC rather than that literal byte.
# Just like the previous test.
begin
set -l LC_ALL C
echo -n D\u00FCD | display_bytes
end
#CHECK: 0000000 104 303 274 104
#CHECK: 0000004
# Verify that fish can pass through non-ASCII characters in the C/POSIX
# locale. This is to prevent regression of
# https://github.com/fish-shell/fish-shell/issues/2802.
#
# These tests are needed because the relevant standards allow the functions
# mbrtowc() and wcrtomb() to treat bytes with the high bit set as either valid
# or invalid in the C/POSIX locales. GNU libc treats those bytes as invalid.
# Other libc implementations (e.g., BSD) treat them as valid. We want fish to
# always treat those bytes as valid.
# The fish in the middle of the pipeline should be receiving a UTF-8 encoded
# version of the unicode from the echo. It should pass those bytes thru
# literally since it is in the C locale. We verify this by first passing the
# echo output directly to the `xxd` program then via a fish instance. The
# output should be "58c3bb58" for the first statement and "58c3bc58" for the
# second.
echo -n X\u00FBX | display_bytes
echo X\u00FCX | env LC_ALL=C $fish -c 'read foo; echo -n $foo' | display_bytes
#CHECK: 0000000 130 303 273 130
#CHECK: 0000004
#CHECK: 0000000 130 303 274 130
#CHECK: 0000004
# The next tests deliberately spawn another fish instance to test inheritance of env vars.
# This test is subtle. Despite the presence of the \u00fc unicode char (a "u"
# with an umlaut) the fact the locale is C/POSIX will cause the \xfc byte to
# be emitted rather than the usual UTF-8 sequence \xc3\xbc. That's because the
# few single-byte unicode chars (that are not ASCII) are generally in the
# ISO 8859-x char sets which are encompassed by the C locale. The output should
# be "59fc59".
env LC_ALL=C $fish -c 'echo -n Y\u00FCY' | display_bytes
#CHECK: 0000000 131 374 131
#CHECK: 0000003
# The user can specify a wide unicode character (one requiring more than a
# single byte). In the C/POSIX locales we substitute a question-mark for the
# unencodable wide char. The output should be "543f54".
env LC_ALL=C $fish -c 'echo -n T\u01FDT' | display_bytes
#CHECK: 0000000 124 077 124
#CHECK: 0000003
string match ö \Xc3\Xb6
#CHECK: ö