mirror of
https://github.com/fish-shell/fish-shell.git
synced 2026-06-03 23:11:14 -03:00
encoding: use UTF-8 everywhere
Assume that UTF-8 is used everywhere. This allows for significant simplification of encoding-related functionality. We no longer need branching on single-byte vs. multi-byte locales, and we can get rid of all the libc calls for encoding and decoding, replacing them with Rust's built-in functionality, or removing them without replacement in cases where their functionality is no longer needed. Several tests are removed from `tests/checks/locale.fish`, since setting the locale no longer impacts encoding behavior. We might want more rigorous testing of UTF-8 handling instead. Closes #11975
This commit is contained in:
committed by
Johannes Altmanninger
parent
755d5ae222
commit
c8001b5023
@@ -9,6 +9,8 @@ Notable improvements and fixes
|
||||
Deprecations and removed features
|
||||
---------------------------------
|
||||
|
||||
- Fish now assumes UTF-8 everywhere, regardless of locale settings. Input bytes which are not valid UTF-8 should still be round-tripped correctly.
|
||||
|
||||
Interactive improvements
|
||||
------------------------
|
||||
- :doc:`fish_config prompt {choose,save} <cmds/fish_config>` have been taught to reset :doc:`fish_mode_prompt <cmds/fish_mode_prompt>` in addition to the other prompt functions (:issue:`11937`).
|
||||
|
||||
@@ -13,7 +13,7 @@
|
||||
use crate::env::{EnvVar, EnvVarFlags};
|
||||
use crate::input_common::DecodeState;
|
||||
use crate::input_common::InvalidPolicy;
|
||||
use crate::input_common::decode_input_byte;
|
||||
use crate::input_common::decode_one_codepoint_utf8;
|
||||
use crate::nix::isatty;
|
||||
use crate::reader::ReaderConfig;
|
||||
use crate::reader::commandline_set_buffer;
|
||||
@@ -27,7 +27,6 @@
|
||||
use crate::wcstringutil::split_about;
|
||||
use crate::wcstringutil::split_string_tok;
|
||||
use crate::wutil;
|
||||
use crate::wutil::encoding::zero_mbstate;
|
||||
use crate::wutil::perror;
|
||||
use libc::SEEK_CUR;
|
||||
use std::num::NonZeroUsize;
|
||||
@@ -389,8 +388,6 @@ fn read_one_char_at_a_time(
|
||||
let mut unconsumed = vec![];
|
||||
|
||||
loop {
|
||||
let mut state = zero_mbstate();
|
||||
|
||||
let chars_read = buff.len();
|
||||
let res = loop {
|
||||
let mut b = [0_u8; 1];
|
||||
@@ -400,17 +397,9 @@ fn read_one_char_at_a_time(
|
||||
}
|
||||
_ => {}
|
||||
}
|
||||
let b = b[0];
|
||||
unconsumed.push(b);
|
||||
unconsumed.push(b[0]);
|
||||
nbytes += 1;
|
||||
let mut consumed = 0;
|
||||
match decode_input_byte(
|
||||
buff,
|
||||
InvalidPolicy::Passthrough,
|
||||
&mut state,
|
||||
&unconsumed,
|
||||
&mut consumed,
|
||||
) {
|
||||
match decode_one_codepoint_utf8(buff, InvalidPolicy::Passthrough, &unconsumed) {
|
||||
DecodeState::Incomplete => continue,
|
||||
DecodeState::Complete => {
|
||||
unconsumed.clear();
|
||||
|
||||
173
src/common.rs
173
src/common.rs
@@ -16,9 +16,6 @@
|
||||
use crate::wchar::{decode_byte_from_char, encode_byte_to_char, prelude::*};
|
||||
use crate::wcstringutil::wcs2bytes_callback;
|
||||
use crate::wildcard::{ANY_CHAR, ANY_STRING, ANY_STRING_RECURSIVE};
|
||||
use crate::wutil::encoding::{
|
||||
AT_LEAST_MB_LEN_MAX, mbrtowc, probe_is_multibyte_locale, wcrtomb, zero_mbstate,
|
||||
};
|
||||
use crate::wutil::fish_iswalnum;
|
||||
use bitflags::bitflags;
|
||||
use libc::{SIG_IGN, SIGTTOU, STDIN_FILENO};
|
||||
@@ -190,7 +187,7 @@ fn escape_string_script(input: &wstr, flags: EscapeFlags) -> WString {
|
||||
let no_quoted = flags.contains(EscapeFlags::NO_QUOTED);
|
||||
let no_tilde = flags.contains(EscapeFlags::NO_TILDE);
|
||||
let no_qmark = feature_test(FeatureFlag::qmark_noglob);
|
||||
let symbolic = flags.contains(EscapeFlags::SYMBOLIC) && get_is_multibyte_locale();
|
||||
let symbolic = flags.contains(EscapeFlags::SYMBOLIC);
|
||||
|
||||
assert!(
|
||||
!symbolic || !escape_printables,
|
||||
@@ -1035,19 +1032,15 @@ pub fn shell_modes() -> MutexGuard<'static, libc::termios> {
|
||||
/// The character to use where the text has been truncated. Always the Unicode horizontal ellipsis
|
||||
/// (U+2026), independent of the locale.
|
||||
pub fn get_ellipsis_char() -> char {
|
||||
char::from_u32(ELLIPSIS_CHAR.load(Ordering::Relaxed)).unwrap()
|
||||
'\u{2026}'
|
||||
}
|
||||
|
||||
static ELLIPSIS_CHAR: AtomicU32 = AtomicU32::new(0);
|
||||
|
||||
/// The string to use where text has been truncated: always the Unicode horizontal ellipsis
|
||||
/// (U+2026).
|
||||
pub fn get_ellipsis_str() -> &'static wstr {
|
||||
ELLIPSIS_STRING.load()
|
||||
L!("\u{2026}")
|
||||
}
|
||||
|
||||
static ELLIPSIS_STRING: AtomicRef<wstr> = AtomicRef::new(&L!(""));
|
||||
|
||||
/// Character representing an omitted newline at the end of text.
|
||||
pub fn get_omitted_newline_str() -> &'static wstr {
|
||||
OMITTED_NEWLINE_STR.load()
|
||||
@@ -1065,13 +1058,6 @@ pub fn get_obfuscation_read_char() -> char {
|
||||
char::from_u32(OBFUSCATION_READ_CHAR.load(Ordering::Relaxed)).unwrap()
|
||||
}
|
||||
|
||||
static IS_MB_LOCALE: RelaxedAtomicBool = RelaxedAtomicBool::new(false);
|
||||
|
||||
/// Whether we believe we are in a multibyte locale.
|
||||
pub fn get_is_multibyte_locale() -> bool {
|
||||
IS_MB_LOCALE.load()
|
||||
}
|
||||
|
||||
/// Profiling flag. True if commands should be profiled.
|
||||
pub static PROFILING_ACTIVE: RelaxedAtomicBool = RelaxedAtomicBool::new(false);
|
||||
|
||||
@@ -1102,84 +1088,54 @@ pub fn has_working_tty_timestamps() -> bool {
|
||||
/// todo!("Maybe remove the box? It is only needed for get_bg_context.")
|
||||
pub type CancelChecker = Box<dyn Fn() -> bool>;
|
||||
|
||||
/// Converts the narrow character string \c in into its wide equivalent, and return it.
|
||||
///
|
||||
/// The string may contain embedded nulls.
|
||||
///
|
||||
/// This function encodes illegal character sequences in a reversible way using the private use
|
||||
/// area.
|
||||
pub fn bytes2wcstring(inp: &[u8]) -> WString {
|
||||
if inp.is_empty() {
|
||||
/// Encodes the bytes in `input` into a [`WString`], encoding non-UTF-8 bytes into private-use-area
|
||||
/// code-points. Bytes which would be parsed into our reserved PUA range are encoded individually,
|
||||
/// to allow for correct round-tripping.
|
||||
pub fn bytes2wcstring(mut input: &[u8]) -> WString {
|
||||
if input.is_empty() {
|
||||
return WString::new();
|
||||
}
|
||||
|
||||
let mut result = WString::new();
|
||||
result.reserve(inp.len());
|
||||
let mut pos = 0;
|
||||
let mut state = zero_mbstate();
|
||||
while pos < inp.len() {
|
||||
// Append any initial sequence of ascii characters.
|
||||
// Note we do not support character sets which are not supersets of ASCII.
|
||||
let ascii_prefix_length = count_ascii_prefix(&inp[pos..]);
|
||||
result.push_str(std::str::from_utf8(&inp[pos..pos + ascii_prefix_length]).unwrap());
|
||||
pos += ascii_prefix_length;
|
||||
assert!(pos <= inp.len(), "Position overflowed length");
|
||||
if pos == inp.len() {
|
||||
break;
|
||||
}
|
||||
|
||||
// We have found a non-ASCII character.
|
||||
let mut ret = 0;
|
||||
let mut c = '\0';
|
||||
|
||||
let use_encode_direct = if inp[pos] & 0xF8 == 0xF8 {
|
||||
// Protect against broken mbrtowc() implementations which attempt to encode UTF-8
|
||||
// sequences longer than four bytes (e.g., OS X Snow Leopard).
|
||||
// TODO This check used to be conditionally compiled only on affected platforms.
|
||||
true
|
||||
} else {
|
||||
let mut codepoint = u32::from(c);
|
||||
ret = unsafe {
|
||||
mbrtowc(
|
||||
std::ptr::addr_of_mut!(codepoint),
|
||||
std::ptr::addr_of!(inp[pos]).cast(),
|
||||
inp.len() - pos,
|
||||
&mut state,
|
||||
)
|
||||
};
|
||||
match char::from_u32(codepoint) {
|
||||
Some(codepoint) => {
|
||||
c = codepoint;
|
||||
// Determine whether to encode this character with our crazy scheme.
|
||||
fish_reserved_codepoint(c)
|
||||
||
|
||||
// Incomplete sequence.
|
||||
ret == 0_usize.wrapping_sub(2)
|
||||
||
|
||||
// Invalid data.
|
||||
ret == 0_usize.wrapping_sub(1)
|
||||
||
|
||||
// Other error codes? Terrifying, should never happen.
|
||||
ret > inp.len() - pos
|
||||
fn append_escaped_str(output: &mut WString, input: &str) {
|
||||
for (i, c) in input.char_indices() {
|
||||
if fish_reserved_codepoint(c) {
|
||||
for byte in &input.as_bytes()[i..i + c.len_utf8()] {
|
||||
output.push(encode_byte_to_char(*byte));
|
||||
}
|
||||
None => true,
|
||||
} else {
|
||||
output.push(c);
|
||||
}
|
||||
};
|
||||
}
|
||||
}
|
||||
|
||||
if use_encode_direct {
|
||||
c = encode_byte_to_char(inp[pos]);
|
||||
result.push(c);
|
||||
pos += 1;
|
||||
state = zero_mbstate();
|
||||
} else if ret == 0 {
|
||||
// embedded null byte!
|
||||
result.push('\0');
|
||||
pos += 1;
|
||||
state = zero_mbstate();
|
||||
} else {
|
||||
// normal case
|
||||
result.push(c);
|
||||
pos += ret;
|
||||
while !input.is_empty() {
|
||||
match std::str::from_utf8(input) {
|
||||
Ok(parsed_str) => {
|
||||
append_escaped_str(&mut result, parsed_str);
|
||||
// The entire remaining input could be parsed, so we are done.
|
||||
break;
|
||||
}
|
||||
Err(e) => {
|
||||
let (valid, after_valid) = input.split_at(e.valid_up_to());
|
||||
// SAFETY: The previous `str::from_utf8` call established that the prefix `valid`
|
||||
// is valid UTF-8. This prefix may be empty.
|
||||
let parsed_str = unsafe { std::str::from_utf8_unchecked(valid) };
|
||||
append_escaped_str(&mut result, parsed_str);
|
||||
// The length of the prefix of `after_valid` which is invalid UTF-8.
|
||||
// The remaining bytes of `input` (if any) will be parsed in subsequent iterations
|
||||
// of the loop, starting from the first byte that starts a valid UTF-8-encoded codepoint.
|
||||
// `error_len` can return `None`, if it sees a byte sequence that could be the
|
||||
// prefix of a valid code-point encoding at the end of the byte slice.
|
||||
// This is useful when the input is chunked, but we don't do that, so in this case
|
||||
// we use our custom encoding for all remaining bytes (at most 3).
|
||||
let error_len = e.error_len().unwrap_or(after_valid.len());
|
||||
for byte in &after_valid[..error_len] {
|
||||
result.push(encode_byte_to_char(*byte));
|
||||
}
|
||||
input = &after_valid[error_len..];
|
||||
}
|
||||
}
|
||||
}
|
||||
result
|
||||
@@ -1265,12 +1221,6 @@ pub fn wcs2bytes_appending(output: &mut Vec<u8>, input: &wstr) {
|
||||
});
|
||||
}
|
||||
|
||||
/// Return the count of initial characters in `in` which are ASCII.
|
||||
fn count_ascii_prefix(inp: &[u8]) -> usize {
|
||||
// The C++ version had manual vectorization.
|
||||
inp.iter().take_while(|c| c.is_ascii()).count()
|
||||
}
|
||||
|
||||
// Check if we are running in the test mode, where we should suppress error output
|
||||
pub const TESTS_PROGRAM_NAME: &wstr = L!("(ignore)");
|
||||
|
||||
@@ -1302,22 +1252,6 @@ macro_rules! LL {
|
||||
}};
|
||||
}
|
||||
|
||||
// Mark if we are a multibyte locale.
|
||||
IS_MB_LOCALE.store(probe_is_multibyte_locale());
|
||||
|
||||
// Use various Unicode symbols if they can be encoded using the current locale, else a simple
|
||||
// ASCII char alternative. All of the can_be_encoded() invocations should return the same
|
||||
// true/false value since the code points are in the BMP but we're going to be paranoid. This
|
||||
// is also technically wrong if we're not in a Unicode locale but we expect (or hope)
|
||||
// can_be_encoded() will return false in that case.
|
||||
if can_be_encoded('\u{2026}') {
|
||||
ELLIPSIS_CHAR.store(u32::from('\u{2026}'), Ordering::Relaxed);
|
||||
ELLIPSIS_STRING.store(LL!("\u{2026}"));
|
||||
} else {
|
||||
ELLIPSIS_CHAR.store(u32::from('$'), Ordering::Relaxed); // "horizontal ellipsis"
|
||||
ELLIPSIS_STRING.store(LL!("..."));
|
||||
}
|
||||
|
||||
if is_windows_subsystem_for_linux(WSL::Any) {
|
||||
// neither of \u23CE and \u25CF can be displayed in the default fonts on Windows, though
|
||||
// they can be *encoded* just fine. Use alternative glyphs.
|
||||
@@ -1327,29 +1261,16 @@ macro_rules! LL {
|
||||
OMITTED_NEWLINE_STR.store(LL!("^J"));
|
||||
OBFUSCATION_READ_CHAR.store(u32::from('*'), Ordering::Relaxed);
|
||||
} else {
|
||||
if can_be_encoded('\u{23CE}') {
|
||||
OMITTED_NEWLINE_STR.store(LL!("\u{23CE}")); // "return symbol" (⏎)
|
||||
} else {
|
||||
OMITTED_NEWLINE_STR.store(LL!("^J"));
|
||||
}
|
||||
OMITTED_NEWLINE_STR.store(LL!("\u{23CE}")); // "return symbol" (⏎)
|
||||
OBFUSCATION_READ_CHAR.store(
|
||||
u32::from(if can_be_encoded('\u{25CF}') {
|
||||
'\u{25CF}' // "black circle"
|
||||
} else {
|
||||
'#'
|
||||
}),
|
||||
u32::from(
|
||||
'\u{25CF}', // "black circle"
|
||||
),
|
||||
Ordering::Relaxed,
|
||||
);
|
||||
}
|
||||
}
|
||||
|
||||
/// Test if the character can be encoded using the current locale.
|
||||
fn can_be_encoded(wc: char) -> bool {
|
||||
let mut converted = [0 as libc::c_char; AT_LEAST_MB_LEN_MAX];
|
||||
let mut state = zero_mbstate();
|
||||
unsafe { wcrtomb(converted.as_mut_ptr(), wc as u32, &mut state) != 0_usize.wrapping_sub(1) }
|
||||
}
|
||||
|
||||
/// Call read, blocking and repeating on EINTR. Exits on EAGAIN.
|
||||
/// Return the number of bytes read, or 0 on EOF, or an error.
|
||||
pub fn read_blocked(fd: RawFd, buf: &mut [u8]) -> nix::Result<usize> {
|
||||
|
||||
@@ -14,7 +14,6 @@
|
||||
use crate::terminal::use_terminfo;
|
||||
use crate::tty_handoff::xtversion;
|
||||
use crate::wchar::prelude::*;
|
||||
use crate::wutil::encoding::probe_is_multibyte_locale;
|
||||
use crate::wutil::fish_wcstoi;
|
||||
use crate::{function, terminal};
|
||||
use std::borrow::Cow;
|
||||
@@ -25,12 +24,16 @@
|
||||
|
||||
/// List of all locale environment variable names that might trigger (re)initializing of the locale
|
||||
/// subsystem. These are only the variables we're possibly interested in.
|
||||
#[rustfmt::skip]
|
||||
const LOCALE_VARIABLES: [&wstr; 10] = [
|
||||
L!("LANG"), L!("LANGUAGE"), L!("LC_ALL"),
|
||||
L!("LC_COLLATE"), L!("LC_CTYPE"), L!("LC_MESSAGES"),
|
||||
L!("LC_NUMERIC"), L!("LC_TIME"), L!("LOCPATH"),
|
||||
L!("fish_allow_singlebyte_locale"),
|
||||
const LOCALE_VARIABLES: [&wstr; 9] = [
|
||||
L!("LANG"),
|
||||
L!("LANGUAGE"),
|
||||
L!("LC_ALL"),
|
||||
L!("LC_COLLATE"),
|
||||
L!("LC_CTYPE"),
|
||||
L!("LC_MESSAGES"),
|
||||
L!("LC_NUMERIC"),
|
||||
L!("LC_TIME"),
|
||||
L!("LOCPATH"),
|
||||
];
|
||||
|
||||
#[rustfmt::skip]
|
||||
@@ -299,8 +302,6 @@ fn handle_tz_change(var_name: &wstr, vars: &EnvStack) {
|
||||
|
||||
fn handle_locale_change(vars: &EnvStack) {
|
||||
init_locale(vars);
|
||||
// We need to re-guess emoji width because the locale might have changed to a multibyte one.
|
||||
guess_emoji_width(vars);
|
||||
}
|
||||
|
||||
fn handle_term_change(vars: &EnvStack) {
|
||||
@@ -502,11 +503,6 @@ pub fn read_terminfo_database(vars: &EnvStack) {
|
||||
fn init_locale(vars: &EnvStack) {
|
||||
let _guard = crate::locale::LOCALE_LOCK.lock().unwrap();
|
||||
|
||||
#[rustfmt::skip]
|
||||
const UTF8_LOCALES: &[&str] = &[
|
||||
"C.UTF-8", "en_US.UTF-8", "en_GB.UTF-8", "de_DE.UTF-8", "C.utf8", "UTF-8",
|
||||
];
|
||||
|
||||
let old_msg_locale: CString = {
|
||||
let old = unsafe { libc::setlocale(libc::LC_MESSAGES, ptr::null()) };
|
||||
assert_ne!(old, ptr::null_mut());
|
||||
@@ -541,33 +537,6 @@ fn init_locale(vars: &EnvStack) {
|
||||
}
|
||||
};
|
||||
|
||||
// Try to get a multibyte-capable encoding.
|
||||
// A "C" locale is broken for our purposes: any wchar function will break on it. So we try
|
||||
// *really, really, really hard* to not have one.
|
||||
let fix_locale = vars
|
||||
.get_unless_empty(L!("fish_allow_singlebyte_locale"))
|
||||
.map(|v| v.as_string())
|
||||
.map(|allow_c| !crate::wcstringutil::bool_from_string(&allow_c))
|
||||
.unwrap_or(true);
|
||||
|
||||
if fix_locale && !probe_is_multibyte_locale() {
|
||||
FLOG!(env_locale, "Have single byte locale, trying to fix.");
|
||||
let mut fixed = false;
|
||||
for locale in UTF8_LOCALES {
|
||||
let locale_cstr = CString::new(*locale).unwrap();
|
||||
// this can fail, that is fine
|
||||
unsafe { libc::setlocale(libc::LC_CTYPE, locale_cstr.as_ptr()) };
|
||||
if probe_is_multibyte_locale() {
|
||||
FLOG!(env_locale, "Fixed locale:", locale);
|
||||
fixed = true;
|
||||
break;
|
||||
}
|
||||
}
|
||||
if !fixed {
|
||||
FLOG!(env_locale, "Failed to fix locale.");
|
||||
}
|
||||
}
|
||||
|
||||
// We *always* use a C-locale for numbers because we want '.' (except for in printf).
|
||||
let loc_ptr = unsafe { libc::setlocale(libc::LC_NUMERIC, c"C".as_ptr().cast()) };
|
||||
// should never fail, the C locale should always be defined
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
use crate::common::{
|
||||
WSL, bytes2wcstring, fish_reserved_codepoint, get_is_multibyte_locale,
|
||||
is_windows_subsystem_for_linux, read_blocked, shell_modes,
|
||||
WSL, bytes2wcstring, fish_reserved_codepoint, is_windows_subsystem_for_linux, read_blocked,
|
||||
shell_modes,
|
||||
};
|
||||
use crate::env::{EnvStack, Environment};
|
||||
use crate::fd_readable_set::{FdReadableSet, Timeout};
|
||||
@@ -17,7 +17,6 @@
|
||||
};
|
||||
use crate::universal_notifier::default_notifier;
|
||||
use crate::wchar::{encode_byte_to_char, prelude::*};
|
||||
use crate::wutil::encoding::{mbrtowc, mbstate_t, zero_mbstate};
|
||||
use crate::wutil::{fish_is_pua, fish_wcstol};
|
||||
use std::cell::{RefCell, RefMut};
|
||||
use std::collections::VecDeque;
|
||||
@@ -826,9 +825,7 @@ fn try_pop(&mut self) -> Option<CharEvent> {
|
||||
self.get_input_data_mut().queue.pop_front()
|
||||
}
|
||||
|
||||
/// Function used by [`readch`](Self::readch) to read bytes from stdin until enough bytes have been read to
|
||||
/// convert them to a wchar_t. Conversion is done using mbrtowc. If a character has previously
|
||||
/// been read and then 'unread' using \c input_common_unreadch, that character is returned.
|
||||
/// Read the next event, such as a UTF-8-encoded codepoint.
|
||||
fn readch(&mut self) -> CharEvent {
|
||||
loop {
|
||||
// Do we have something enqueued already?
|
||||
@@ -871,7 +868,7 @@ fn readch(&mut self) -> CharEvent {
|
||||
InputEventTrigger::Byte(read_byte) => {
|
||||
let mut have_escape_prefix = false;
|
||||
let mut buffer = vec![read_byte];
|
||||
let key_with_escape = if read_byte == 0x1b {
|
||||
let mut key = if read_byte == 0x1b {
|
||||
self.parse_escape_sequence(&mut buffer, &mut have_escape_prefix)
|
||||
} else {
|
||||
canonicalize_control_char(read_byte).map(KeyEvent::from)
|
||||
@@ -883,47 +880,35 @@ fn readch(&mut self) -> CharEvent {
|
||||
continue;
|
||||
}
|
||||
let mut seq = WString::new();
|
||||
let mut key = key_with_escape;
|
||||
if key.is_some_and(|key| key.key == Key::from_raw(key::Invalid)) {
|
||||
continue;
|
||||
}
|
||||
assert!(key.is_none_or(|key| key.codepoint != key::Invalid));
|
||||
let mut consumed = 0;
|
||||
let mut state = zero_mbstate();
|
||||
let mut i = 0;
|
||||
// At this point, the bytes in `buffer` should be parsed as a UTF-8 sequence,
|
||||
// or, if they are not valid UTF-8, ignored. On incomplete sequences, another
|
||||
// byte is read and decoding is tried again in the next iteration.
|
||||
let ok = loop {
|
||||
if i == buffer.len() {
|
||||
buffer.push(
|
||||
match next_input_event(self.get_in_fd(), Timeout::Forever) {
|
||||
InputEventTrigger::Byte(b) => b,
|
||||
_ => 0,
|
||||
},
|
||||
);
|
||||
}
|
||||
match decode_input_byte(
|
||||
&mut seq,
|
||||
InvalidPolicy::Error,
|
||||
&mut state,
|
||||
&buffer[..i + 1],
|
||||
&mut consumed,
|
||||
) {
|
||||
DecodeState::Incomplete => (),
|
||||
match decode_one_codepoint_utf8(&mut seq, InvalidPolicy::Error, &buffer) {
|
||||
DecodeState::Incomplete => {
|
||||
buffer.push(
|
||||
match next_input_event(self.get_in_fd(), Timeout::Forever) {
|
||||
InputEventTrigger::Byte(b) => b,
|
||||
_ => 0,
|
||||
},
|
||||
);
|
||||
}
|
||||
DecodeState::Complete => {
|
||||
if have_escape_prefix && i != 0 {
|
||||
have_escape_prefix = false;
|
||||
if have_escape_prefix {
|
||||
let c = seq.as_char_slice().last().unwrap();
|
||||
key = Some(KeyEvent::from(alt(*c)));
|
||||
}
|
||||
if i + 1 == buffer.len() {
|
||||
break true;
|
||||
}
|
||||
break true;
|
||||
}
|
||||
DecodeState::Error => {
|
||||
self.push_front(CharEvent::from_check_exit());
|
||||
break false;
|
||||
}
|
||||
}
|
||||
i += 1;
|
||||
};
|
||||
if !ok {
|
||||
continue;
|
||||
@@ -1686,63 +1671,37 @@ pub(crate) enum InvalidPolicy {
|
||||
Passthrough,
|
||||
}
|
||||
|
||||
pub(crate) fn decode_input_byte(
|
||||
pub(crate) fn decode_one_codepoint_utf8(
|
||||
out_seq: &mut WString,
|
||||
invalid_policy: InvalidPolicy,
|
||||
state: &mut mbstate_t,
|
||||
buffer: &[u8],
|
||||
consumed: &mut usize,
|
||||
) -> DecodeState {
|
||||
use DecodeState::*;
|
||||
let mut res: char = '\0';
|
||||
let read_byte = *buffer.last().unwrap();
|
||||
if !get_is_multibyte_locale() {
|
||||
// single-byte locale, all values are legal
|
||||
res = read_byte.into();
|
||||
out_seq.push(res);
|
||||
return Complete;
|
||||
}
|
||||
let mut invalid = |out_seq: &mut WString, log_error: fn()| match invalid_policy {
|
||||
InvalidPolicy::Error => {
|
||||
(log_error)();
|
||||
Error
|
||||
}
|
||||
InvalidPolicy::Passthrough => {
|
||||
for &b in &buffer[*consumed..] {
|
||||
out_seq.push(encode_byte_to_char(b));
|
||||
match std::str::from_utf8(buffer) {
|
||||
Ok(parsed_str) => {
|
||||
for c in parsed_str.chars() {
|
||||
if !fish_reserved_codepoint(c) {
|
||||
out_seq.push(c);
|
||||
}
|
||||
}
|
||||
*consumed = buffer.len();
|
||||
Complete
|
||||
}
|
||||
};
|
||||
let mut codepoint = u32::from(res);
|
||||
match unsafe {
|
||||
mbrtowc(
|
||||
std::ptr::addr_of_mut!(codepoint),
|
||||
std::ptr::addr_of!(read_byte).cast(),
|
||||
1,
|
||||
state,
|
||||
)
|
||||
} as isize
|
||||
{
|
||||
-1 => {
|
||||
return invalid(out_seq, || FLOG!(reader, "Illegal input encoding"));
|
||||
}
|
||||
-2 => {
|
||||
// Sequence not yet complete.
|
||||
return Incomplete;
|
||||
}
|
||||
_ => (),
|
||||
Err(e) => match e.error_len() {
|
||||
Some(_) => match invalid_policy {
|
||||
InvalidPolicy::Error => {
|
||||
FLOG!(reader, "Illegal input encoding");
|
||||
Error
|
||||
}
|
||||
InvalidPolicy::Passthrough => {
|
||||
for &b in buffer {
|
||||
out_seq.push(encode_byte_to_char(b));
|
||||
}
|
||||
Complete
|
||||
}
|
||||
},
|
||||
None => Incomplete,
|
||||
},
|
||||
}
|
||||
if let Some(res) = char::from_u32(codepoint) {
|
||||
// Sequence complete.
|
||||
if !fish_reserved_codepoint(res) {
|
||||
*consumed += 1;
|
||||
out_seq.push(res);
|
||||
return Complete;
|
||||
}
|
||||
}
|
||||
invalid(out_seq, || FLOG!(reader, "Illegal codepoint"))
|
||||
}
|
||||
|
||||
pub(crate) fn stop_query(mut query: RefMut<'_, Option<TerminalQuery>>) -> bool {
|
||||
|
||||
24
src/pager.rs
24
src/pager.rs
@@ -5,7 +5,6 @@
|
||||
|
||||
use crate::common::{
|
||||
EscapeFlags, EscapeStringStyle, escape_string, get_ellipsis_char, get_ellipsis_str,
|
||||
get_is_multibyte_locale,
|
||||
};
|
||||
use crate::complete::Completion;
|
||||
use crate::editable_line::EditableLine;
|
||||
@@ -1231,19 +1230,16 @@ fn process_completions_into_infos(lst: &[Completion]) -> Vec<PagerComp> {
|
||||
EscapeFlags::NO_PRINTABLES | EscapeFlags::NO_QUOTED | EscapeFlags::SYMBOLIC,
|
||||
),
|
||||
));
|
||||
if comp.replaces_line()
|
||||
// HACK We want to render a full shell command, with syntax highlighting. Above we
|
||||
// escape nonprintables, which might make the rendered command longer than the original
|
||||
// completion. In that case we get wrong colors. However this should only happen in
|
||||
// contrived cases, since our symbolic escaping uses a single character to represent
|
||||
// newline and tab characters; other nonprintables are extremely rare in a command
|
||||
// line. It will only be common for single-byte locales where we don't
|
||||
// use Unicode characters for escaping, so just disable those here.
|
||||
// We should probably fix this by first highlighting the original completion, and
|
||||
// then writing a variant of escape_string() that adjusts highlighting according
|
||||
// so it matches the escaped string.
|
||||
&& get_is_multibyte_locale()
|
||||
{
|
||||
// HACK We want to render a full shell command, with syntax highlighting. Above we
|
||||
// escape nonprintables, which might make the rendered command longer than the original
|
||||
// completion. In that case we get wrong colors. However this should only happen in
|
||||
// contrived cases, since our symbolic escaping uses a single character to represent
|
||||
// newline and tab characters; other nonprintables are extremely rare in a command
|
||||
// line.
|
||||
// We should probably fix this by first highlighting the original completion, and
|
||||
// then writing a variant of escape_string() that adjusts highlighting accordingly
|
||||
// so it matches the escaped string.
|
||||
if comp.replaces_line() {
|
||||
highlight_shell(
|
||||
&comp.completion,
|
||||
&mut comp_info.colors,
|
||||
|
||||
@@ -55,9 +55,8 @@
|
||||
use crate::common::ScopeGuarding;
|
||||
use crate::common::{
|
||||
EscapeFlags, EscapeStringStyle, PROGRAM_NAME, ScopeGuard, UTF8_BOM_WCHAR, bytes2wcstring,
|
||||
escape, escape_string, exit_without_destructors, get_ellipsis_char, get_is_multibyte_locale,
|
||||
get_obfuscation_read_char, restore_term_foreground_process_group_for_exit, shell_modes,
|
||||
write_loop,
|
||||
escape, escape_string, exit_without_destructors, get_ellipsis_char, get_obfuscation_read_char,
|
||||
restore_term_foreground_process_group_for_exit, shell_modes, write_loop,
|
||||
};
|
||||
use crate::complete::{
|
||||
CompleteFlags, Completion, CompletionList, CompletionRequestOptions, complete, complete_load,
|
||||
@@ -3220,14 +3219,7 @@ fn handle_readline_command(&mut self, c: ReadlineCmd) {
|
||||
self.history_pager = Some(0..1);
|
||||
// Update the pager data.
|
||||
self.pager.set_search_field_shown(true);
|
||||
self.pager.set_prefix(
|
||||
if get_is_multibyte_locale() {
|
||||
L!("► ")
|
||||
} else {
|
||||
L!("> ")
|
||||
},
|
||||
/*highlight=*/ false,
|
||||
);
|
||||
self.pager.set_prefix(L!("► "), false);
|
||||
// Update the search field, which triggers the actual history search.
|
||||
let search_string = if !self.history_search.active()
|
||||
|| self.history_search.search_string().is_empty()
|
||||
|
||||
@@ -1,40 +1,11 @@
|
||||
use std::sync::MutexGuard;
|
||||
|
||||
use crate::common::{
|
||||
ENCODE_DIRECT_BASE, ENCODE_DIRECT_END, EscapeFlags, EscapeStringStyle, UnescapeStringStyle,
|
||||
bytes2wcstring, escape_string, fish_setlocale, unescape_string, wcs2bytes,
|
||||
bytes2wcstring, escape_string, unescape_string, wcs2bytes,
|
||||
};
|
||||
use crate::locale::LOCALE_LOCK;
|
||||
use crate::util::{get_rng_seed, get_seeded_rng};
|
||||
use crate::wchar::{L, WString, wstr};
|
||||
use crate::wutil::encoding::{
|
||||
AT_LEAST_MB_LEN_MAX, probe_is_multibyte_locale, wcrtomb, zero_mbstate,
|
||||
};
|
||||
use rand::{Rng, RngCore};
|
||||
|
||||
/// wcs2bytes is locale-dependent, so ensure we have a multibyte locale
|
||||
/// before using it in a test.
|
||||
fn setlocale() -> MutexGuard<'static, ()> {
|
||||
let guard = LOCALE_LOCK.lock().unwrap();
|
||||
|
||||
#[rustfmt::skip]
|
||||
const UTF8_LOCALES: &[&str] = &[
|
||||
"C.UTF-8", "en_US.UTF-8", "en_GB.UTF-8", "de_DE.UTF-8", "C.utf8", "UTF-8",
|
||||
];
|
||||
if probe_is_multibyte_locale() {
|
||||
return guard;
|
||||
}
|
||||
for locale in UTF8_LOCALES {
|
||||
let locale = std::ffi::CString::new(locale.to_owned()).unwrap();
|
||||
unsafe { libc::setlocale(libc::LC_CTYPE, locale.as_ptr()) };
|
||||
if probe_is_multibyte_locale() {
|
||||
fish_setlocale(); // Update cached locale information.
|
||||
return guard;
|
||||
}
|
||||
}
|
||||
panic!("No UTF-8 locale found");
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_escape_string() {
|
||||
let regex = |input| escape_string(input, EscapeStringStyle::Regex);
|
||||
@@ -105,7 +76,6 @@ fn test_escape_var() {
|
||||
}
|
||||
|
||||
fn escape_test(escape_style: EscapeStringStyle, unescape_style: UnescapeStringStyle) {
|
||||
let _locale_guard = setlocale();
|
||||
let seed: u128 = 92348567983274852905629743984572;
|
||||
let mut rng = get_seeded_rng(seed);
|
||||
|
||||
@@ -185,7 +155,6 @@ fn bytes2hex(input: &[u8]) -> String {
|
||||
/// string comes back through double conversion.
|
||||
#[test]
|
||||
fn test_convert() {
|
||||
let _locale_guard = setlocale();
|
||||
let seed = get_rng_seed();
|
||||
let mut rng = get_seeded_rng(seed);
|
||||
let mut origin = Vec::new();
|
||||
@@ -241,30 +210,18 @@ fn test_convert_ascii() {
|
||||
}
|
||||
}
|
||||
|
||||
/// fish uses the private-use range to encode bytes that could not be decoded using the
|
||||
/// user's locale. If the input could be decoded, but decoded to private-use codepoints,
|
||||
/// then fish should also use the direct encoding for those bytes. Verify that characters
|
||||
/// in the private use area are correctly round-tripped. See #7723.
|
||||
/// fish uses the private-use range to encode bytes that are not valid UTF-8.
|
||||
/// If the input decodes to these private-use codepoints,
|
||||
/// then fish should also use the direct encoding for those bytes.
|
||||
/// Verify that characters in the private use area are correctly round-tripped. See #7723.
|
||||
#[test]
|
||||
fn test_convert_private_use() {
|
||||
for c in ENCODE_DIRECT_BASE..ENCODE_DIRECT_END {
|
||||
// Encode the char via the locale. Do not use fish functions which interpret these
|
||||
// specially.
|
||||
let mut converted = [0_u8; AT_LEAST_MB_LEN_MAX];
|
||||
let mut state = zero_mbstate();
|
||||
let len = unsafe {
|
||||
wcrtomb(
|
||||
std::ptr::addr_of_mut!(converted[0]).cast(),
|
||||
c as u32,
|
||||
&mut state,
|
||||
)
|
||||
};
|
||||
if len == 0_usize.wrapping_sub(1) {
|
||||
// Could not be encoded in this locale.
|
||||
continue;
|
||||
}
|
||||
let s = &converted[..len];
|
||||
|
||||
// A `char` represents a Unicode scalar value, which takes up at most 4 bytes when encoded in UTF-8.
|
||||
// TODO MSRV(1.92?) replace 4 by `char::MAX_LEN_UTF8` once that's available in our MSRV.
|
||||
// https://doc.rust-lang.org/std/primitive.char.html#associatedconstant.MAX_LEN_UTF8
|
||||
let mut converted = [0_u8; 4];
|
||||
let s = c.encode_utf8(&mut converted).as_bytes();
|
||||
// Ask fish to decode this via bytes2wcstring.
|
||||
// bytes2wcstring should notice that the decoded form collides with its private use
|
||||
// and encode it directly.
|
||||
|
||||
@@ -1,10 +1,8 @@
|
||||
//! Helper functions for working with wcstring.
|
||||
|
||||
use crate::common::{get_ellipsis_char, get_ellipsis_str, get_is_multibyte_locale};
|
||||
use crate::common::{get_ellipsis_char, get_ellipsis_str};
|
||||
use crate::fallback::{fish_wcwidth, wcscasecmp, wcscasecmp_fuzzy};
|
||||
use crate::flog::FLOGF;
|
||||
use crate::wchar::{decode_byte_from_char, prelude::*};
|
||||
use crate::wutil::encoding::{AT_LEAST_MB_LEN_MAX, wcrtomb, zero_mbstate};
|
||||
|
||||
/// Return the number of newlines in a string.
|
||||
pub fn count_newlines(s: &wstr) -> usize {
|
||||
@@ -299,56 +297,28 @@ pub fn string_fuzzy_match_string(
|
||||
}
|
||||
|
||||
/// Implementation of wcs2bytes that accepts a callback.
|
||||
/// This invokes `func` with (const char*, size_t) pairs.
|
||||
/// This invokes `func` with byte slices containing the UTF-8 encoding of the characters in the
|
||||
/// input, doing one invocation per character.
|
||||
/// If `func` returns false, it stops; otherwise it continues.
|
||||
/// Return false if the callback returned false, otherwise true.
|
||||
pub fn wcs2bytes_callback(input: &wstr, mut func: impl FnMut(&[u8]) -> bool) -> bool {
|
||||
let mut state = zero_mbstate();
|
||||
let mut converted = [0_u8; AT_LEAST_MB_LEN_MAX];
|
||||
|
||||
let is_singlebyte_locale = !get_is_multibyte_locale();
|
||||
// A `char` represents a Unicode scalar value, which takes up at most 4 bytes when encoded in UTF-8.
|
||||
let mut converted = [0_u8; 4];
|
||||
|
||||
for c in input.chars() {
|
||||
if let Some(byte) = decode_byte_from_char(c) {
|
||||
let bytes = if let Some(byte) = decode_byte_from_char(c) {
|
||||
converted[0] = byte;
|
||||
if !func(&converted[..1]) {
|
||||
return false;
|
||||
}
|
||||
} else if is_singlebyte_locale {
|
||||
// single-byte locale (C/POSIX/ISO-8859)
|
||||
// If `c` contains a wide character we emit a question-mark.
|
||||
converted[0] = u8::try_from(u32::from(c)).unwrap_or(b'?');
|
||||
if !func(&converted[..1]) {
|
||||
return false;
|
||||
}
|
||||
&converted[..=0]
|
||||
} else {
|
||||
converted = [0; AT_LEAST_MB_LEN_MAX];
|
||||
let len = unsafe {
|
||||
wcrtomb(
|
||||
std::ptr::addr_of_mut!(converted[0]).cast(),
|
||||
c as u32,
|
||||
&mut state,
|
||||
)
|
||||
};
|
||||
if len == 0_usize.wrapping_sub(1) {
|
||||
wcs2bytes_bad_char(c);
|
||||
state = zero_mbstate();
|
||||
} else if !func(&converted[..len]) {
|
||||
return false;
|
||||
}
|
||||
c.encode_utf8(&mut converted).as_bytes()
|
||||
};
|
||||
if !func(bytes) {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
true
|
||||
}
|
||||
|
||||
fn wcs2bytes_bad_char(c: char) {
|
||||
FLOGF!(
|
||||
char_encoding,
|
||||
L!("Wide character U+%4X has no narrow representation"),
|
||||
c
|
||||
);
|
||||
}
|
||||
|
||||
/// Split a string by runs of any of the separator characters provided in `seps`.
|
||||
/// Note the delimiters are the characters in `seps`, not `seps` itself.
|
||||
/// `seps` may contain the NUL character.
|
||||
|
||||
@@ -1,50 +0,0 @@
|
||||
extern "C" {
|
||||
#[cfg_attr(cygwin, link_name = "c32rtomb")]
|
||||
pub fn wcrtomb(s: *mut libc::c_char, wc: u32, ps: *mut mbstate_t) -> usize;
|
||||
#[cfg_attr(cygwin, link_name = "mbrtoc32")]
|
||||
pub fn mbrtowc(pwc: *mut u32, s: *const libc::c_char, n: usize, p: *mut mbstate_t) -> usize;
|
||||
}
|
||||
|
||||
// HACK This should be mbstate_t from libc but that's not exposed. Since it's only written by
|
||||
// libc, we define it as opaque type that should be large enough for all implementations.
|
||||
pub type mbstate_t = [u64; 16];
|
||||
|
||||
#[inline]
|
||||
pub fn zero_mbstate() -> mbstate_t {
|
||||
[0; 16]
|
||||
}
|
||||
|
||||
// HACK This should be the MB_LEN_MAX macro from libc but that's not easy to get.
|
||||
pub const AT_LEAST_MB_LEN_MAX: usize = 32;
|
||||
|
||||
/// Return true if we believe we are in a multibyte locale.
|
||||
/// Note this reads the current locale and is modestly expensive - prefer the cached
|
||||
/// values in `common.rs` which is set by `fish_setlocale`.
|
||||
pub fn probe_is_multibyte_locale() -> bool {
|
||||
// In general we would like to read MB_CUR_MAX, but that is not exposed by Rust libc.
|
||||
// Instead, check if mbrtowc for any byte in the range 0-255 returns (size_t)(-2) which indicates
|
||||
// the presence of a multibyte locale.
|
||||
#[inline]
|
||||
fn is_mb_lead(b: u8) -> bool {
|
||||
let mut st = zero_mbstate();
|
||||
let mut wc: libc::wchar_t = 0;
|
||||
let c = b as libc::c_char;
|
||||
let n = unsafe {
|
||||
mbrtowc(
|
||||
std::ptr::addr_of_mut!(wc).cast::<u32>(),
|
||||
std::ptr::addr_of!(c),
|
||||
1,
|
||||
std::ptr::addr_of_mut!(st),
|
||||
)
|
||||
};
|
||||
n == (-2_i64 as libc::size_t)
|
||||
}
|
||||
|
||||
// Fast path: check common lead bytes.
|
||||
if is_mb_lead(0xE0) || is_mb_lead(0xC2) {
|
||||
return true;
|
||||
}
|
||||
|
||||
// Scan non-ASCII high bytes.
|
||||
(0x80_u8..=0xFF).any(is_mb_lead)
|
||||
}
|
||||
@@ -1,5 +1,4 @@
|
||||
pub mod dir_iter;
|
||||
pub mod encoding;
|
||||
pub mod errors;
|
||||
pub mod fileid;
|
||||
pub mod gettext;
|
||||
|
||||
@@ -3,10 +3,6 @@
|
||||
# see #7934.
|
||||
#REQUIRES: test -z "$GITHUB_WORKFLOW"
|
||||
|
||||
# We typically try to force a utf8-capable locale,
|
||||
# this turns that off.
|
||||
set -gx fish_allow_singlebyte_locale 1
|
||||
|
||||
# A function to display bytes, necessary because GNU and BSD implementations of `od` have different output.
|
||||
# We used to use xxd, but it's not available everywhere. See #3797.
|
||||
#
|
||||
@@ -22,73 +18,12 @@ echo -n A\u00FCA | display_bytes
|
||||
#CHECK: 0000000 101 303 274 101
|
||||
#CHECK: 0000004
|
||||
|
||||
# Verify that exporting a change to the C locale produces the expected output.
|
||||
# The output should include the literal byte \xFC rather than the UTF-8 sequence for \u00FC.
|
||||
begin
|
||||
set -lx LC_ALL C
|
||||
echo -n B\u00FCB | display_bytes
|
||||
end
|
||||
#CHECK: 0000000 102 374 102
|
||||
#CHECK: 0000003
|
||||
|
||||
# Since the previous change was localized to a block it should no
|
||||
# longer be in effect and we should be back to a UTF-8 locale.
|
||||
echo -n C\u00FCC | display_bytes
|
||||
#CHECK: 0000000 103 303 274 103
|
||||
#CHECK: 0000004
|
||||
|
||||
# Verify that setting a non-exported locale var doesn't affect the behavior.
|
||||
# The output should include the UTF-8 sequence for \u00FC rather than that literal byte.
|
||||
# Just like the previous test.
|
||||
begin
|
||||
set -l LC_ALL C
|
||||
echo -n D\u00FCD | display_bytes
|
||||
end
|
||||
#CHECK: 0000000 104 303 274 104
|
||||
#CHECK: 0000004
|
||||
|
||||
# Verify that fish can pass through non-ASCII characters in the C/POSIX
|
||||
# locale. This is to prevent regression of
|
||||
# https://github.com/fish-shell/fish-shell/issues/2802.
|
||||
#
|
||||
# These tests are needed because the relevant standards allow the functions
|
||||
# mbrtowc() and wcrtomb() to treat bytes with the high bit set as either valid
|
||||
# or invalid in the C/POSIX locales. GNU libc treats those bytes as invalid.
|
||||
# Other libc implementations (e.g., BSD) treat them as valid. We want fish to
|
||||
# always treat those bytes as valid.
|
||||
|
||||
# The fish in the middle of the pipeline should be receiving a UTF-8 encoded
|
||||
# version of the unicode from the echo. It should pass those bytes through
|
||||
# literally since it is in the C locale. We verify this by first passing the
|
||||
# echo output directly to the `display_bytes` function then via a fish instance. The
|
||||
# output should be "58c3bb58" for the first statement and "58c3bc58" for the
|
||||
# second.
|
||||
echo -n X\u00FBX | display_bytes
|
||||
echo X\u00FCX | env LC_ALL=C $fish -c 'read foo; echo -n $foo' | display_bytes
|
||||
#CHECK: 0000000 130 303 273 130
|
||||
#CHECK: 0000004
|
||||
#CHECK: 0000000 130 303 274 130
|
||||
#CHECK: 0000004
|
||||
|
||||
# The next tests deliberately spawn another fish instance to test inheritance of env vars.
|
||||
|
||||
# This test is subtle. Despite the presence of the \u00fc unicode char (a "u"
|
||||
# with an umlaut) the fact the locale is C/POSIX will cause the \xfc byte to
|
||||
# be emitted rather than the usual UTF-8 sequence \xc3\xbc. That's because the
|
||||
# few single-byte unicode chars (that are not ASCII) are generally in the
|
||||
# ISO 8859-x char sets which are encompassed by the C locale. The output should
|
||||
# be "59fc59".
|
||||
env LC_ALL=C $fish -c 'echo -n Y\u00FCY' | display_bytes
|
||||
#CHECK: 0000000 131 374 131
|
||||
#CHECK: 0000003
|
||||
|
||||
# The user can specify a wide unicode character (one requiring more than a
|
||||
# single byte). In the C/POSIX locales we substitute a question-mark for the
|
||||
# unencodable wide char. The output should be "543f54".
|
||||
env LC_ALL=C $fish -c 'echo -n T\u01FDT' | display_bytes
|
||||
#CHECK: 0000000 124 077 124
|
||||
#CHECK: 0000003
|
||||
|
||||
string match ö \Xc3\Xb6
|
||||
#CHECK: ö
|
||||
|
||||
|
||||
Reference in New Issue
Block a user