From 028429239274aeb59fdd6a4a4fddcbf5bb69126c Mon Sep 17 00:00:00 2001 From: Johannes Altmanninger Date: Tue, 15 Apr 2025 09:21:11 +0200 Subject: [PATCH] builtin read to pass through invalid UTF-8; reader to ignore invalid codepoints Two issues: 1. typing the codepoint 0x123456 into fish_key_reader: $ fish_key_reader -cV # decoded from: \xf4\xa3\x91 bind \xf4 'do something' # decoded from: bind \xa3 'do something' # decoded from: bind \x91 'do something' The invalid codepoint is represented in its original encoding, which leaks to the UI. This was more or less intentionally added by b77d1d0e2be (Stop crashing on invalid Unicode input, 2024-02-27). That commit rendered it as replacement byte, but that was removed for other reasons in e25a1358e67 (Work around broken rendering of pasted multibyte chars in non-UTF-8-ish locale, 2024-08-03). We no longer insert such (PUA) codepoints into the commandline. The "bind" comes above would work however. I don't think this is something we want to support. Discard invalid codepoints in the reader, so they can't be bound and fish_key_reader shows nothing. 2. builtin read silently drops invalid encodings This builtin is not really suited to read binary data (#11383 is an error scenario), but I guess it can be bent to do that. Some of its code paths use str2wcstring which passes through e.g. invalid UTF-8. The read-one-char-at-a-time code path doesn't. Fix this. --- src/builtins/read.rs | 14 +++++++++----- src/input_common.rs | 31 +++++++++++++++++++++++-------- tests/checks/read.fish | 4 ++++ 3 files changed, 36 insertions(+), 13 deletions(-) diff --git a/src/builtins/read.rs b/src/builtins/read.rs index 5a53f3dc3..b386cc8f4 100644 --- a/src/builtins/read.rs +++ b/src/builtins/read.rs @@ -14,6 +14,7 @@ use crate::input_common::decode_input_byte; use crate::input_common::terminal_protocols_disable_ifn; use crate::input_common::DecodeState; +use crate::input_common::InvalidPolicy; use crate::nix::isatty; use crate::reader::commandline_set_buffer; use crate::reader::ReaderConfig; @@ -358,16 +359,19 @@ fn read_one_char_at_a_time( unconsumed.push(b); nbytes += 1; let mut consumed = 0; - match decode_input_byte(buff, &mut state, &unconsumed, &mut consumed) { + match decode_input_byte( + buff, + InvalidPolicy::Passthrough, + &mut state, + &unconsumed, + &mut consumed, + ) { DecodeState::Incomplete => continue, DecodeState::Complete => { unconsumed.clear(); break Some(buff.as_char_slice().last().unwrap()); } - DecodeState::Error => { - state = zero_mbstate(); - unconsumed.clear(); - } + DecodeState::Error => unreachable!(), } }; diff --git a/src/input_common.rs b/src/input_common.rs index 0a24110c3..b614250f4 100644 --- a/src/input_common.rs +++ b/src/input_common.rs @@ -873,6 +873,7 @@ fn try_readch(&mut self, blocking: bool) -> Option { } match decode_input_byte( &mut seq, + InvalidPolicy::Error, &mut state, &buffer[..i + 1], &mut consumed, @@ -1693,8 +1694,15 @@ pub(crate) enum DecodeState { Error, } +#[derive(Eq, PartialEq)] +pub(crate) enum InvalidPolicy { + Error, + Passthrough, +} + pub(crate) fn decode_input_byte( out_seq: &mut WString, + invalid_policy: InvalidPolicy, state: &mut mbstate_t, buffer: &[u8], consumed: &mut usize, @@ -1710,6 +1718,19 @@ pub(crate) fn decode_input_byte( out_seq.push(res); return Complete; } + let mut invalid = |out_seq: &mut WString, log_error: fn()| match invalid_policy { + InvalidPolicy::Error => { + (log_error)(); + Error + } + InvalidPolicy::Passthrough => { + for &b in &buffer[*consumed..] { + out_seq.push(encode_byte_to_char(b)); + } + *consumed = buffer.len(); + Complete + } + }; let mut codepoint = u32::from(res); match unsafe { mbrtowc( @@ -1721,9 +1742,7 @@ pub(crate) fn decode_input_byte( } as isize { -1 => { - FLOG!(reader, "Illegal input"); - *consumed += 1; - return Error; + return invalid(out_seq, || FLOG!(reader, "Illegal input encoding")); } -2 => { // Sequence not yet complete. @@ -1739,11 +1758,7 @@ pub(crate) fn decode_input_byte( return Complete; } } - for &b in &buffer[*consumed..] { - out_seq.push(encode_byte_to_char(b)); - *consumed += 1; - } - Complete + invalid(out_seq, || FLOG!(reader, "Illegal codepoint")) } pub(crate) fn unblock_input(mut wait_guard: MutexGuard>) -> bool { diff --git a/tests/checks/read.fish b/tests/checks/read.fish index bc96e3074..c3e087d94 100644 --- a/tests/checks/read.fish +++ b/tests/checks/read.fish @@ -431,3 +431,7 @@ printf \xf9\x98\xb1\x83\x8b | read -z out_of_range_codepoint set -S out_of_range_codepoint # CHECK: $out_of_range_codepoint: set in global scope, unexported, with 1 elements # CHECK: $out_of_range_codepoint[1]: |\Xf9\X98\Xb1\X83\X8b| + +printf \xff | { read invalid_utf8; set -S invalid_utf8 } +# CHECK: $invalid_utf8: set in global scope, unexported, with 1 elements +# CHECK: $invalid_utf8[1]: |\Xff|