builtin read: always handle out-of-range codepoints (Rust port regression)

As mentioned in
https://github.com/fish-shell/fish-shell/pull/9688#discussion_r1155089596,
commit b77d1d0e2b (Stop crashing on invalid Unicode input, 2024-02-27), Rust's
char type doesn't support arbitrary 32-bit values.  Out-of-range Unicode
codepoints would cause crashes.  That commit addressed this by converting
the encoded bytes (e.g. UTF-8) to special private-use-area characters that
fish knows about.  It didn't bother to update the code path in builtin read
that relies on mbrtowc as well.

Fix that. Move and rename parse_codepoint() and rename/reorder its input/output
parameters.

Note that the behavior is still wrong if builtin read can't decode the
input; see the next commit.

Fixes #11383
This commit is contained in:
Johannes Altmanninger
2025-04-15 00:28:47 +02:00
parent b061178606
commit d9ba27f58f
3 changed files with 98 additions and 97 deletions

View File

@@ -28,7 +28,6 @@
use crate::wutil::encoding::{mbrtowc, mbstate_t, zero_mbstate};
use crate::wutil::fish_wcstol;
use std::collections::VecDeque;
use std::ops::ControlFlow;
use std::os::fd::RawFd;
use std::os::unix::ffi::OsStrExt;
use std::ptr;
@@ -872,14 +871,14 @@ fn try_readch(&mut self, blocking: bool) -> Option<CharEvent> {
_ => 0,
});
}
match self.parse_codepoint(
&mut state,
match decode_input_byte(
&mut seq,
&mut state,
&buffer[..i + 1],
&mut consumed,
) {
ControlFlow::Continue(/*codepoint_complete=*/ false) => (),
ControlFlow::Continue(/*codepoint_complete=*/ true) => {
DecodeState::Incomplete => (),
DecodeState::Complete => {
if have_escape_prefix && i != 0 {
have_escape_prefix = false;
let c = seq.as_char_slice().last().unwrap();
@@ -889,7 +888,7 @@ fn try_readch(&mut self, blocking: bool) -> Option<CharEvent> {
break true;
}
}
ControlFlow::Break(()) => {
DecodeState::Error => {
self.push_front(CharEvent::from_check_exit());
break false;
}
@@ -1002,59 +1001,6 @@ fn parse_escape_sequence(
}
}
fn parse_codepoint(
&mut self,
state: &mut mbstate_t,
out_seq: &mut WString,
buffer: &[u8],
consumed: &mut usize,
) -> ControlFlow<(), bool> {
let mut res: char = '\0';
let read_byte = *buffer.last().unwrap();
if crate::libc::MB_CUR_MAX() == 1 {
// single-byte locale, all values are legal
// FIXME: this looks wrong, this falsely assumes that
// the single-byte locale is compatible with Unicode upper-ASCII.
res = read_byte.into();
out_seq.push(res);
return ControlFlow::Continue(true);
}
let mut codepoint = u32::from(res);
match unsafe {
mbrtowc(
std::ptr::addr_of_mut!(codepoint),
std::ptr::addr_of!(read_byte).cast(),
1,
state,
)
} as isize
{
-1 => {
FLOG!(reader, "Illegal input");
*consumed += 1;
return ControlFlow::Break(());
}
-2 => {
// Sequence not yet complete.
return ControlFlow::Continue(false);
}
_ => (),
}
if let Some(res) = char::from_u32(codepoint) {
// Sequence complete.
if !fish_reserved_codepoint(res) {
*consumed += 1;
out_seq.push(res);
return ControlFlow::Continue(true);
}
}
for &b in &buffer[*consumed..] {
out_seq.push(encode_byte_to_char(b));
*consumed += 1;
}
ControlFlow::Continue(true)
}
fn parse_csi(&mut self, buffer: &mut Vec<u8>) -> Option<KeyEvent> {
// The maximum number of CSI parameters is defined by NPAR, nominally 16.
let mut params = [[0_u32; 4]; 16];
@@ -1741,6 +1687,65 @@ fn has_lookahead(&self) -> bool {
}
}
pub(crate) enum DecodeState {
Incomplete,
Complete,
Error,
}
pub(crate) fn decode_input_byte(
out_seq: &mut WString,
state: &mut mbstate_t,
buffer: &[u8],
consumed: &mut usize,
) -> DecodeState {
use DecodeState::*;
let mut res: char = '\0';
let read_byte = *buffer.last().unwrap();
if crate::libc::MB_CUR_MAX() == 1 {
// single-byte locale, all values are legal
// FIXME: this looks wrong, this falsely assumes that
// the single-byte locale is compatible with Unicode upper-ASCII.
res = read_byte.into();
out_seq.push(res);
return Complete;
}
let mut codepoint = u32::from(res);
match unsafe {
mbrtowc(
std::ptr::addr_of_mut!(codepoint),
std::ptr::addr_of!(read_byte).cast(),
1,
state,
)
} as isize
{
-1 => {
FLOG!(reader, "Illegal input");
*consumed += 1;
return Error;
}
-2 => {
// Sequence not yet complete.
return Incomplete;
}
_ => (),
}
if let Some(res) = char::from_u32(codepoint) {
// Sequence complete.
if !fish_reserved_codepoint(res) {
*consumed += 1;
out_seq.push(res);
return Complete;
}
}
for &b in &buffer[*consumed..] {
out_seq.push(encode_byte_to_char(b));
*consumed += 1;
}
Complete
}
pub(crate) fn unblock_input(mut wait_guard: MutexGuard<Option<BlockingWait>>) -> bool {
if wait_guard.is_none() {
return false;