builtin read: always handle out-of-range codepoints (Rust port regression)

As mentioned in
https://github.com/fish-shell/fish-shell/pull/9688#discussion_r1155089596,
commit b77d1d0e2b (Stop crashing on invalid Unicode input, 2024-02-27), Rust's
char type doesn't support arbitrary 32-bit values.  Out-of-range Unicode
codepoints would cause crashes.  That commit addressed this by converting
the encoded bytes (e.g. UTF-8) to special private-use-area characters that
fish knows about.  It didn't bother to update the code path in builtin read
that relies on mbrtowc as well.

Fix that. Move and rename parse_codepoint() and rename/reorder its input/output
parameters.

Fixes #11383

(cherry picked from commit d9ba27f58f)
This commit is contained in:
Johannes Altmanninger
2025-04-15 00:28:47 +02:00
parent 3191ac13e5
commit bc3e3ae029
3 changed files with 98 additions and 103 deletions

View File

@@ -12,8 +12,9 @@
use crate::env::Environment;
use crate::env::READ_BYTE_LIMIT;
use crate::env::{EnvVar, EnvVarFlags};
use crate::input_common::decode_input_byte;
use crate::input_common::terminal_protocols_disable_ifn;
use crate::libc::MB_CUR_MAX;
use crate::input_common::DecodeState;
use crate::nix::isatty;
use crate::reader::commandline_set_buffer;
use crate::reader::ReaderConfig;
@@ -23,7 +24,6 @@
use crate::wcstringutil::split_about;
use crate::wcstringutil::split_string_tok;
use crate::wutil;
use crate::wutil::encoding::mbrtowc;
use crate::wutil::encoding::zero_mbstate;
use crate::wutil::perror;
use libc::SEEK_CUR;
@@ -341,70 +341,61 @@ fn read_one_char_at_a_time(
split_null: bool,
) -> Option<c_int> {
let mut exit_res = STATUS_CMD_OK;
let mut eof = false;
let mut nbytes = 0;
let mut unconsumed = vec![];
loop {
let mut finished = false;
let mut res = '\x00';
let mut state = zero_mbstate();
while !finished {
let chars_read = buff.len();
let res = loop {
let mut b = [0_u8; 1];
match read_blocked(fd, &mut b) {
Ok(0) | Err(_) => {
eof = true;
break;
break None;
}
_ => {}
}
let b = b[0];
unconsumed.push(b);
nbytes += 1;
if MB_CUR_MAX() == 1 {
res = char::from(b);
finished = true;
} else {
let sz = unsafe {
mbrtowc(
std::ptr::addr_of_mut!(res).cast(),
std::ptr::addr_of!(b).cast(),
1,
&mut state,
)
} as isize;
if sz == -1 {
let mut consumed = 0;
match decode_input_byte(buff, &mut state, &unconsumed, &mut consumed) {
DecodeState::Incomplete => continue,
DecodeState::Complete => {
unconsumed.clear();
break Some(buff.as_char_slice().last().unwrap());
}
DecodeState::Error => {
state = zero_mbstate();
} else if sz != -2 {
finished = true;
unconsumed.clear();
}
}
}
};
if nbytes > READ_BYTE_LIMIT.load(Ordering::Relaxed) {
// Historical behavior: do not include the codepoint that made us overflow.
buff.truncate(chars_read);
exit_res = STATUS_READ_TOO_MUCH;
break;
}
if eof {
let Some(&res) = res else {
// EOF
if buff.is_empty() {
exit_res = STATUS_CMD_ERROR;
}
break;
};
if res == if split_null { '\0' } else { '\n' } {
buff.pop();
break;
}
if !split_null && res == '\n' {
break;
}
if split_null && res == '\0' {
break;
}
buff.push(res);
if nchars > 0 && nchars <= buff.len() {
break;
}
}
if buff.is_empty() && eof {
exit_res = STATUS_CMD_ERROR;
}
exit_res
}

View File

@@ -21,7 +21,6 @@
use crate::wutil::encoding::{mbrtowc, mbstate_t, zero_mbstate};
use crate::wutil::fish_wcstol;
use std::collections::VecDeque;
use std::ops::ControlFlow;
use std::os::fd::RawFd;
use std::os::unix::ffi::OsStrExt;
use std::ptr;
@@ -702,14 +701,14 @@ fn try_readch(&mut self, blocking: bool) -> Option<CharEvent> {
_ => 0,
});
}
match self.parse_codepoint(
&mut state,
match decode_input_byte(
&mut seq,
&mut state,
&buffer[..i + 1],
&mut consumed,
) {
ControlFlow::Continue(/*codepoint_complete=*/ false) => (),
ControlFlow::Continue(/*codepoint_complete=*/ true) => {
DecodeState::Incomplete => (),
DecodeState::Complete => {
if have_escape_prefix && i != 0 {
have_escape_prefix = false;
let c = seq.as_char_slice().last().unwrap();
@@ -719,7 +718,7 @@ fn try_readch(&mut self, blocking: bool) -> Option<CharEvent> {
break true;
}
}
ControlFlow::Break(()) => {
DecodeState::Error => {
self.push_front(CharEvent::from_check_exit());
break false;
}
@@ -794,65 +793,6 @@ fn parse_escape_sequence(
}
}
fn parse_codepoint(
&mut self,
state: &mut mbstate_t,
out_seq: &mut WString,
buffer: &[u8],
consumed: &mut usize,
) -> ControlFlow<(), bool> {
let mut res: char = '\0';
let read_byte = *buffer.last().unwrap();
if crate::libc::MB_CUR_MAX() == 1 {
// single-byte locale, all values are legal
// FIXME: this looks wrong, this falsely assumes that
// the single-byte locale is compatible with Unicode upper-ASCII.
res = read_byte.into();
out_seq.push(res);
return ControlFlow::Continue(true);
}
let mut codepoint = u32::from(res);
let sz = unsafe {
mbrtowc(
std::ptr::addr_of_mut!(codepoint).cast(),
std::ptr::addr_of!(read_byte).cast(),
1,
state,
)
} as isize;
match sz {
-1 => {
FLOG!(reader, "Illegal input");
*consumed += 1;
return ControlFlow::Break(());
}
-2 => {
// Sequence not yet complete.
return ControlFlow::Continue(false);
}
0 => {
// Actual nul char.
*consumed += 1;
out_seq.push('\0');
return ControlFlow::Continue(true);
}
_ => (),
}
if let Some(res) = char::from_u32(codepoint) {
// Sequence complete.
if !fish_reserved_codepoint(res) {
*consumed += 1;
out_seq.push(res);
return ControlFlow::Continue(true);
}
}
for &b in &buffer[*consumed..] {
out_seq.push(encode_byte_to_char(b));
*consumed += 1;
}
ControlFlow::Continue(true)
}
fn parse_csi(&mut self, buffer: &mut Vec<u8>) -> Option<Key> {
let mut next_char = |zelf: &mut Self| zelf.try_readb(buffer).unwrap_or(0xff);
// The maximum number of CSI parameters is defined by NPAR, nominally 16.
@@ -1322,6 +1262,65 @@ fn has_lookahead(&self) -> bool {
}
}
pub(crate) enum DecodeState {
Incomplete,
Complete,
Error,
}
pub(crate) fn decode_input_byte(
out_seq: &mut WString,
state: &mut mbstate_t,
buffer: &[u8],
consumed: &mut usize,
) -> DecodeState {
use DecodeState::*;
let mut res: char = '\0';
let read_byte = *buffer.last().unwrap();
if crate::libc::MB_CUR_MAX() == 1 {
// single-byte locale, all values are legal
// FIXME: this looks wrong, this falsely assumes that
// the single-byte locale is compatible with Unicode upper-ASCII.
res = read_byte.into();
out_seq.push(res);
return Complete;
}
let mut codepoint = u32::from(res);
match unsafe {
mbrtowc(
std::ptr::addr_of_mut!(codepoint).cast(),
std::ptr::addr_of!(read_byte).cast(),
1,
state,
)
} as isize
{
-1 => {
FLOG!(reader, "Illegal input");
*consumed += 1;
return Error;
}
-2 => {
// Sequence not yet complete.
return Incomplete;
}
_ => (),
}
if let Some(res) = char::from_u32(codepoint) {
// Sequence complete.
if !fish_reserved_codepoint(res) {
*consumed += 1;
out_seq.push(res);
return Complete;
}
}
for &b in &buffer[*consumed..] {
out_seq.push(encode_byte_to_char(b));
*consumed += 1;
}
Complete
}
/// A simple, concrete implementation of InputEventQueuer.
pub struct InputEventQueue {
data: InputData,