From d9ba27f58fedf96d5737e6ef6c454b76a0fbc2ce Mon Sep 17 00:00:00 2001 From: Johannes Altmanninger Date: Tue, 15 Apr 2025 00:28:47 +0200 Subject: [PATCH] builtin read: always handle out-of-range codepoints (Rust port regression) As mentioned in https://github.com/fish-shell/fish-shell/pull/9688#discussion_r1155089596, commit b77d1d0e2be (Stop crashing on invalid Unicode input, 2024-02-27), Rust's char type doesn't support arbitrary 32-bit values. Out-of-range Unicode codepoints would cause crashes. That commit addressed this by converting the encoded bytes (e.g. UTF-8) to special private-use-area characters that fish knows about. It didn't bother to update the code path in builtin read that relies on mbrtowc as well. Fix that. Move and rename parse_codepoint() and rename/reorder its input/output parameters. Note that the behavior is still wrong if builtin read can't decode the input; see the next commit. Fixes #11383 --- src/builtins/read.rs | 67 ++++++++++------------ src/input_common.rs | 123 +++++++++++++++++++++-------------------- tests/checks/read.fish | 5 ++ 3 files changed, 98 insertions(+), 97 deletions(-) diff --git a/src/builtins/read.rs b/src/builtins/read.rs index 24b7cb9f5..5a53f3dc3 100644 --- a/src/builtins/read.rs +++ b/src/builtins/read.rs @@ -11,8 +11,9 @@ use crate::env::Environment; use crate::env::READ_BYTE_LIMIT; use crate::env::{EnvVar, EnvVarFlags}; +use crate::input_common::decode_input_byte; use crate::input_common::terminal_protocols_disable_ifn; -use crate::libc::MB_CUR_MAX; +use crate::input_common::DecodeState; use crate::nix::isatty; use crate::reader::commandline_set_buffer; use crate::reader::ReaderConfig; @@ -23,7 +24,6 @@ use crate::wcstringutil::split_about; use crate::wcstringutil::split_string_tok; use crate::wutil; -use crate::wutil::encoding::mbrtowc; use crate::wutil::encoding::zero_mbstate; use crate::wutil::perror; use libc::SEEK_CUR; @@ -338,70 +338,61 @@ fn read_one_char_at_a_time( split_null: bool, ) -> BuiltinResult { let mut exit_res = Ok(SUCCESS); - let mut eof = false; let mut nbytes = 0; + let mut unconsumed = vec![]; + loop { - let mut finished = false; - let mut res = '\x00'; let mut state = zero_mbstate(); - while !finished { + let chars_read = buff.len(); + let res = loop { let mut b = [0_u8; 1]; match read_blocked(fd, &mut b) { Ok(0) | Err(_) => { - eof = true; - break; + break None; } _ => {} } let b = b[0]; - + unconsumed.push(b); nbytes += 1; - if MB_CUR_MAX() == 1 { - res = char::from(b); - finished = true; - } else { - let sz = unsafe { - mbrtowc( - std::ptr::addr_of_mut!(res).cast(), - std::ptr::addr_of!(b).cast(), - 1, - &mut state, - ) - } as isize; - if sz == -1 { + let mut consumed = 0; + match decode_input_byte(buff, &mut state, &unconsumed, &mut consumed) { + DecodeState::Incomplete => continue, + DecodeState::Complete => { + unconsumed.clear(); + break Some(buff.as_char_slice().last().unwrap()); + } + DecodeState::Error => { state = zero_mbstate(); - } else if sz != -2 { - finished = true; + unconsumed.clear(); } } - } + }; if nbytes > READ_BYTE_LIMIT.load(Ordering::Relaxed) { + // Historical behavior: do not include the codepoint that made us overflow. + buff.truncate(chars_read); exit_res = Err(STATUS_READ_TOO_MUCH); break; } - if eof { + let Some(&res) = res else { + // EOF + if buff.is_empty() { + exit_res = Err(STATUS_CMD_ERROR); + } + break; + }; + if res == if split_null { '\0' } else { '\n' } { + buff.pop(); break; } - if !split_null && res == '\n' { - break; - } - if split_null && res == '\0' { - break; - } - - buff.push(res); if nchars > 0 && nchars <= buff.len() { break; } } - if buff.is_empty() && eof { - exit_res = Err(STATUS_CMD_ERROR); - } - exit_res } diff --git a/src/input_common.rs b/src/input_common.rs index a0dce0cd0..0a24110c3 100644 --- a/src/input_common.rs +++ b/src/input_common.rs @@ -28,7 +28,6 @@ use crate::wutil::encoding::{mbrtowc, mbstate_t, zero_mbstate}; use crate::wutil::fish_wcstol; use std::collections::VecDeque; -use std::ops::ControlFlow; use std::os::fd::RawFd; use std::os::unix::ffi::OsStrExt; use std::ptr; @@ -872,14 +871,14 @@ fn try_readch(&mut self, blocking: bool) -> Option { _ => 0, }); } - match self.parse_codepoint( - &mut state, + match decode_input_byte( &mut seq, + &mut state, &buffer[..i + 1], &mut consumed, ) { - ControlFlow::Continue(/*codepoint_complete=*/ false) => (), - ControlFlow::Continue(/*codepoint_complete=*/ true) => { + DecodeState::Incomplete => (), + DecodeState::Complete => { if have_escape_prefix && i != 0 { have_escape_prefix = false; let c = seq.as_char_slice().last().unwrap(); @@ -889,7 +888,7 @@ fn try_readch(&mut self, blocking: bool) -> Option { break true; } } - ControlFlow::Break(()) => { + DecodeState::Error => { self.push_front(CharEvent::from_check_exit()); break false; } @@ -1002,59 +1001,6 @@ fn parse_escape_sequence( } } - fn parse_codepoint( - &mut self, - state: &mut mbstate_t, - out_seq: &mut WString, - buffer: &[u8], - consumed: &mut usize, - ) -> ControlFlow<(), bool> { - let mut res: char = '\0'; - let read_byte = *buffer.last().unwrap(); - if crate::libc::MB_CUR_MAX() == 1 { - // single-byte locale, all values are legal - // FIXME: this looks wrong, this falsely assumes that - // the single-byte locale is compatible with Unicode upper-ASCII. - res = read_byte.into(); - out_seq.push(res); - return ControlFlow::Continue(true); - } - let mut codepoint = u32::from(res); - match unsafe { - mbrtowc( - std::ptr::addr_of_mut!(codepoint), - std::ptr::addr_of!(read_byte).cast(), - 1, - state, - ) - } as isize - { - -1 => { - FLOG!(reader, "Illegal input"); - *consumed += 1; - return ControlFlow::Break(()); - } - -2 => { - // Sequence not yet complete. - return ControlFlow::Continue(false); - } - _ => (), - } - if let Some(res) = char::from_u32(codepoint) { - // Sequence complete. - if !fish_reserved_codepoint(res) { - *consumed += 1; - out_seq.push(res); - return ControlFlow::Continue(true); - } - } - for &b in &buffer[*consumed..] { - out_seq.push(encode_byte_to_char(b)); - *consumed += 1; - } - ControlFlow::Continue(true) - } - fn parse_csi(&mut self, buffer: &mut Vec) -> Option { // The maximum number of CSI parameters is defined by NPAR, nominally 16. let mut params = [[0_u32; 4]; 16]; @@ -1741,6 +1687,65 @@ fn has_lookahead(&self) -> bool { } } +pub(crate) enum DecodeState { + Incomplete, + Complete, + Error, +} + +pub(crate) fn decode_input_byte( + out_seq: &mut WString, + state: &mut mbstate_t, + buffer: &[u8], + consumed: &mut usize, +) -> DecodeState { + use DecodeState::*; + let mut res: char = '\0'; + let read_byte = *buffer.last().unwrap(); + if crate::libc::MB_CUR_MAX() == 1 { + // single-byte locale, all values are legal + // FIXME: this looks wrong, this falsely assumes that + // the single-byte locale is compatible with Unicode upper-ASCII. + res = read_byte.into(); + out_seq.push(res); + return Complete; + } + let mut codepoint = u32::from(res); + match unsafe { + mbrtowc( + std::ptr::addr_of_mut!(codepoint), + std::ptr::addr_of!(read_byte).cast(), + 1, + state, + ) + } as isize + { + -1 => { + FLOG!(reader, "Illegal input"); + *consumed += 1; + return Error; + } + -2 => { + // Sequence not yet complete. + return Incomplete; + } + _ => (), + } + if let Some(res) = char::from_u32(codepoint) { + // Sequence complete. + if !fish_reserved_codepoint(res) { + *consumed += 1; + out_seq.push(res); + return Complete; + } + } + for &b in &buffer[*consumed..] { + out_seq.push(encode_byte_to_char(b)); + *consumed += 1; + } + Complete +} + pub(crate) fn unblock_input(mut wait_guard: MutexGuard>) -> bool { if wait_guard.is_none() { return false; diff --git a/tests/checks/read.fish b/tests/checks/read.fish index f1ea2f898..bc96e3074 100644 --- a/tests/checks/read.fish +++ b/tests/checks/read.fish @@ -426,3 +426,8 @@ set -S var echo '1 {} "{}"' | read -lat var echo $var # CHECK: 1 {} {} + +printf \xf9\x98\xb1\x83\x8b | read -z out_of_range_codepoint +set -S out_of_range_codepoint +# CHECK: $out_of_range_codepoint: set in global scope, unexported, with 1 elements +# CHECK: $out_of_range_codepoint[1]: |\Xf9\X98\Xb1\X83\X8b|