Extract MoveWordStateMachine to a reader submodule

This has a very small dependency on the tokenizer.
This commit is contained in:
Johannes Altmanninger
2026-01-11 11:09:19 +01:00
parent 042117ee30
commit bd86f53b9f
4 changed files with 341 additions and 331 deletions

View File

@@ -5,4 +5,6 @@
#[allow(clippy::module_inception)]
pub mod reader;
mod word_motion;
pub use reader::*;

View File

@@ -49,6 +49,7 @@
use super::history_search::{ReaderHistorySearch, SearchMode, smartcase_flags};
use super::iothreads::{self, Debouncers};
use super::word_motion::{MoveWordStateMachine, MoveWordStyle};
use crate::abbrs::abbrs_match;
use crate::ast::{self, Kind, is_same_node};
use crate::builtins::shared::ErrorCode;
@@ -143,8 +144,7 @@
use crate::tokenizer::quote_end;
use crate::tokenizer::variable_assignment_equals_pos;
use crate::tokenizer::{
MoveWordStateMachine, MoveWordStyle, TOK_ACCEPT_UNFINISHED, TOK_SHOW_COMMENTS, TokenType,
Tokenizer, tok_command,
TOK_ACCEPT_UNFINISHED, TOK_SHOW_COMMENTS, TokenType, Tokenizer, tok_command,
};
use crate::tty_handoff::SCROLL_CONTENT_UP_TERMINFO_CODE;
use crate::tty_handoff::XTGETTCAP_QUERY_OS_NAME;

335
src/reader/word_motion.rs Normal file
View File

@@ -0,0 +1,335 @@
use crate::tokenizer::tok_is_string_character;
use crate::prelude::*;
use crate::reader::is_backslashed;
pub enum MoveWordStyle {
/// stop at punctuation
Punctuation,
/// stops at path components
PathComponents,
/// stops at whitespace
Whitespace,
}
/// Our state machine that implements "one word" movement or erasure.
pub struct MoveWordStateMachine {
state: u8,
style: MoveWordStyle,
}
impl MoveWordStateMachine {
pub fn new(style: MoveWordStyle) -> Self {
MoveWordStateMachine { state: 0, style }
}
pub fn consume_char(&mut self, text: &wstr, idx: usize) -> bool {
let c = text.as_char_slice()[idx];
match self.style {
MoveWordStyle::Punctuation => self.consume_char_punctuation(c),
MoveWordStyle::PathComponents => self.consume_char_path_components(text, idx, c),
MoveWordStyle::Whitespace => self.consume_char_whitespace(c),
}
}
#[cfg(test)]
fn reset(&mut self) {
self.state = 0;
}
fn consume_char_punctuation(&mut self, c: char) -> bool {
const S_ALWAYS_ONE: u8 = 0;
const S_REST: u8 = 1;
const S_WHITESPACE_REST: u8 = 2;
const S_WHITESPACE: u8 = 3;
const S_ALPHANUMERIC: u8 = 4;
const S_END: u8 = 5;
let mut consumed = false;
while self.state != S_END && !consumed {
match self.state {
S_ALWAYS_ONE => {
// Always consume the first character.
consumed = true;
if c.is_whitespace() {
self.state = S_WHITESPACE;
} else if c.is_alphanumeric() {
self.state = S_ALPHANUMERIC;
} else {
// Don't allow switching type (ws->nonws) after non-whitespace and
// non-alphanumeric.
self.state = S_REST;
}
}
S_REST => {
if c.is_whitespace() {
// Consume only trailing whitespace.
self.state = S_WHITESPACE_REST;
} else if c.is_alphanumeric() {
// Consume only alnums.
self.state = S_ALPHANUMERIC;
} else {
consumed = false;
self.state = S_END;
}
}
S_WHITESPACE_REST | S_WHITESPACE => {
// "whitespace" consumes whitespace and switches to alnums,
// "whitespace_rest" only consumes whitespace.
if c.is_whitespace() {
// Consumed whitespace.
consumed = true;
} else {
self.state = if self.state == S_WHITESPACE {
S_ALPHANUMERIC
} else {
S_END
};
}
}
S_ALPHANUMERIC => {
if c.is_alphanumeric() {
consumed = true; // consumed alphanumeric
} else {
self.state = S_END;
}
}
_ => {}
}
}
consumed
}
fn consume_char_path_components(&mut self, s: &wstr, idx: usize, c: char) -> bool {
const S_INITIAL_PUNCTUATION: u8 = 0;
const S_WHITESPACE: u8 = 1;
const S_SEPARATOR: u8 = 2;
const S_SLASH: u8 = 3;
const S_PATH_COMPONENT_CHARACTERS: u8 = 4;
const S_INITIAL_SEPARATOR: u8 = 5;
const S_END: u8 = 6;
let is_escaped = is_backslashed(s, idx);
let is_whitespace = c.is_whitespace() && !is_escaped;
let is_path_component_character =
is_path_component_character(c) || (c.is_whitespace() && is_escaped);
let mut consumed = false;
while self.state != S_END && !consumed {
match self.state {
S_INITIAL_PUNCTUATION => {
if !is_path_component_character && !is_whitespace {
self.state = S_INITIAL_SEPARATOR;
} else {
if !is_path_component_character {
consumed = true;
}
self.state = S_WHITESPACE;
}
}
S_WHITESPACE => {
if is_whitespace {
consumed = true; // consumed whitespace
} else if c == '/' || is_path_component_character {
self.state = S_SLASH; // path component
} else {
self.state = S_SEPARATOR; // path separator
}
}
S_SEPARATOR => {
if !is_whitespace && !is_path_component_character {
consumed = true; // consumed separator
} else {
self.state = S_END;
}
}
S_SLASH => {
if c == '/' {
consumed = true; // consumed slash
} else {
self.state = S_PATH_COMPONENT_CHARACTERS;
}
}
S_PATH_COMPONENT_CHARACTERS => {
if is_path_component_character {
consumed = true; // consumed string character except slash
} else {
self.state = S_END;
}
}
S_INITIAL_SEPARATOR => {
if is_path_component_character {
consumed = true;
self.state = S_PATH_COMPONENT_CHARACTERS;
} else if is_whitespace {
self.state = S_END;
} else {
consumed = true;
}
}
_ => {}
}
}
consumed
}
fn consume_char_whitespace(&mut self, c: char) -> bool {
// Consume a "word" of printable characters plus any leading whitespace.
const S_ALWAYS_ONE: u8 = 0;
const S_BLANK: u8 = 1;
const S_GRAPH: u8 = 2;
const S_END: u8 = 3;
let mut consumed = false;
while self.state != S_END && !consumed {
match self.state {
S_ALWAYS_ONE => {
// always consume the first character
// If it's not whitespace, only consume those from here.
consumed = true;
if !c.is_whitespace() {
self.state = S_GRAPH;
} else {
// If it's whitespace, keep consuming whitespace until the graphs.
self.state = S_BLANK;
}
}
S_BLANK => {
if c.is_whitespace() {
// consumed whitespace
consumed = true;
} else {
self.state = S_GRAPH;
}
}
S_GRAPH => {
if !c.is_whitespace() {
// consumed printable non-space
consumed = true;
} else {
self.state = S_END;
}
}
_ => {}
}
}
consumed
}
}
fn is_path_component_character(c: char) -> bool {
tok_is_string_character(c, None) && !L!("/={,}'\":@#").as_char_slice().contains(&c)
}
#[cfg(test)]
mod tests {
use std::collections::HashSet;
use super::{MoveWordStateMachine, MoveWordStyle};
use crate::prelude::*;
/// Test word motion (forward-word, etc.). Carets represent cursor stops.
#[test]
fn test_word_motion() {
#[derive(Eq, PartialEq)]
pub enum Direction {
Left,
Right,
}
fn validate_visitor(
direction: Direction,
style: MoveWordStyle,
line: &str,
on_failure: fn(&str),
) {
let mut command = WString::new();
let mut stops = HashSet::new();
// Carets represent stops and should be cut out of the command.
for c in line.chars() {
if c == '^' {
stops.insert(command.len());
} else {
command.push(c);
}
}
let (mut idx, end) = if direction == Direction::Left {
(*stops.iter().max().unwrap(), 0)
} else {
(*stops.iter().min().unwrap(), command.len())
};
stops.remove(&idx);
let mut sm = MoveWordStateMachine::new(style);
while idx != end {
let char_idx = if direction == Direction::Left {
idx - 1
} else {
idx
};
let will_stop = !sm.consume_char(&command, char_idx);
let expected_stop = stops.contains(&idx);
if will_stop != expected_stop {
on_failure(&format!(
"Expected to stop={expected_stop} at index {idx} but got stop={will_stop}. String: {command:?}"
));
}
// We don't expect to stop here next time.
if expected_stop {
stops.remove(&idx);
sm.reset();
} else if direction == Direction::Left {
idx -= 1;
} else {
idx += 1;
}
}
}
macro_rules! validate {
($direction:expr, $style:expr, $line:expr) => {
validate_visitor($direction, $style, $line, |error| assert!(false, "{error}"));
};
}
use Direction::*;
use MoveWordStyle::*;
// PathComponents tests
validate!(
Left,
PathComponents,
"^echo ^/^foo/^bar{^aaa,^bbb,^ccc}^bak/^"
);
validate!(Left, PathComponents, "^echo ^bak ^///^");
validate!(Left, PathComponents, "^aaa ^@ ^@^aaa^");
validate!(Left, PathComponents, "^aaa ^a ^@^aaa^");
validate!(Left, PathComponents, "^aaa ^@@@ ^@@^aa^");
validate!(Left, PathComponents, "^aa^@@ ^aa@@^a^");
validate!(Left, PathComponents, r#"^a\ ^b\ c/^d"^e\ f"^g"#);
validate!(Left, PathComponents, r#"^a\"^bc^"#);
validate!(Left, PathComponents, "^/^foo/^bar/^baz/^");
validate!(Left, PathComponents, "^echo ^--foo ^--bar^");
validate!(Left, PathComponents, "^echo ^hi ^> ^/^dev/^null^");
// General punctuation tests
validate!(Right, Punctuation, "^a^ bcd^");
validate!(Right, Punctuation, "a^b^ cde^");
validate!(Right, Punctuation, "^ab^ cde^");
validate!(Right, Punctuation, "^ab^&cd^ ^& ^e^ f^&");
validate!(Left, Punctuation, "^echo ^hello_^world.^txt^");
validate!(Right, Punctuation, "^echo^ hello^_world^.txt^");
validate!(Left, Punctuation, "echo ^foo_^foo_^foo/^/^/^/^/^ ^");
validate!(Right, Punctuation, "^echo^ foo^_foo^_foo^/^/^/^/^/ ^");
// General whitespace tests
validate!(Right, Whitespace, "^^a-b-c^ d-e-f");
validate!(Right, Whitespace, "^a-b-c^\n d-e-f^ ");
validate!(Right, Whitespace, "^a-b-c^\n\nd-e-f^ ");
}
}

View File

@@ -7,7 +7,6 @@
use crate::parse_constants::SOURCE_OFFSET_INVALID;
use crate::parser_keywords::parser_keywords_is_subcommand;
use crate::prelude::*;
use crate::reader::is_backslashed;
use crate::redirection::RedirectionMode;
use libc::{STDIN_FILENO, STDOUT_FILENO};
use nix::fcntl::OFlag;
@@ -103,21 +102,6 @@ pub struct PipeOrRedir {
pub consumed: usize,
}
pub enum MoveWordStyle {
/// stop at punctuation
Punctuation,
/// stops at path components
PathComponents,
/// stops at whitespace
Whitespace,
}
/// Our state machine that implements "one word" movement or erasure.
pub struct MoveWordStateMachine {
state: u8,
style: MoveWordStyle,
}
#[derive(Clone, Copy)]
pub struct TokFlags(pub u8);
@@ -893,7 +877,7 @@ pub fn comment_end(s: &wstr, mut pos: usize) -> usize {
/// Tests if this character can be a part of a string. Hash (#) starts a comment if it's the first
/// character in a token; otherwise it is considered a string character. See issue #953.
fn tok_is_string_character(c: char, next: Option<char>) -> bool {
pub fn tok_is_string_character(c: char, next: Option<char>) -> bool {
match c {
// Unconditional separators.
'\0' | ' ' | '\n' | '|' | '\t' | ';' | '\r' | '<' | '>' => false,
@@ -1174,208 +1158,6 @@ fn parse_fd(s: &wstr) -> RawFd {
s.parse().unwrap_or(-1)
}
impl MoveWordStateMachine {
pub fn new(style: MoveWordStyle) -> Self {
MoveWordStateMachine { state: 0, style }
}
pub fn consume_char(&mut self, text: &wstr, idx: usize) -> bool {
let c = text.as_char_slice()[idx];
match self.style {
MoveWordStyle::Punctuation => self.consume_char_punctuation(c),
MoveWordStyle::PathComponents => self.consume_char_path_components(text, idx, c),
MoveWordStyle::Whitespace => self.consume_char_whitespace(c),
}
}
pub fn reset(&mut self) {
self.state = 0;
}
fn consume_char_punctuation(&mut self, c: char) -> bool {
const S_ALWAYS_ONE: u8 = 0;
const S_REST: u8 = 1;
const S_WHITESPACE_REST: u8 = 2;
const S_WHITESPACE: u8 = 3;
const S_ALPHANUMERIC: u8 = 4;
const S_END: u8 = 5;
let mut consumed = false;
while self.state != S_END && !consumed {
match self.state {
S_ALWAYS_ONE => {
// Always consume the first character.
consumed = true;
if c.is_whitespace() {
self.state = S_WHITESPACE;
} else if c.is_alphanumeric() {
self.state = S_ALPHANUMERIC;
} else {
// Don't allow switching type (ws->nonws) after non-whitespace and
// non-alphanumeric.
self.state = S_REST;
}
}
S_REST => {
if c.is_whitespace() {
// Consume only trailing whitespace.
self.state = S_WHITESPACE_REST;
} else if c.is_alphanumeric() {
// Consume only alnums.
self.state = S_ALPHANUMERIC;
} else {
consumed = false;
self.state = S_END;
}
}
S_WHITESPACE_REST | S_WHITESPACE => {
// "whitespace" consumes whitespace and switches to alnums,
// "whitespace_rest" only consumes whitespace.
if c.is_whitespace() {
// Consumed whitespace.
consumed = true;
} else {
self.state = if self.state == S_WHITESPACE {
S_ALPHANUMERIC
} else {
S_END
};
}
}
S_ALPHANUMERIC => {
if c.is_alphanumeric() {
consumed = true; // consumed alphanumeric
} else {
self.state = S_END;
}
}
_ => {}
}
}
consumed
}
fn consume_char_path_components(&mut self, s: &wstr, idx: usize, c: char) -> bool {
const S_INITIAL_PUNCTUATION: u8 = 0;
const S_WHITESPACE: u8 = 1;
const S_SEPARATOR: u8 = 2;
const S_SLASH: u8 = 3;
const S_PATH_COMPONENT_CHARACTERS: u8 = 4;
const S_INITIAL_SEPARATOR: u8 = 5;
const S_END: u8 = 6;
let is_escaped = is_backslashed(s, idx);
let is_whitespace = c.is_whitespace() && !is_escaped;
let is_path_component_character =
is_path_component_character(c) || (c.is_whitespace() && is_escaped);
let mut consumed = false;
while self.state != S_END && !consumed {
match self.state {
S_INITIAL_PUNCTUATION => {
if !is_path_component_character && !is_whitespace {
self.state = S_INITIAL_SEPARATOR;
} else {
if !is_path_component_character {
consumed = true;
}
self.state = S_WHITESPACE;
}
}
S_WHITESPACE => {
if is_whitespace {
consumed = true; // consumed whitespace
} else if c == '/' || is_path_component_character {
self.state = S_SLASH; // path component
} else {
self.state = S_SEPARATOR; // path separator
}
}
S_SEPARATOR => {
if !is_whitespace && !is_path_component_character {
consumed = true; // consumed separator
} else {
self.state = S_END;
}
}
S_SLASH => {
if c == '/' {
consumed = true; // consumed slash
} else {
self.state = S_PATH_COMPONENT_CHARACTERS;
}
}
S_PATH_COMPONENT_CHARACTERS => {
if is_path_component_character {
consumed = true; // consumed string character except slash
} else {
self.state = S_END;
}
}
S_INITIAL_SEPARATOR => {
if is_path_component_character {
consumed = true;
self.state = S_PATH_COMPONENT_CHARACTERS;
} else if is_whitespace {
self.state = S_END;
} else {
consumed = true;
}
}
_ => {}
}
}
consumed
}
fn consume_char_whitespace(&mut self, c: char) -> bool {
// Consume a "word" of printable characters plus any leading whitespace.
const S_ALWAYS_ONE: u8 = 0;
const S_BLANK: u8 = 1;
const S_GRAPH: u8 = 2;
const S_END: u8 = 3;
let mut consumed = false;
while self.state != S_END && !consumed {
match self.state {
S_ALWAYS_ONE => {
// always consume the first character
// If it's not whitespace, only consume those from here.
consumed = true;
if !c.is_whitespace() {
self.state = S_GRAPH;
} else {
// If it's whitespace, keep consuming whitespace until the graphs.
self.state = S_BLANK;
}
}
S_BLANK => {
if c.is_whitespace() {
// consumed whitespace
consumed = true;
} else {
self.state = S_GRAPH;
}
}
S_GRAPH => {
if !c.is_whitespace() {
// consumed printable non-space
consumed = true;
} else {
self.state = S_END;
}
}
_ => {}
}
}
consumed
}
}
fn is_path_component_character(c: char) -> bool {
tok_is_string_character(c, None) && !L!("/={,}'\":@#").as_char_slice().contains(&c)
}
/// The position of the equal sign in a variable assignment like foo=bar.
///
/// Return the location of the equals sign, or none if the string does
@@ -1406,14 +1188,10 @@ pub fn variable_assignment_equals_pos(txt: &wstr) -> Option<usize> {
#[cfg(test)]
mod tests {
use super::{
MoveWordStateMachine, MoveWordStyle, PipeOrRedir, TokFlags, TokenType, Tokenizer,
TokenizerError,
};
use super::{PipeOrRedir, TokFlags, TokenType, Tokenizer, TokenizerError};
use crate::prelude::*;
use crate::redirection::RedirectionMode;
use libc::{STDERR_FILENO, STDOUT_FILENO};
use std::collections::HashSet;
#[test]
fn test_tokenizer() {
@@ -1601,109 +1379,4 @@ macro_rules! get_redir_mode {
assert_eq!(get_redir_mode!("3<&0"), RedirectionMode::Fd);
assert_eq!(get_redir_mode!("3</tmp/filetxt"), RedirectionMode::Input);
}
/// Test word motion (forward-word, etc.). Carets represent cursor stops.
#[test]
fn test_word_motion() {
#[derive(Eq, PartialEq)]
pub enum Direction {
Left,
Right,
}
fn validate_visitor(
direction: Direction,
style: MoveWordStyle,
line: &str,
on_failure: fn(&str),
) {
let mut command = WString::new();
let mut stops = HashSet::new();
// Carets represent stops and should be cut out of the command.
for c in line.chars() {
if c == '^' {
stops.insert(command.len());
} else {
command.push(c);
}
}
let (mut idx, end) = if direction == Direction::Left {
(*stops.iter().max().unwrap(), 0)
} else {
(*stops.iter().min().unwrap(), command.len())
};
stops.remove(&idx);
let mut sm = MoveWordStateMachine::new(style);
while idx != end {
let char_idx = if direction == Direction::Left {
idx - 1
} else {
idx
};
let will_stop = !sm.consume_char(&command, char_idx);
let expected_stop = stops.contains(&idx);
if will_stop != expected_stop {
on_failure(&format!(
"Expected to stop={expected_stop} at index {idx} but got stop={will_stop}. String: {command:?}"
));
}
// We don't expect to stop here next time.
if expected_stop {
stops.remove(&idx);
sm.reset();
} else if direction == Direction::Left {
idx -= 1;
} else {
idx += 1;
}
}
}
macro_rules! validate {
($direction:expr, $style:expr, $line:expr) => {
validate_visitor($direction, $style, $line, |error| assert!(false, "{error}"));
};
}
use Direction::*;
use MoveWordStyle::*;
// PathComponents tests
validate!(
Left,
PathComponents,
"^echo ^/^foo/^bar{^aaa,^bbb,^ccc}^bak/^"
);
validate!(Left, PathComponents, "^echo ^bak ^///^");
validate!(Left, PathComponents, "^aaa ^@ ^@^aaa^");
validate!(Left, PathComponents, "^aaa ^a ^@^aaa^");
validate!(Left, PathComponents, "^aaa ^@@@ ^@@^aa^");
validate!(Left, PathComponents, "^aa^@@ ^aa@@^a^");
validate!(Left, PathComponents, r#"^a\ ^b\ c/^d"^e\ f"^g"#);
validate!(Left, PathComponents, r#"^a\"^bc^"#);
validate!(Left, PathComponents, "^/^foo/^bar/^baz/^");
validate!(Left, PathComponents, "^echo ^--foo ^--bar^");
validate!(Left, PathComponents, "^echo ^hi ^> ^/^dev/^null^");
// General punctuation tests
validate!(Right, Punctuation, "^a^ bcd^");
validate!(Right, Punctuation, "a^b^ cde^");
validate!(Right, Punctuation, "^ab^ cde^");
validate!(Right, Punctuation, "^ab^&cd^ ^& ^e^ f^&");
validate!(Left, Punctuation, "^echo ^hello_^world.^txt^");
validate!(Right, Punctuation, "^echo^ hello^_world^.txt^");
validate!(Left, Punctuation, "echo ^foo_^foo_^foo/^/^/^/^/^ ^");
validate!(Right, Punctuation, "^echo^ foo^_foo^_foo^/^/^/^/^/ ^");
// General whitespace tests
validate!(Right, Whitespace, "^^a-b-c^ d-e-f");
validate!(Right, Whitespace, "^a-b-c^\n d-e-f^ ");
validate!(Right, Whitespace, "^a-b-c^\n\nd-e-f^ ");
}
}