From 42063c7bbe36e6533dedad01a7d864389c575df1 Mon Sep 17 00:00:00 2001 From: Johannes Altmanninger Date: Mon, 5 May 2025 14:49:01 +0200 Subject: [PATCH] builtin read: --tokenize-raw option Closes #11084 --- doc_src/cmds/read.rst | 9 ++- src/builtins/commandline.rs | 8 +-- src/builtins/read.rs | 113 +++++++++++++++++++++++++++--------- tests/checks/read.fish | 32 ++++++++++ 4 files changed, 125 insertions(+), 37 deletions(-) diff --git a/doc_src/cmds/read.rst b/doc_src/cmds/read.rst index 66463a7d5..58bfefb69 100644 --- a/doc_src/cmds/read.rst +++ b/doc_src/cmds/read.rst @@ -83,7 +83,14 @@ The following options control how much is read and how it is stored: **-n** or **--nchars** *NCHARS* Makes ``read`` return after reading *NCHARS* characters or the end of the line, whichever comes first. -**-t** -or **--tokenize** +**--tokenize-raw** + Causes read to split the input into variables by the shell's tokenization rules. + This means it will honor quotes and escaping. + This option is of course incompatible with other options to control splitting like **--delimiter** and does not honor :envvar:`IFS` (like fish's tokenizer). + Unlike **-t** or **--tokenize**, it performs no quote removal: each token is stored exactly as written, so e.g. ``a\ b`` is stored as ``a\ b``. + Variables and command substitutions are not expanded either. + +**-t** or **--tokenize** Causes read to split the input into variables by the shell's tokenization rules. This means it will honor quotes and escaping. This option is of course incompatible with other options to control splitting like **--delimiter** and does not honor :envvar:`IFS` (like fish's tokenizer). It saves the tokens in the manner they'd be passed to commands on the commandline, so e.g. ``a\ b`` is stored as ``a b``. Note that currently it leaves command substitutions intact along with the parentheses. 
**-a** or **--list** diff --git a/src/builtins/commandline.rs b/src/builtins/commandline.rs index 4014dcf2f..013fb21cf 100644 --- a/src/builtins/commandline.rs +++ b/src/builtins/commandline.rs @@ -1,4 +1,5 @@ use super::prelude::*; +use super::read::TokenOutputMode; use crate::ast::{self, Kind, Leaf}; use crate::common::{unescape_string, UnescapeFlags, UnescapeStringStyle}; use crate::complete::Completion; @@ -44,13 +45,6 @@ enum AppendMode { Append, } -#[derive(Eq, PartialEq)] -enum TokenOutputMode { - Expanded, - Raw, - Unescaped, -} - /// Replace/append/insert the selection with/at/after the specified string. /// /// \param begin beginning of selection diff --git a/src/builtins/read.rs b/src/builtins/read.rs index 9f7f4e74d..f38316f2c 100644 --- a/src/builtins/read.rs +++ b/src/builtins/read.rs @@ -19,6 +19,7 @@ use crate::reader::commandline_set_buffer; use crate::reader::ReaderConfig; use crate::reader::{reader_pop, reader_push, reader_readline}; +use crate::tokenizer::Tok; use crate::tokenizer::Tokenizer; use crate::tokenizer::TOK_ACCEPT_UNFINISHED; use crate::tokenizer::TOK_ARGUMENT_LIST; @@ -32,6 +33,13 @@ use std::os::fd::RawFd; use std::sync::atomic::Ordering; +#[derive(Clone, Copy, Eq, PartialEq)] +pub(crate) enum TokenOutputMode { + Expanded, + Raw, + Unescaped, +} + #[derive(Default)] struct Options { print_help: bool, @@ -43,7 +51,7 @@ struct Options { // If a delimiter was given. Used to distinguish between the default // empty string and a given empty delimiter. 
delimiter: Option<WString>, - tokenize: bool, + token_mode: Option<TokenOutputMode>, // never expanded shell: bool, array: bool, silent: bool, @@ -82,10 +90,19 @@ fn new() -> Self { wopt(L!("shell"), ArgType::NoArgument, 'S'), wopt(L!("silent"), ArgType::NoArgument, 's'), wopt(L!("tokenize"), ArgType::NoArgument, 't'), + wopt(L!("tokenize-raw"), ArgType::NoArgument, '\x01'), wopt(L!("unexport"), ArgType::NoArgument, 'u'), wopt(L!("universal"), ArgType::NoArgument, 'U'), ]; +fn tokenize_flag(token_mode: TokenOutputMode) -> &'static wstr { + match token_mode { + TokenOutputMode::Expanded => panic!(), + TokenOutputMode::Raw => L!("--tokenize-raw"), + TokenOutputMode::Unescaped => L!("--tokenize"), + } +} + fn parse_cmd_opts( args: &mut [&wstr], parser: &Parser, @@ -165,8 +182,28 @@ fn parse_cmd_opts( 'S' => { opts.shell = true; } - 't' => { - opts.tokenize = true; + 't' | '\x01' => { + let new_mode = match opt { + 't' => TokenOutputMode::Unescaped, + '\x01' => TokenOutputMode::Raw, + _ => unreachable!(), + }; + if let Some(old_mode) = opts.token_mode { + if old_mode != new_mode { + streams.err.append(wgettext_fmt!( + BUILTIN_ERR_COMBO2, + cmd, + wgettext_fmt!( + "%s and %s are mutually exclusive", + tokenize_flag(old_mode), + tokenize_flag(new_mode), + ) + )); + builtin_print_error_trailer(parser, streams.err, cmd); + return Err(STATUS_INVALID_ARGS); + } + } + opts.token_mode = Some(new_mode); } 'U' => { opts.place |= EnvMode::UNIVERSAL; @@ -482,24 +519,34 @@ fn validate_read_args( return Err(STATUS_INVALID_ARGS); } - if opts.tokenize && opts.delimiter.is_some() { - streams.err.append(wgettext_fmt!( - BUILTIN_ERR_COMBO2_EXCLUSIVE, - cmd, - "--delimiter", - "--tokenize" - )); - return Err(STATUS_INVALID_ARGS); + fn tokenize_flag(token_mode: TokenOutputMode) -> &'static wstr { + match token_mode { + TokenOutputMode::Expanded => panic!(), + TokenOutputMode::Raw => L!("--tokenize-raw"), + TokenOutputMode::Unescaped => L!("--tokenize"), + } } - if opts.tokenize && opts.one_line { - 
streams.err.append(wgettext_fmt!( - BUILTIN_ERR_COMBO2_EXCLUSIVE, - cmd, - "--line", - "--tokenize" - )); - return Err(STATUS_INVALID_ARGS); + if let Some(token_mode) = opts.token_mode { + if opts.delimiter.is_some() { + streams.err.append(wgettext_fmt!( + BUILTIN_ERR_COMBO2_EXCLUSIVE, + cmd, + "--delimiter", + tokenize_flag(token_mode), + )); + return Err(STATUS_INVALID_ARGS); + } + + if opts.one_line { + streams.err.append(wgettext_fmt!( + BUILTIN_ERR_COMBO2_EXCLUSIVE, + cmd, + "--line", + tokenize_flag(token_mode), + )); + return Err(STATUS_INVALID_ARGS); + } } // Verify all variable names. @@ -632,18 +679,28 @@ pub fn read(parser: &Parser, streams: &mut IoStreams, argv: &mut [&wstr]) -> Bui return exit_res; } - if opts.tokenize { + if let Some(token_mode) = opts.token_mode { let mut tok = Tokenizer::new(&buff, TOK_ACCEPT_UNFINISHED | TOK_ARGUMENT_LIST); + let token_text = |tokenizer: &mut Tokenizer<'_>, token: &Tok| -> WString { + let mut text = Cow::Borrowed(tokenizer.text_of(token)); + match token_mode { + TokenOutputMode::Expanded => panic!(), + TokenOutputMode::Raw => (), + TokenOutputMode::Unescaped => { + if let Some(unescaped) = + unescape_string(&text, UnescapeStringStyle::default()) + { + text = Cow::Owned(unescaped); + } + } + }; + text.into_owned() + }; if opts.array { // Array mode: assign each token as a separate element of the sole var. 
let mut tokens = vec![]; while let Some(t) = tok.next() { - let text = tok.text_of(&t); - if let Some(out) = unescape_string(text, UnescapeStringStyle::default()) { - tokens.push(out); - } else { - tokens.push(text.to_owned()); - } + tokens.push(token_text(&mut tok, &t)); } parser.set_var_and_fire(argv[var_ptr], opts.place, tokens); @@ -653,9 +710,7 @@ pub fn read(parser: &Parser, streams: &mut IoStreams, argv: &mut [&wstr]) -> Bui let Some(t) = tok.next() else { break; }; - let text = tok.text_of(&t); - let out = unescape_string(text, UnescapeStringStyle::default()) - .unwrap_or_else(|| text.to_owned()); + let out = token_text(&mut tok, &t); parser.set_var_and_fire(argv[var_ptr], opts.place, vec![out]); var_ptr += 1; } diff --git a/tests/checks/read.fish b/tests/checks/read.fish index c3e087d94..14f3de0fc 100644 --- a/tests/checks/read.fish +++ b/tests/checks/read.fish @@ -423,6 +423,38 @@ set -S var # CHECK: $var[1]: |1| # CHECK: $var[2]: |}| +# Raw tokens into named variables +echo 'echo "&" a\ b & +second line (dropped)' | read -l --tokenize-raw head tail +set -S head tail +# CHECK: $head: set in local scope, unexported, with 1 elements +# CHECK: $head[1]: |echo| +# CHECK: $tail: set in local scope, unexported, with 1 elements +# CHECK: $tail[1]: |"&" a\\ b &| + +# Raw tokens into list +echo 'echo "&" & a\ b +second line (dropped)' | read -l --tokenize-raw -a rawlist +set -S rawlist +# CHECK: $rawlist: set in local scope, unexported, with 4 elements +# CHECK: $rawlist[1]: |echo| +# CHECK: $rawlist[2]: |"&"| +# CHECK: $rawlist[3]: |&| +# CHECK: $rawlist[4]: |a\\ b| + +echo 'echo "&" & a\ b +second line' | read -l --tokenize-raw -a rawlist_null -z +set -S rawlist_null +# CHECK: $rawlist_null: set in local scope, unexported, with 8 elements +# CHECK: $rawlist_null[1]: |echo| +# CHECK: $rawlist_null[2]: |"&"| +# CHECK: $rawlist_null[3]: |&| +# CHECK: $rawlist_null[4]: |a\\ b| +# CHECK: $rawlist_null[5]: |\n| +# CHECK: $rawlist_null[6]: |second| +# CHECK: 
$rawlist_null[7]: |line| +# CHECK: $rawlist_null[8]: |\n| + echo '1 {} "{}"' | read -lat var echo $var # CHECK: 1 {} {}