Compare commits

...

2 Commits

Author SHA1 Message Date
Johannes Altmanninger
42063c7bbe builtin read: --tokenize-raw option
Closes #11084
2025-06-26 01:17:22 +02:00
Johannes Altmanninger
f8743c2b20 builtin commandline: --tokens-raw to include all tokens
Part of #11084
2025-06-26 01:17:22 +02:00
5 changed files with 138 additions and 43 deletions

View File

@@ -83,7 +83,14 @@ The following options control how much is read and how it is stored:
**-n** or **--nchars** *NCHARS*
Makes ``read`` return after reading *NCHARS* characters or the end of the line, whichever comes first.
**-t** -or **--tokenize**
**-t** or **--tokenize** or **--tokenize-raw**
Causes read to split the input into variables by the shell's tokenization rules.
This means it will honor quotes and escaping.
This option is of course incompatible with other options to control splitting like **--delimiter** and does not honor :envvar:`IFS` (like fish's tokenizer).
The **-t** or **--tokenize** variants perform quote removal, so e.g. ``a\ b`` is stored as ``a b``.
However variables and command substitutions are not expanded.
**--tokenize-raw**
Causes read to split the input into variables by the shell's tokenization rules. This means it will honor quotes and escaping. This option is of course incompatible with other options to control splitting like **--delimiter** and does not honor :envvar:`IFS` (like fish's tokenizer). Unlike **--tokenize**, it does not perform quote removal: tokens are stored exactly as they appear on the command line, so e.g. ``a\ b`` is stored as ``a\ b``. Note that currently it leaves command substitutions intact along with the parentheses.
**-a** or **--list**

View File

@@ -1,4 +1,5 @@
use super::prelude::*;
use super::read::TokenOutputMode;
use crate::ast::{self, Kind, Leaf};
use crate::common::{unescape_string, UnescapeFlags, UnescapeStringStyle};
use crate::complete::Completion;
@@ -44,12 +45,6 @@ enum AppendMode {
Append,
}
enum TokenOutputMode {
Expanded,
Raw,
Unescaped,
}
/// Replace/append/insert the selection with/at/after the specified string.
///
/// \param begin beginning of selection
@@ -223,13 +218,15 @@ fn write_part(
if cut_at_cursor && token.end() >= pos {
break;
}
let is_redirection_target = in_redirection;
in_redirection = token.type_ == TokenType::redirect;
if is_redirection_target && token.type_ == TokenType::string {
continue;
}
if token.type_ != TokenType::string {
continue;
if token_mode != TokenOutputMode::Raw {
let is_redirection_target = in_redirection;
in_redirection = token.type_ == TokenType::redirect;
if is_redirection_target && token.type_ == TokenType::string {
continue;
}
if token.type_ != TokenType::string {
continue;
}
}
let token_text = tok.text_of(&token);

View File

@@ -19,6 +19,7 @@
use crate::reader::commandline_set_buffer;
use crate::reader::ReaderConfig;
use crate::reader::{reader_pop, reader_push, reader_readline};
use crate::tokenizer::Tok;
use crate::tokenizer::Tokenizer;
use crate::tokenizer::TOK_ACCEPT_UNFINISHED;
use crate::tokenizer::TOK_ARGUMENT_LIST;
@@ -32,6 +33,13 @@
use std::os::fd::RawFd;
use std::sync::atomic::Ordering;
/// How tokens are transformed before being stored/printed by
/// `read --tokenize`/`--tokenize-raw` and `commandline --tokens-*`.
#[derive(Clone, Copy, Eq, PartialEq)]
pub(crate) enum TokenOutputMode {
    /// Tokens are expanded before output. Never produced by the `read`
    /// builtin (its handlers panic on this variant); used by `commandline`.
    Expanded,
    /// Tokens are kept verbatim as written on the command line,
    /// with no quote removal (`--tokenize-raw`): `a\ b` stays `a\ b`.
    Raw,
    /// Quote removal is performed via unescaping (`-t`/`--tokenize`):
    /// `a\ b` becomes `a b`. Variables/command substitutions not expanded.
    Unescaped,
}
#[derive(Default)]
struct Options {
print_help: bool,
@@ -43,7 +51,7 @@ struct Options {
// If a delimiter was given. Used to distinguish between the default
// empty string and a given empty delimiter.
delimiter: Option<WString>,
tokenize: bool,
token_mode: Option<TokenOutputMode>, // never expanded
shell: bool,
array: bool,
silent: bool,
@@ -82,10 +90,19 @@ fn new() -> Self {
wopt(L!("shell"), ArgType::NoArgument, 'S'),
wopt(L!("silent"), ArgType::NoArgument, 's'),
wopt(L!("tokenize"), ArgType::NoArgument, 't'),
wopt(L!("tokenize-raw"), ArgType::NoArgument, '\x01'),
wopt(L!("unexport"), ArgType::NoArgument, 'u'),
wopt(L!("universal"), ArgType::NoArgument, 'U'),
];
/// Return the command-line flag that selects the given token output mode,
/// for use in error messages. `Expanded` has no corresponding `read` flag
/// and is a caller bug, hence the panic.
fn tokenize_flag(token_mode: TokenOutputMode) -> &'static wstr {
    if token_mode == TokenOutputMode::Raw {
        L!("--tokenize-raw")
    } else if token_mode == TokenOutputMode::Unescaped {
        L!("--tokenize")
    } else {
        // TokenOutputMode::Expanded: never selected via a read flag.
        panic!()
    }
}
fn parse_cmd_opts(
args: &mut [&wstr],
parser: &Parser,
@@ -165,8 +182,28 @@ fn parse_cmd_opts(
'S' => {
opts.shell = true;
}
't' => {
opts.tokenize = true;
't' | '\x01' => {
let new_mode = match opt {
't' => TokenOutputMode::Unescaped,
'\x01' => TokenOutputMode::Raw,
_ => unreachable!(),
};
if let Some(old_mode) = opts.token_mode {
if old_mode != new_mode {
streams.err.append(wgettext_fmt!(
BUILTIN_ERR_COMBO2,
cmd,
wgettext_fmt!(
"%s and %s are mutually exclusive",
tokenize_flag(old_mode),
tokenize_flag(new_mode),
)
));
builtin_print_error_trailer(parser, streams.err, cmd);
return Err(STATUS_INVALID_ARGS);
}
}
opts.token_mode = Some(new_mode);
}
'U' => {
opts.place |= EnvMode::UNIVERSAL;
@@ -482,24 +519,34 @@ fn validate_read_args(
return Err(STATUS_INVALID_ARGS);
}
if opts.tokenize && opts.delimiter.is_some() {
streams.err.append(wgettext_fmt!(
BUILTIN_ERR_COMBO2_EXCLUSIVE,
cmd,
"--delimiter",
"--tokenize"
));
return Err(STATUS_INVALID_ARGS);
// Maps a token output mode to the flag that selects it, for error messages.
// NOTE(review): this duplicates the tokenize_flag helper defined next to the
// option table earlier in this file — consider keeping a single definition.
fn tokenize_flag(token_mode: TokenOutputMode) -> &'static wstr {
match token_mode {
// Expanded is never reachable here: read only sets Raw or Unescaped.
TokenOutputMode::Expanded => panic!(),
TokenOutputMode::Raw => L!("--tokenize-raw"),
TokenOutputMode::Unescaped => L!("--tokenize"),
}
}
if opts.tokenize && opts.one_line {
streams.err.append(wgettext_fmt!(
BUILTIN_ERR_COMBO2_EXCLUSIVE,
cmd,
"--line",
"--tokenize"
));
return Err(STATUS_INVALID_ARGS);
if let Some(token_mode) = opts.token_mode {
if opts.delimiter.is_some() {
streams.err.append(wgettext_fmt!(
BUILTIN_ERR_COMBO2_EXCLUSIVE,
cmd,
"--delimiter",
tokenize_flag(token_mode),
));
return Err(STATUS_INVALID_ARGS);
}
if opts.one_line {
streams.err.append(wgettext_fmt!(
BUILTIN_ERR_COMBO2_EXCLUSIVE,
cmd,
"--line",
tokenize_flag(token_mode),
));
return Err(STATUS_INVALID_ARGS);
}
}
// Verify all variable names.
@@ -632,18 +679,28 @@ pub fn read(parser: &Parser, streams: &mut IoStreams, argv: &mut [&wstr]) -> Bui
return exit_res;
}
if opts.tokenize {
if let Some(token_mode) = opts.token_mode {
let mut tok = Tokenizer::new(&buff, TOK_ACCEPT_UNFINISHED | TOK_ARGUMENT_LIST);
let token_text = |tokenizer: &mut Tokenizer<'_>, token: &Tok| -> WString {
let mut text = Cow::Borrowed(tokenizer.text_of(token));
match token_mode {
TokenOutputMode::Expanded => panic!(),
TokenOutputMode::Raw => (),
TokenOutputMode::Unescaped => {
if let Some(unescaped) =
unescape_string(&text, UnescapeStringStyle::default())
{
text = Cow::Owned(unescaped);
}
}
};
text.into_owned()
};
if opts.array {
// Array mode: assign each token as a separate element of the sole var.
let mut tokens = vec![];
while let Some(t) = tok.next() {
let text = tok.text_of(&t);
if let Some(out) = unescape_string(text, UnescapeStringStyle::default()) {
tokens.push(out);
} else {
tokens.push(text.to_owned());
}
tokens.push(token_text(&mut tok, &t));
}
parser.set_var_and_fire(argv[var_ptr], opts.place, tokens);
@@ -653,9 +710,7 @@ pub fn read(parser: &Parser, streams: &mut IoStreams, argv: &mut [&wstr]) -> Bui
let Some(t) = tok.next() else {
break;
};
let text = tok.text_of(&t);
let out = unescape_string(text, UnescapeStringStyle::default())
.unwrap_or_else(|| text.to_owned());
let out = token_text(&mut tok, &t);
parser.set_var_and_fire(argv[var_ptr], opts.place, vec![out]);
var_ptr += 1;
}

View File

@@ -50,3 +50,7 @@ commandline --input "echo > {a,b}" --tokens-expanded
commandline --input "echo {arg1,arg2} <in >out" --tokens-raw
# CHECK: echo
# CHECK: {arg1,arg2}
# CHECK: <
# CHECK: in
# CHECK: >
# CHECK: out

View File

@@ -423,6 +423,38 @@ set -S var
# CHECK: $var[1]: |1|
# CHECK: $var[2]: |}|
# Raw tokens into named variables
echo 'echo "&" a\ b &
second line (dropped)' | read -l --tokenize-raw head tail
set -S head tail
# CHECK: $head: set in local scope, unexported, with 1 elements
# CHECK: $head[1]: |echo|
# CHECK: $tail: set in local scope, unexported, with 1 elements
# CHECK: $tail[1]: |"&" a\\ b &|
# Raw tokens into list
echo 'echo "&" & a\ b
second line (dropped)' | read -l --tokenize-raw -a rawlist
set -S rawlist
# CHECK: $rawlist: set in local scope, unexported, with 4 elements
# CHECK: $rawlist[1]: |echo|
# CHECK: $rawlist[2]: |"&"|
# CHECK: $rawlist[3]: |&|
# CHECK: $rawlist[4]: |a\\ b|
echo 'echo "&" & a\ b
second line' | read -l --tokenize-raw -a rawlist_null -z
set -S rawlist_null
# CHECK: $rawlist_null: set in local scope, unexported, with 8 elements
# CHECK: $rawlist_null[1]: |echo|
# CHECK: $rawlist_null[2]: |"&"|
# CHECK: $rawlist_null[3]: |&|
# CHECK: $rawlist_null[4]: |a\\ b|
# CHECK: $rawlist_null[5]: |\n|
# CHECK: $rawlist_null[6]: |second|
# CHECK: $rawlist_null[7]: |line|
# CHECK: $rawlist_null[8]: |\n|
echo '1 {} "{}"' | read -lat var
echo $var
# CHECK: 1 {} {}