From c3626a303113122f388ae1e7ec39cf2000beacca Mon Sep 17 00:00:00 2001
From: Johannes Altmanninger <aclopte@gmail.com>
Date: Mon, 5 May 2025 14:49:01 +0200
Subject: [PATCH] builtin read: --tokenize-raw option

Users have tried to get a list of all tokens -- including operators
-- using "commandline --tokens-raw".  That one has been deprecated
by cc2ca60baae (commandline.rst: deprecate --tokens-raw option,
2025-05-05).  Part of the reason is that the above command is broken
for multi-line tokens.

Let's support this use case in a way that's less ambiguous.

Closes #11084
---
 CHANGELOG.rst               |   1 +
 doc_src/cmds/read.rst       |   8 ++-
 po/de.po                    |   5 ++
 po/en.po                    |   5 ++
 po/fr.po                    |   5 ++
 po/pl.po                    |   5 ++
 po/pt_BR.po                 |   5 ++
 po/sv.po                    |   5 ++
 po/zh_CN.po                 |   5 ++
 src/builtins/commandline.rs |   7 +--
 src/builtins/read.rs        | 113 +++++++++++++++++++++++++++---------
 tests/checks/read.fish      |  32 ++++++++++
 12 files changed, 159 insertions(+), 37 deletions(-)

diff --git a/CHANGELOG.rst b/CHANGELOG.rst
index f0d0ade44..2cfe42aa8 100644
--- a/CHANGELOG.rst
+++ b/CHANGELOG.rst
@@ -48,6 +48,7 @@ Scripting improvements
 - The :doc:`psub <cmds/psub>` command now allows combining ``--suffix`` with ``--fifo`` (:issue:`11729`).
 - Builtin :doc:`argparse <cmds/argparse>` has seen many improvements, see :ref:`below <changelog-4.1-argparse>`.
 - The :doc:`string pad <cmds/string-pad>` command now has a ``-C/--center`` option.
+- The :doc:`read <cmds/read>` builtin has learned the ``--tokenize-raw`` option to tokenize without quote removal (:issue:`11084`).
 
 Interactive improvements
 ------------------------
diff --git a/doc_src/cmds/read.rst b/doc_src/cmds/read.rst
index 66463a7d5..adc7b31c4 100644
--- a/doc_src/cmds/read.rst
+++ b/doc_src/cmds/read.rst
@@ -83,8 +83,12 @@ The following options control how much is read and how it is stored:
 **-n** or **--nchars** *NCHARS*
     Makes ``read`` return after reading *NCHARS* characters or the end of the line, whichever comes first.
 
-**-t** -or **--tokenize**
-    Causes read to split the input into variables by the shell's tokenization rules. This means it will honor quotes and escaping. This option is of course incompatible with other options to control splitting like **--delimiter** and does not honor :envvar:`IFS` (like fish's tokenizer). It saves the tokens in the manner they'd be passed to commands on the commandline, so e.g. ``a\ b`` is stored as ``a b``. Note that currently it leaves command substitutions intact along with the parentheses.
+**-t**, **--tokenize** or **--tokenize-raw**
+    Causes read to split the input into variables by the shell's tokenization rules.
+    This means it will honor quotes and escaping.
+    This option is of course incompatible with other options to control splitting like **--delimiter** and does not honor :envvar:`IFS` (like fish's tokenizer).
+    The **-t** or **--tokenize** variants perform quote removal, so e.g. ``a\ b`` is stored as ``a b``, while **--tokenize-raw** stores each token verbatim, with quotes and escapes intact.
+    In either case, variables and command substitutions are not expanded.
 
 **-a** or **--list**
     Stores the result as a list in a single variable. This option is also available as **--array** for backwards compatibility.
diff --git a/po/de.po b/po/de.po
index 99387901e..ccc9d0fd4 100644
--- a/po/de.po
+++ b/po/de.po
@@ -811,6 +811,11 @@ msgstr ""
 msgid "%s"
 msgstr ""
 
+#
+#, c-format
+msgid "%s and %s are mutually exclusive"
+msgstr ""
+
 #, c-format
 msgid "%s, version %s"
 msgstr ""
diff --git a/po/en.po b/po/en.po
index 5e252232e..d5ef4e551 100644
--- a/po/en.po
+++ b/po/en.po
@@ -809,6 +809,11 @@ msgstr ""
 msgid "%s"
 msgstr ""
 
+#
+#, c-format
+msgid "%s and %s are mutually exclusive"
+msgstr ""
+
 #, c-format
 msgid "%s, version %s"
 msgstr ""
diff --git a/po/fr.po b/po/fr.po
index fee9965fb..d55a071e5 100644
--- a/po/fr.po
+++ b/po/fr.po
@@ -910,6 +910,11 @@ msgstr ""
 msgid "%s"
 msgstr ""
 
+#
+#, c-format
+msgid "%s and %s are mutually exclusive"
+msgstr ""
+
 #, c-format
 msgid "%s, version %s"
 msgstr ""
diff --git a/po/pl.po b/po/pl.po
index 0736d43c3..5c7d1d72e 100644
--- a/po/pl.po
+++ b/po/pl.po
@@ -805,6 +805,11 @@ msgstr ""
 msgid "%s"
 msgstr ""
 
+#
+#, c-format
+msgid "%s and %s are mutually exclusive"
+msgstr ""
+
 #, c-format
 msgid "%s, version %s"
 msgstr ""
diff --git a/po/pt_BR.po b/po/pt_BR.po
index fb997d082..0deaf0091 100644
--- a/po/pt_BR.po
+++ b/po/pt_BR.po
@@ -810,6 +810,11 @@ msgstr ""
 msgid "%s"
 msgstr ""
 
+#
+#, c-format
+msgid "%s and %s are mutually exclusive"
+msgstr ""
+
 #, c-format
 msgid "%s, version %s"
 msgstr ""
diff --git a/po/sv.po b/po/sv.po
index 54d1d1ed8..034bb2136 100644
--- a/po/sv.po
+++ b/po/sv.po
@@ -806,6 +806,11 @@ msgstr ""
 msgid "%s"
 msgstr ""
 
+#
+#, c-format
+msgid "%s and %s are mutually exclusive"
+msgstr ""
+
 #, c-format
 msgid "%s, version %s"
 msgstr ""
diff --git a/po/zh_CN.po b/po/zh_CN.po
index 07c64a504..20947dda4 100644
--- a/po/zh_CN.po
+++ b/po/zh_CN.po
@@ -803,6 +803,11 @@ msgstr "%lu\n"
 msgid "%s"
 msgstr ""
 
+#
+#, c-format
+msgid "%s and %s are mutually exclusive"
+msgstr ""
+
 #, c-format
 msgid "%s, version %s"
 msgstr ""
diff --git a/src/builtins/commandline.rs b/src/builtins/commandline.rs
index 7a6783257..11c15816c 100644
--- a/src/builtins/commandline.rs
+++ b/src/builtins/commandline.rs
@@ -1,4 +1,5 @@
 use super::prelude::*;
+use super::read::TokenOutputMode;
 use crate::ast::{self, Kind, Leaf};
 use crate::common::{unescape_string, UnescapeFlags, UnescapeStringStyle};
 use crate::complete::Completion;
@@ -44,12 +45,6 @@ enum AppendMode {
     Append,
 }
 
-enum TokenOutputMode {
-    Expanded,
-    Raw,
-    Unescaped,
-}
-
 /// Replace/append/insert the selection with/at/after the specified string.
 ///
 /// \param begin beginning of selection
diff --git a/src/builtins/read.rs b/src/builtins/read.rs
index 31b391aec..d6ecd0057 100644
--- a/src/builtins/read.rs
+++ b/src/builtins/read.rs
@@ -19,6 +19,7 @@
 use crate::reader::reader_save_screen_state;
 use crate::reader::ReaderConfig;
 use crate::reader::{reader_pop, reader_push, reader_readline};
+use crate::tokenizer::Tok;
 use crate::tokenizer::Tokenizer;
 use crate::tokenizer::TOK_ACCEPT_UNFINISHED;
 use crate::tokenizer::TOK_ARGUMENT_LIST;
@@ -33,6 +34,13 @@
 use std::os::fd::RawFd;
 use std::sync::atomic::Ordering;
 
+#[derive(Clone, Copy, Eq, PartialEq)]
+pub(crate) enum TokenOutputMode {
+    Expanded, // not selectable through `read`; its token helpers panic on this variant
+    Raw, // in `read`: the token text is stored exactly as the tokenizer reports it
+    Unescaped, // in `read`: the token text is passed through `unescape_string` first
+}
+
 #[derive(Default)]
 struct Options {
     print_help: bool,
@@ -44,7 +52,7 @@ struct Options {
     // If a delimiter was given. Used to distinguish between the default
     // empty string and a given empty delimiter.
     delimiter: Option<WString>,
-    tokenize: bool,
+    token_mode: Option<TokenOutputMode>, // never expanded
     shell: bool,
     array: bool,
     silent: bool,
@@ -83,10 +91,25 @@ fn new() -> Self {
     wopt(L!("shell"), ArgType::NoArgument, 'S'),
     wopt(L!("silent"), ArgType::NoArgument, 's'),
     wopt(L!("tokenize"), ArgType::NoArgument, 't'),
+    wopt(L!("tokenize-raw"), ArgType::NoArgument, '\x01'),
     wopt(L!("unexport"), ArgType::NoArgument, 'u'),
     wopt(L!("universal"), ArgType::NoArgument, 'U'),
 ];
 
+/// Returns the long option spelling that selects `token_mode`, for use in error messages.
+///
+/// # Panics
+///
+/// Panics on `TokenOutputMode::Expanded`; the option parsing in this file only stores
+/// `Raw` or `Unescaped`.
+fn tokenize_flag(token_mode: TokenOutputMode) -> &'static wstr {
+    match token_mode {
+        TokenOutputMode::Expanded => unreachable!("read has no option for expanded tokens"),
+        TokenOutputMode::Raw => L!("--tokenize-raw"),
+        TokenOutputMode::Unescaped => L!("--tokenize"),
+    }
+}
+
 fn parse_cmd_opts(
     args: &mut [&wstr],
     parser: &Parser,
@@ -166,8 +183,28 @@ fn parse_cmd_opts(
             'S' => {
                 opts.shell = true;
             }
-            't' => {
-                opts.tokenize = true;
+            't' | '\x01' => {
+                let new_mode = match opt {
+                    't' => TokenOutputMode::Unescaped,
+                    '\x01' => TokenOutputMode::Raw,
+                    _ => unreachable!(),
+                };
+                if let Some(old_mode) = opts.token_mode {
+                    if old_mode != new_mode {
+                        streams.err.append(wgettext_fmt!(
+                            BUILTIN_ERR_COMBO2,
+                            cmd,
+                            wgettext_fmt!(
+                                "%s and %s are mutually exclusive",
+                                tokenize_flag(old_mode),
+                                tokenize_flag(new_mode),
+                            )
+                        ));
+                        builtin_print_error_trailer(parser, streams.err, cmd);
+                        return Err(STATUS_INVALID_ARGS);
+                    }
+                }
+                opts.token_mode = Some(new_mode);
             }
             'U' => {
                 opts.place |= EnvMode::UNIVERSAL;
@@ -490,24 +527,27 @@ fn validate_read_args(
         return Err(STATUS_INVALID_ARGS);
     }
 
-    if opts.tokenize && opts.delimiter.is_some() {
-        streams.err.append(wgettext_fmt!(
-            BUILTIN_ERR_COMBO2_EXCLUSIVE,
-            cmd,
-            "--delimiter",
-            "--tokenize"
-        ));
-        return Err(STATUS_INVALID_ARGS);
-    }
-
-    if opts.tokenize && opts.one_line {
-        streams.err.append(wgettext_fmt!(
-            BUILTIN_ERR_COMBO2_EXCLUSIVE,
-            cmd,
-            "--line",
-            "--tokenize"
-        ));
-        return Err(STATUS_INVALID_ARGS);
+    // Either tokenize option is rejected together with --delimiter or --line.
+    if let Some(token_mode) = opts.token_mode {
+        if opts.delimiter.is_some() {
+            streams.err.append(wgettext_fmt!(
+                BUILTIN_ERR_COMBO2_EXCLUSIVE,
+                cmd,
+                "--delimiter",
+                tokenize_flag(token_mode),
+            ));
+            return Err(STATUS_INVALID_ARGS);
+        }
+
+        if opts.one_line {
+            streams.err.append(wgettext_fmt!(
+                BUILTIN_ERR_COMBO2_EXCLUSIVE,
+                cmd,
+                "--line",
+                tokenize_flag(token_mode),
+            ));
+            return Err(STATUS_INVALID_ARGS);
+        }
     }
 
     // Verify all variable names.
@@ -640,18 +687,27 @@ pub fn read(parser: &Parser, streams: &mut IoStreams, argv: &mut [&wstr]) -> Bui
             return exit_res;
         }
 
-        if opts.tokenize {
+        if let Some(token_mode) = opts.token_mode {
             let mut tok = Tokenizer::new(&buff, TOK_ACCEPT_UNFINISHED | TOK_ARGUMENT_LIST);
+            // Text to store for one token, according to the selected mode.
+            let token_text = |tokenizer: &mut Tokenizer<'_>, token: &Tok| -> WString {
+                let raw = tokenizer.text_of(token);
+                match token_mode {
+                    TokenOutputMode::Expanded => panic!(),
+                    // Stored exactly as it appears in the input.
+                    TokenOutputMode::Raw => raw.to_owned(),
+                    // Unescaped, or the raw text if `unescape_string` returns `None`.
+                    TokenOutputMode::Unescaped => {
+                        unescape_string(raw, UnescapeStringStyle::default())
+                            .unwrap_or_else(|| raw.to_owned())
+                    }
+                }
+            };
             if opts.array {
                 // Array mode: assign each token as a separate element of the sole var.
                 let mut tokens = vec![];
                 while let Some(t) = tok.next() {
-                    let text = tok.text_of(&t);
-                    if let Some(out) = unescape_string(text, UnescapeStringStyle::default()) {
-                        tokens.push(out);
-                    } else {
-                        tokens.push(text.to_owned());
-                    }
+                    tokens.push(token_text(&mut tok, &t));
                 }
 
                 parser.set_var_and_fire(argv[var_ptr], opts.place, tokens);
@@ -661,9 +718,7 @@ pub fn read(parser: &Parser, streams: &mut IoStreams, argv: &mut [&wstr]) -> Bui
                     let Some(t) = tok.next() else {
                         break;
                     };
-                    let text = tok.text_of(&t);
-                    let out = unescape_string(text, UnescapeStringStyle::default())
-                        .unwrap_or_else(|| text.to_owned());
+                    let out = token_text(&mut tok, &t);
                     parser.set_var_and_fire(argv[var_ptr], opts.place, vec![out]);
                     var_ptr += 1;
                 }
diff --git a/tests/checks/read.fish b/tests/checks/read.fish
index c3e087d94..14f3de0fc 100644
--- a/tests/checks/read.fish
+++ b/tests/checks/read.fish
@@ -423,6 +423,38 @@ set -S var
 # CHECK: $var[1]: |1|
 # CHECK: $var[2]: |}|
 
+# Raw tokens into named variables
+echo 'echo "&" a\ b &
+second line (dropped)' | read -l --tokenize-raw head tail
+set -S head tail
+# CHECK: $head: set in local scope, unexported, with 1 elements
+# CHECK: $head[1]: |echo|
+# CHECK: $tail: set in local scope, unexported, with 1 elements
+# CHECK: $tail[1]: |"&" a\\ b &|
+
+# Raw tokens into list
+echo 'echo "&" & a\ b
+second line (dropped)' | read -l --tokenize-raw -a rawlist
+set -S rawlist
+# CHECK: $rawlist: set in local scope, unexported, with 4 elements
+# CHECK: $rawlist[1]: |echo|
+# CHECK: $rawlist[2]: |"&"|
+# CHECK: $rawlist[3]: |&|
+# CHECK: $rawlist[4]: |a\\ b|
+
+echo 'echo "&" & a\ b
+second line' | read -l --tokenize-raw -a rawlist_null -z
+set -S rawlist_null
+# CHECK: $rawlist_null: set in local scope, unexported, with 8 elements
+# CHECK: $rawlist_null[1]: |echo|
+# CHECK: $rawlist_null[2]: |"&"|
+# CHECK: $rawlist_null[3]: |&|
+# CHECK: $rawlist_null[4]: |a\\ b|
+# CHECK: $rawlist_null[5]: |\n|
+# CHECK: $rawlist_null[6]: |second|
+# CHECK: $rawlist_null[7]: |line|
+# CHECK: $rawlist_null[8]: |\n|
+
 echo '1  {} "{}"' | read -lat var
 echo $var
 # CHECK: 1 {} {}