From d34a300818e5d37c2b57cc9931804c2cfb8ecee0 Mon Sep 17 00:00:00 2001
From: ridiculousfish <corydoras@ridiculousfish.com>
Date: Tue, 29 May 2018 21:11:50 -0700
Subject: [PATCH] Add string split0

This adds a new string subcommand, split0, which splits its input on
zero (NUL) bytes. split0's output is explicitly separated, so when it is
used in a command substitution its elements are not further split on
newlines.
---
 src/builtin_string.cpp | 75 +++++++++++++++++++++++-------------------
 src/exec.cpp           | 57 ++++++++++++++++++--------------
 src/io.h               |  7 ++--
 tests/string.err       |  6 ++++
 tests/string.in        | 18 ++++++++++
 tests/string.out       | 13 ++++++++
 6 files changed, 115 insertions(+), 61 deletions(-)

diff --git a/src/builtin_string.cpp b/src/builtin_string.cpp
index 4d5eccbac..113738e61 100644
--- a/src/builtin_string.cpp
+++ b/src/builtin_string.cpp
@@ -75,25 +75,29 @@ class arg_iterator_t {
     int argidx_;
     // If not using argv, a string to store bytes that have been read but not yet returned.
     std::string buffer_;
+    // If set, when reading from a stream, split on zeros instead of newlines.
+    const bool split0_;
     // Backing storage for the next() string.
     wcstring storage_;
     const io_streams_t &streams_;
 
-    /// \return the next argument from stdin
-    const wchar_t *get_arg_stdin() {
+    /// Reads the next argument from stdin, returning true if an argument was produced and false if
+    /// not. On true, the string is stored in storage_.
+    bool get_arg_stdin() {
         assert(string_args_from_stdin(streams_) && "should not be reading from stdin");
-        // Read in chunks from fd until buffer has a line.
+        // Read chunks from the fd until buffer_ holds a separator (NUL if split0_, else newline).
+        const char sep = split0_ ? '\0' : '\n';
         size_t pos;
-        while ((pos = buffer_.find('\n')) == std::string::npos) {
+        while ((pos = buffer_.find(sep)) == std::string::npos) {
             char buf[STRING_CHUNK_SIZE];
             long n = read_blocked(streams_.stdin_fd, buf, STRING_CHUNK_SIZE);
             if (n == 0) {
                 // If we still have buffer contents, flush them,
-                // in case there was no trailing '\n'.
-                if (buffer_.empty()) return NULL;
+                // in case there was no trailing sep.
+                if (buffer_.empty()) return false;
                 storage_ = str2wcstring(buffer_);
                 buffer_.clear();
-                return storage_.c_str();
+                return true;
             }
             if (n == -1) {
                 // Some error happened. We can't do anything about it,
@@ -101,20 +105,21 @@ class arg_iterator_t {
                 // (read_blocked already retries for EAGAIN and EINTR)
                 storage_ = str2wcstring(buffer_);
                 buffer_.clear();
-                return NULL;
+                return false;
             }
             buffer_.append(buf, n);
         }
 
-        // Split the buffer on the '\n' and return the first part.
+        // Split the buffer on the sep and store the first part in storage_.
         storage_ = str2wcstring(buffer_, pos);
         buffer_.erase(0, pos + 1);
-        return storage_.c_str();
+        return true;
     }
 
    public:
-    arg_iterator_t(const wchar_t *const *argv, int argidx, const io_streams_t &streams)
-        : argv_(argv), argidx_(argidx), streams_(streams) {}
+    arg_iterator_t(const wchar_t *const *argv, int argidx, const io_streams_t &streams,
+                   bool split0 = false)
+        : argv_(argv), argidx_(argidx), split0_(split0), streams_(streams) {}
 
     const wcstring *nextstr() {
         if (string_args_from_stdin(streams_)) {
@@ -1037,7 +1042,8 @@ static int string_replace(parser_t &parser, io_streams_t &streams, int argc, wch
     return replacer->replace_count() > 0 ? STATUS_CMD_OK : STATUS_CMD_ERROR;
 }
 
-static int string_split(parser_t &parser, io_streams_t &streams, int argc, wchar_t **argv) {
+static int string_split_maybe0(parser_t &parser, io_streams_t &streams, int argc, wchar_t **argv,
+                               bool is_split0) {
     options_t opts;
     opts.quiet_valid = true;
     opts.right_valid = true;
@@ -1045,14 +1051,14 @@ static int string_split(parser_t &parser, io_streams_t &streams, int argc, wchar
     opts.max = LONG_MAX;
     opts.no_empty_valid = true;
     int optind;
-    int retval = parse_opts(&opts, &optind, 1, argc, argv, parser, streams);
+    int retval = parse_opts(&opts, &optind, is_split0 ? 0 : 1, argc, argv, parser, streams);
     if (retval != STATUS_CMD_OK) return retval;
 
-    const wcstring sep(opts.arg1);
+    const wcstring sep = is_split0 ? wcstring(1, L'\0') : wcstring(opts.arg1);
 
     wcstring_list_t splits;
     size_t arg_count = 0;
-    arg_iterator_t aiter(argv, optind, streams);
+    arg_iterator_t aiter(argv, optind, streams, is_split0);
     while (const wcstring *arg = aiter.nextstr()) {
         if (opts.right) {
             split_about(arg->rbegin(), arg->rend(), sep.rbegin(), sep.rend(), &splits, opts.max, opts.no_empty);
@@ -1070,15 +1076,24 @@ static int string_split(parser_t &parser, io_streams_t &streams, int argc, wchar
         std::reverse(splits.begin(), splits.end());
     }
 
+    const size_t split_count = splits.size();
     if (!opts.quiet) {
-        for (wcstring_list_t::const_iterator si = splits.begin(); si != splits.end(); ++si) {
-            streams.out.append(*si);
-            streams.out.append(L'\n');
+        auto &buff = streams.out.buffer();
+        for (const wcstring &split : splits) {
+            buff.append(split, separation_type_t::explicitly);
         }
     }
 
     // We split something if we have more split values than args.
-    return splits.size() > arg_count ? STATUS_CMD_OK : STATUS_CMD_ERROR;
+    return split_count > arg_count ? STATUS_CMD_OK : STATUS_CMD_ERROR;
+}
+
+static int string_split(parser_t &parser, io_streams_t &streams, int argc, wchar_t **argv) {
+    return string_split_maybe0(parser, streams, argc, argv, false /* is_split0 */);
+}
+
+static int string_split0(parser_t &parser, io_streams_t &streams, int argc, wchar_t **argv) {
+    return string_split_maybe0(parser, streams, argc, argv, true /* is_split0 */);
 }
 
 // Helper function to abstract the repeat logic from string_repeat
@@ -1256,19 +1271,13 @@ static const struct string_subcommand {
                    wchar_t **argv);                       //!OCLINT(unused param)
 }
 
-string_subcommands[] = {{L"escape", &string_escape},
-                        {L"join", &string_join},
-                        {L"length", &string_length},
-                        {L"match", &string_match},
-                        {L"replace", &string_replace},
-                        {L"split", &string_split},
-                        {L"sub", &string_sub},
-                        {L"trim", &string_trim},
-                        {L"lower", &string_lower},
-                        {L"upper", &string_upper},
-                        {L"repeat", &string_repeat},
-                        {L"unescape", &string_unescape},
-                        {NULL, NULL}};
+string_subcommands[] = {{L"escape", &string_escape},     {L"join", &string_join},
+                        {L"length", &string_length},     {L"match", &string_match},
+                        {L"replace", &string_replace},   {L"split", &string_split},
+                        {L"split0", &string_split0},     {L"sub", &string_sub},
+                        {L"trim", &string_trim},         {L"lower", &string_lower},
+                        {L"upper", &string_upper},       {L"repeat", &string_repeat},
+                        {L"unescape", &string_unescape}, {NULL, NULL}};
 
 /// The string builtin, for manipulating strings.
 int builtin_string(parser_t &parser, io_streams_t &streams, wchar_t **argv) {
diff --git a/src/exec.cpp b/src/exec.cpp
index b9e56959e..25dc0f7fa 100644
--- a/src/exec.cpp
+++ b/src/exec.cpp
@@ -1204,34 +1204,41 @@ static int exec_subshell_internal(const wcstring &cmd, wcstring_list_t *lst, boo
     if (lst == NULL || io_buffer.get() == NULL) {
         return subcommand_status;
     }
+    // Walk over all the elements.
+    for (const auto &elem : io_buffer->buffer().elements()) {
+        if (elem.is_explicitly_separated()) {
+            // Just append this one.
+            lst->push_back(str2wcstring(elem.contents));
+            continue;
+        }
 
-    const std::string buffer_contents = io_buffer->buffer().newline_serialized();
-    const char *begin = buffer_contents.data();
-    const char *end = begin + buffer_contents.size();
-    if (split_output) {
-        const char *cursor = begin;
-        while (cursor < end) {
-            // Look for the next separator.
-            const char *stop = (const char *)memchr(cursor, '\n', end - cursor);
-            const bool hit_separator = (stop != NULL);
-            if (!hit_separator) {
-                // If it's not found, just use the end.
-                stop = end;
+        // Not explicitly separated, so split it ourselves (on newlines, if split_output is set).
+        assert(!elem.is_explicitly_separated() && "should not be explicitly separated");
+        const char *begin = elem.contents.data();
+        const char *end = begin + elem.contents.size();
+        if (split_output) {
+            const char *cursor = begin;
+            while (cursor < end) {
+                // Look for the next separator.
+                const char *stop = (const char *)memchr(cursor, '\n', end - cursor);
+                const bool hit_separator = (stop != NULL);
+                if (!hit_separator) {
+                    // If it's not found, just use the end.
+                    stop = end;
+                }
+                // Stop now points at the first character we do not want to copy.
+                lst->push_back(str2wcstring(cursor, stop - cursor));
+
+                // If we hit a separator, skip over it; otherwise we're at the end.
+                cursor = stop + (hit_separator ? 1 : 0);
             }
-            // Stop now points at the first character we do not want to copy.
-            const wcstring wc = str2wcstring(cursor, stop - cursor);
-            lst->push_back(wc);
-
-            // If we hit a separator, skip over it; otherwise we're at the end.
-            cursor = stop + (hit_separator ? 1 : 0);
+        } else {
+            // We're not splitting output, but we still want to trim off a trailing newline.
+            if (end != begin && end[-1] == '\n') {
+                --end;
+            }
+            lst->push_back(str2wcstring(begin, end - begin));
         }
-    } else {
-        // We're not splitting output, but we still want to trim off a trailing newline.
-        if (end != begin && end[-1] == '\n') {
-            --end;
-        }
-        const wcstring wc = str2wcstring(begin, end - begin);
-        lst->push_back(wc);
     }
 
     return subcommand_status;
diff --git a/src/io.h b/src/io.h
index ecc27be43..93ccab718 100644
--- a/src/io.h
+++ b/src/io.h
@@ -36,6 +36,7 @@ enum class separation_type_t {
 /// others which must be separated further by the user (e.g. via IFS).
 template <typename StringType>
 class separated_buffer_t {
+   public:
     struct element_t {
         StringType contents;
         separation_type_t separation;
@@ -46,6 +47,7 @@ class separated_buffer_t {
         bool is_explicitly_separated() const { return separation == separation_type_t::explicitly; }
     };
 
+   private:
     /// Limit on how much data we'll buffer. Zero means no limit.
     size_t buffer_limit_;
 
@@ -236,9 +238,6 @@ class io_buffer_t : public io_pipe_t {
     /// Access the underlying buffer.
     const separated_buffer_t<std::string> &buffer() const { return buffer_; }
 
-    /// Access the underlying buffer.
-    separated_buffer_t<std::string> &buffer() { return buffer_; }
-
     /// Function to append to the buffer.
     void append(const char *ptr, size_t count) { buffer_.append(ptr, ptr + count); }
 
@@ -301,6 +300,8 @@ class output_stream_t {
 
     void append(const wcstring &s) { buffer_.append(s.begin(), s.end()); }
 
+    separated_buffer_t<wcstring> &buffer() { return buffer_; }
+
     const separated_buffer_t<wcstring> &buffer() const { return buffer_; }
 
     void append(const wchar_t *s) { append(s, wcslen(s)); }
diff --git a/tests/string.err b/tests/string.err
index 98c1702b8..c7cd024fc 100644
--- a/tests/string.err
+++ b/tests/string.err
@@ -294,3 +294,9 @@ string repeat -l fakearg
 
 ####################
 # Check NUL
+
+####################
+# string split0
+
+####################
+# string split0 in functions
diff --git a/tests/string.in b/tests/string.in
index cfef38b68..60dcaaf01 100644
--- a/tests/string.in
+++ b/tests/string.in
@@ -340,4 +340,22 @@ printf 'a\0b' | string replace -r b g | string escape
 # TODO: These do not yet work!
 # printf 'a\0b' | string match '*b' | string escape
 
+logmsg string split0
+count (echo -ne 'abcdefghi' | string split0)
+count (echo -ne 'abc\x00def\x00ghi\x00' | string split0)
+count (echo -ne 'abc\x00def\x00ghi\x00\x00' | string split0)
+count (echo -ne 'abc\x00def\x00ghi' | string split0)
+count (echo -ne 'abc\ndef\x00ghi\x00' | string split0)
+count (echo -ne 'abc\ndef\nghi' | string split0)
+
+logmsg string split0 in functions
+# This function outputs some newline-separated content, and some
+# explicitly separated content.
+function dualsplit
+  echo alpha
+  echo beta
+  echo -ne 'gamma\x00delta' | string split0
+end
+count (dualsplit)
+
 exit 0
diff --git a/tests/string.out b/tests/string.out
index 7ff40deba..bded690e7 100644
--- a/tests/string.out
+++ b/tests/string.out
@@ -433,3 +433,16 @@ d
 a\x00b
 a\x00g
 a\x00g
+
+####################
+# string split0
+1
+3
+4
+3
+2
+1
+
+####################
+# string split0 in functions
+4