From d34a300818e5d37c2b57cc9931804c2cfb8ecee0 Mon Sep 17 00:00:00 2001 From: ridiculousfish Date: Tue, 29 May 2018 21:11:50 -0700 Subject: [PATCH] Add string split0 This adds a new string command split0, which splits on zero bytes. split0 has superpowers because its output is not further split on newlines when used in command substitutions. --- src/builtin_string.cpp | 75 +++++++++++++++++++++++------------------- src/exec.cpp | 57 ++++++++++++++++++-------------- src/io.h | 7 ++-- tests/string.err | 6 ++++ tests/string.in | 18 ++++++++++ tests/string.out | 13 ++++++++ 6 files changed, 115 insertions(+), 61 deletions(-) diff --git a/src/builtin_string.cpp b/src/builtin_string.cpp index 4d5eccbac..113738e61 100644 --- a/src/builtin_string.cpp +++ b/src/builtin_string.cpp @@ -75,25 +75,29 @@ class arg_iterator_t { int argidx_; // If not using argv, a string to store bytes that have been read but not yet returned. std::string buffer_; + // If set, when reading from a stream, split on zeros instead of newlines. + const bool split0_; // Backing storage for the next() string. wcstring storage_; const io_streams_t &streams_; - /// \return the next argument from stdin - const wchar_t *get_arg_stdin() { + /// Reads the next argument from stdin, returning true if an argument was produced and false if + /// not. On true, the string is stored in storage_. + bool get_arg_stdin() { assert(string_args_from_stdin(streams_) && "should not be reading from stdin"); - // Read in chunks from fd until buffer has a line. + // Read in chunks from fd until buffer has a line (or zero if split0_ is set). + const char sep = split0_ ? '\0' : '\n'; size_t pos; - while ((pos = buffer_.find('\n')) == std::string::npos) { + while ((pos = buffer_.find(sep)) == std::string::npos) { char buf[STRING_CHUNK_SIZE]; long n = read_blocked(streams_.stdin_fd, buf, STRING_CHUNK_SIZE); if (n == 0) { // If we still have buffer contents, flush them, - // in case there was no trailing '\n'. - if (buffer_.empty()) return NULL; + // in case there was no trailing sep. + if (buffer_.empty()) return false; storage_ = str2wcstring(buffer_); buffer_.clear(); - return storage_.c_str(); + return true; } if (n == -1) { // Some error happened. We can't do anything about it, @@ -101,20 +105,21 @@ class arg_iterator_t { // (read_blocked already retries for EAGAIN and EINTR) storage_ = str2wcstring(buffer_); buffer_.clear(); - return NULL; + return false; } buffer_.append(buf, n); } - // Split the buffer on the '\n' and return the first part. + // Split the buffer on the sep and return the first part. storage_ = str2wcstring(buffer_, pos); buffer_.erase(0, pos + 1); - return storage_.c_str(); + return true; } public: - arg_iterator_t(const wchar_t *const *argv, int argidx, const io_streams_t &streams) - : argv_(argv), argidx_(argidx), streams_(streams) {} + arg_iterator_t(const wchar_t *const *argv, int argidx, const io_streams_t &streams, + bool split0 = false) + : argv_(argv), argidx_(argidx), split0_(split0), streams_(streams) {} const wcstring *nextstr() { if (string_args_from_stdin(streams_)) { @@ -1037,7 +1042,8 @@ static int string_replace(parser_t &parser, io_streams_t &streams, int argc, wch return replacer->replace_count() > 0 ? STATUS_CMD_OK : STATUS_CMD_ERROR; } -static int string_split(parser_t &parser, io_streams_t &streams, int argc, wchar_t **argv) { +static int string_split_maybe0(parser_t &parser, io_streams_t &streams, int argc, wchar_t **argv, + bool is_split0) { options_t opts; opts.quiet_valid = true; opts.right_valid = true; @@ -1045,14 +1051,14 @@ static int string_split(parser_t &parser, io_streams_t &streams, int argc, wchar opts.max = LONG_MAX; opts.no_empty_valid = true; int optind; - int retval = parse_opts(&opts, &optind, 1, argc, argv, parser, streams); + int retval = parse_opts(&opts, &optind, is_split0 ? 0 : 1, argc, argv, parser, streams); if (retval != STATUS_CMD_OK) return retval; - const wcstring sep(opts.arg1); + const wcstring sep = is_split0 ? wcstring(1, L'\0') : wcstring(opts.arg1); wcstring_list_t splits; size_t arg_count = 0; - arg_iterator_t aiter(argv, optind, streams); + arg_iterator_t aiter(argv, optind, streams, is_split0); while (const wcstring *arg = aiter.nextstr()) { if (opts.right) { split_about(arg->rbegin(), arg->rend(), sep.rbegin(), sep.rend(), &splits, opts.max, opts.no_empty); @@ -1070,15 +1076,24 @@ static int string_split(parser_t &parser, io_streams_t &streams, int argc, wchar std::reverse(splits.begin(), splits.end()); } + const size_t split_count = splits.size(); if (!opts.quiet) { - for (wcstring_list_t::const_iterator si = splits.begin(); si != splits.end(); ++si) { - streams.out.append(*si); - streams.out.append(L'\n'); + auto &buff = streams.out.buffer(); + for (const wcstring &split : splits) { + buff.append(split, separation_type_t::explicitly); } } // We split something if we have more split values than args. - return splits.size() > arg_count ? STATUS_CMD_OK : STATUS_CMD_ERROR; + return split_count > arg_count ? STATUS_CMD_OK : STATUS_CMD_ERROR; +} + +static int string_split(parser_t &parser, io_streams_t &streams, int argc, wchar_t **argv) { + return string_split_maybe0(parser, streams, argc, argv, false /* is_split0 */); +} + +static int string_split0(parser_t &parser, io_streams_t &streams, int argc, wchar_t **argv) { + return string_split_maybe0(parser, streams, argc, argv, true /* is_split0 */); } // Helper function to abstract the repeat logic from string_repeat @@ -1256,19 +1271,13 @@ static const struct string_subcommand { wchar_t **argv); //!OCLINT(unused param) } -string_subcommands[] = {{L"escape", &string_escape}, - {L"join", &string_join}, - {L"length", &string_length}, - {L"match", &string_match}, - {L"replace", &string_replace}, - {L"split", &string_split}, - {L"sub", &string_sub}, - {L"trim", &string_trim}, - {L"lower", &string_lower}, - {L"upper", &string_upper}, - {L"repeat", &string_repeat}, - {L"unescape", &string_unescape}, - {NULL, NULL}}; +string_subcommands[] = {{L"escape", &string_escape}, {L"join", &string_join}, + {L"length", &string_length}, {L"match", &string_match}, + {L"replace", &string_replace}, {L"split", &string_split}, + {L"split0", &string_split0}, {L"sub", &string_sub}, + {L"trim", &string_trim}, {L"lower", &string_lower}, + {L"upper", &string_upper}, {L"repeat", &string_repeat}, + {L"unescape", &string_unescape}, {NULL, NULL}}; /// The string builtin, for manipulating strings. int builtin_string(parser_t &parser, io_streams_t &streams, wchar_t **argv) { diff --git a/src/exec.cpp b/src/exec.cpp index b9e56959e..25dc0f7fa 100644 --- a/src/exec.cpp +++ b/src/exec.cpp @@ -1204,34 +1204,41 @@ static int exec_subshell_internal(const wcstring &cmd, wcstring_list_t *lst, boo if (lst == NULL || io_buffer.get() == NULL) { return subcommand_status; } + // Walk over all the elements. + for (const auto &elem : io_buffer->buffer().elements()) { + if (elem.is_explicitly_separated()) { + // Just append this one. + lst->push_back(str2wcstring(elem.contents)); + continue; + } - const std::string buffer_contents = io_buffer->buffer().newline_serialized(); - const char *begin = buffer_contents.data(); - const char *end = begin + buffer_contents.size(); - if (split_output) { - const char *cursor = begin; - while (cursor < end) { - // Look for the next separator. - const char *stop = (const char *)memchr(cursor, '\n', end - cursor); - const bool hit_separator = (stop != NULL); - if (!hit_separator) { - // If it's not found, just use the end. - stop = end; + // Not explicitly separated. We have to split it explicitly. + assert(!elem.is_explicitly_separated() && "should not be explicitly separated"); + const char *begin = elem.contents.data(); + const char *end = begin + elem.contents.size(); + if (split_output) { + const char *cursor = begin; + while (cursor < end) { + // Look for the next separator. + const char *stop = (const char *)memchr(cursor, '\n', end - cursor); + const bool hit_separator = (stop != NULL); + if (!hit_separator) { + // If it's not found, just use the end. + stop = end; + } + // Stop now points at the first character we do not want to copy. + lst->push_back(str2wcstring(cursor, stop - cursor)); + + // If we hit a separator, skip over it; otherwise we're at the end. + cursor = stop + (hit_separator ? 1 : 0); } - // Stop now points at the first character we do not want to copy. - const wcstring wc = str2wcstring(cursor, stop - cursor); - lst->push_back(wc); - - // If we hit a separator, skip over it; otherwise we're at the end. - cursor = stop + (hit_separator ? 1 : 0); + } else { + // We're not splitting output, but we still want to trim off a trailing newline. + if (end != begin && end[-1] == '\n') { + --end; + } + lst->push_back(str2wcstring(begin, end - begin)); } - } else { - // We're not splitting output, but we still want to trim off a trailing newline. - if (end != begin && end[-1] == '\n') { - --end; - } - const wcstring wc = str2wcstring(begin, end - begin); - lst->push_back(wc); } return subcommand_status; diff --git a/src/io.h b/src/io.h index ecc27be43..93ccab718 100644 --- a/src/io.h +++ b/src/io.h @@ -36,6 +36,7 @@ enum class separation_type_t { /// others which must be separated further by the user (e.g. via IFS). template class separated_buffer_t { + public: struct element_t { StringType contents; separation_type_t separation; @@ -46,6 +47,7 @@ class separated_buffer_t { bool is_explicitly_separated() const { return separation == separation_type_t::explicitly; } }; + private: /// Limit on how much data we'll buffer. Zero means no limit. size_t buffer_limit_; @@ -236,9 +238,6 @@ class io_buffer_t : public io_pipe_t { /// Access the underlying buffer. const separated_buffer_t &buffer() const { return buffer_; } - /// Access the underlying buffer. - separated_buffer_t &buffer() { return buffer_; } - /// Function to append to the buffer. void append(const char *ptr, size_t count) { buffer_.append(ptr, ptr + count); } @@ -301,6 +300,8 @@ class output_stream_t { void append(const wcstring &s) { buffer_.append(s.begin(), s.end()); } + separated_buffer_t &buffer() { return buffer_; } + const separated_buffer_t &buffer() const { return buffer_; } void append(const wchar_t *s) { append(s, wcslen(s)); } diff --git a/tests/string.err b/tests/string.err index 98c1702b8..c7cd024fc 100644 --- a/tests/string.err +++ b/tests/string.err @@ -294,3 +294,9 @@ string repeat -l fakearg #################### # Check NUL + +#################### +# string split0 + +#################### +# string split0 in functions diff --git a/tests/string.in b/tests/string.in index cfef38b68..60dcaaf01 100644 --- a/tests/string.in +++ b/tests/string.in @@ -340,4 +340,22 @@ printf 'a\0b' | string replace -r b g | string escape # TODO: These do not yet work! # printf 'a\0b' | string match '*b' | string escape +logmsg string split0 +count (echo -ne 'abcdefghi' | string split0) +count (echo -ne 'abc\x00def\x00ghi\x00' | string split0) +count (echo -ne 'abc\x00def\x00ghi\x00\x00' | string split0) +count (echo -ne 'abc\x00def\x00ghi' | string split0) +count (echo -ne 'abc\ndef\x00ghi\x00' | string split0) +count (echo -ne 'abc\ndef\nghi' | string split0) + +logmsg string split0 in functions +# This function outputs some newline-separated content, and some +# explicitly separated content. +function dualsplit + echo alpha + echo beta + echo -ne 'gamma\x00delta' | string split0 +end +count (dualsplit) + exit 0 diff --git a/tests/string.out b/tests/string.out index 7ff40deba..bded690e7 100644 --- a/tests/string.out +++ b/tests/string.out @@ -433,3 +433,16 @@ d a\x00b a\x00g a\x00g + +#################### +# string split0 +1 +3 +4 +3 +2 +1 + +#################### +# string split0 in functions +4