Add string split0

This adds a new string subcommand, split0, which splits its input on
zero (NUL) bytes. split0 has superpowers: each element it outputs is
explicitly separated, so its output is not further split on newlines
when used in command substitutions.
This commit is contained in:
ridiculousfish
2018-05-29 21:11:50 -07:00
parent f998afaa23
commit d34a300818
6 changed files with 115 additions and 61 deletions

View File

@@ -75,25 +75,29 @@ class arg_iterator_t {
int argidx_; int argidx_;
// If not using argv, a string to store bytes that have been read but not yet returned. // If not using argv, a string to store bytes that have been read but not yet returned.
std::string buffer_; std::string buffer_;
// If set, when reading from a stream, split on zeros instead of newlines.
const bool split0_;
// Backing storage for the next() string. // Backing storage for the next() string.
wcstring storage_; wcstring storage_;
const io_streams_t &streams_; const io_streams_t &streams_;
/// \return the next argument from stdin /// Reads the next argument from stdin, returning true if an argument was produced and false if
const wchar_t *get_arg_stdin() { /// not. On true, the string is stored in storage_.
bool get_arg_stdin() {
assert(string_args_from_stdin(streams_) && "should not be reading from stdin"); assert(string_args_from_stdin(streams_) && "should not be reading from stdin");
// Read in chunks from fd until buffer has a line. // Read in chunks from fd until buffer has a line (or zero if split0_ is set).
const char sep = split0_ ? '\0' : '\n';
size_t pos; size_t pos;
while ((pos = buffer_.find('\n')) == std::string::npos) { while ((pos = buffer_.find(sep)) == std::string::npos) {
char buf[STRING_CHUNK_SIZE]; char buf[STRING_CHUNK_SIZE];
long n = read_blocked(streams_.stdin_fd, buf, STRING_CHUNK_SIZE); long n = read_blocked(streams_.stdin_fd, buf, STRING_CHUNK_SIZE);
if (n == 0) { if (n == 0) {
// If we still have buffer contents, flush them, // If we still have buffer contents, flush them,
// in case there was no trailing '\n'. // in case there was no trailing sep.
if (buffer_.empty()) return NULL; if (buffer_.empty()) return false;
storage_ = str2wcstring(buffer_); storage_ = str2wcstring(buffer_);
buffer_.clear(); buffer_.clear();
return storage_.c_str(); return true;
} }
if (n == -1) { if (n == -1) {
// Some error happened. We can't do anything about it, // Some error happened. We can't do anything about it,
@@ -101,20 +105,21 @@ class arg_iterator_t {
// (read_blocked already retries for EAGAIN and EINTR) // (read_blocked already retries for EAGAIN and EINTR)
storage_ = str2wcstring(buffer_); storage_ = str2wcstring(buffer_);
buffer_.clear(); buffer_.clear();
return NULL; return false;
} }
buffer_.append(buf, n); buffer_.append(buf, n);
} }
// Split the buffer on the '\n' and return the first part. // Split the buffer on the sep and return the first part.
storage_ = str2wcstring(buffer_, pos); storage_ = str2wcstring(buffer_, pos);
buffer_.erase(0, pos + 1); buffer_.erase(0, pos + 1);
return storage_.c_str(); return true;
} }
public: public:
arg_iterator_t(const wchar_t *const *argv, int argidx, const io_streams_t &streams) arg_iterator_t(const wchar_t *const *argv, int argidx, const io_streams_t &streams,
: argv_(argv), argidx_(argidx), streams_(streams) {} bool split0 = false)
: argv_(argv), argidx_(argidx), split0_(split0), streams_(streams) {}
const wcstring *nextstr() { const wcstring *nextstr() {
if (string_args_from_stdin(streams_)) { if (string_args_from_stdin(streams_)) {
@@ -1037,7 +1042,8 @@ static int string_replace(parser_t &parser, io_streams_t &streams, int argc, wch
return replacer->replace_count() > 0 ? STATUS_CMD_OK : STATUS_CMD_ERROR; return replacer->replace_count() > 0 ? STATUS_CMD_OK : STATUS_CMD_ERROR;
} }
static int string_split(parser_t &parser, io_streams_t &streams, int argc, wchar_t **argv) { static int string_split_maybe0(parser_t &parser, io_streams_t &streams, int argc, wchar_t **argv,
bool is_split0) {
options_t opts; options_t opts;
opts.quiet_valid = true; opts.quiet_valid = true;
opts.right_valid = true; opts.right_valid = true;
@@ -1045,14 +1051,14 @@ static int string_split(parser_t &parser, io_streams_t &streams, int argc, wchar
opts.max = LONG_MAX; opts.max = LONG_MAX;
opts.no_empty_valid = true; opts.no_empty_valid = true;
int optind; int optind;
int retval = parse_opts(&opts, &optind, 1, argc, argv, parser, streams); int retval = parse_opts(&opts, &optind, is_split0 ? 0 : 1, argc, argv, parser, streams);
if (retval != STATUS_CMD_OK) return retval; if (retval != STATUS_CMD_OK) return retval;
const wcstring sep(opts.arg1); const wcstring sep = is_split0 ? wcstring(1, L'\0') : wcstring(opts.arg1);
wcstring_list_t splits; wcstring_list_t splits;
size_t arg_count = 0; size_t arg_count = 0;
arg_iterator_t aiter(argv, optind, streams); arg_iterator_t aiter(argv, optind, streams, is_split0);
while (const wcstring *arg = aiter.nextstr()) { while (const wcstring *arg = aiter.nextstr()) {
if (opts.right) { if (opts.right) {
split_about(arg->rbegin(), arg->rend(), sep.rbegin(), sep.rend(), &splits, opts.max, opts.no_empty); split_about(arg->rbegin(), arg->rend(), sep.rbegin(), sep.rend(), &splits, opts.max, opts.no_empty);
@@ -1070,15 +1076,24 @@ static int string_split(parser_t &parser, io_streams_t &streams, int argc, wchar
std::reverse(splits.begin(), splits.end()); std::reverse(splits.begin(), splits.end());
} }
const size_t split_count = splits.size();
if (!opts.quiet) { if (!opts.quiet) {
for (wcstring_list_t::const_iterator si = splits.begin(); si != splits.end(); ++si) { auto &buff = streams.out.buffer();
streams.out.append(*si); for (const wcstring &split : splits) {
streams.out.append(L'\n'); buff.append(split, separation_type_t::explicitly);
} }
} }
// We split something if we have more split values than args. // We split something if we have more split values than args.
return splits.size() > arg_count ? STATUS_CMD_OK : STATUS_CMD_ERROR; return split_count > arg_count ? STATUS_CMD_OK : STATUS_CMD_ERROR;
}
/// Handler for the `split` subcommand. Forwards all arguments unchanged to string_split_maybe0
/// with is_split0 set to false, so the separator is taken from the parsed options rather than
/// being a NUL character. Returns the status produced by string_split_maybe0.
static int string_split(parser_t &parser, io_streams_t &streams, int argc, wchar_t **argv) {
return string_split_maybe0(parser, streams, argc, argv, false /* is_split0 */);
}
/// Handler for the `split0` subcommand. Forwards all arguments unchanged to string_split_maybe0
/// with is_split0 set to true, so the separator is a single NUL character and arguments read
/// from stdin are delimited by zero bytes instead of newlines. Returns the status produced by
/// string_split_maybe0.
static int string_split0(parser_t &parser, io_streams_t &streams, int argc, wchar_t **argv) {
return string_split_maybe0(parser, streams, argc, argv, true /* is_split0 */);
} }
// Helper function to abstract the repeat logic from string_repeat // Helper function to abstract the repeat logic from string_repeat
@@ -1256,19 +1271,13 @@ static const struct string_subcommand {
wchar_t **argv); //!OCLINT(unused param) wchar_t **argv); //!OCLINT(unused param)
} }
string_subcommands[] = {{L"escape", &string_escape}, string_subcommands[] = {{L"escape", &string_escape}, {L"join", &string_join},
{L"join", &string_join}, {L"length", &string_length}, {L"match", &string_match},
{L"length", &string_length}, {L"replace", &string_replace}, {L"split", &string_split},
{L"match", &string_match}, {L"split0", &string_split0}, {L"sub", &string_sub},
{L"replace", &string_replace}, {L"trim", &string_trim}, {L"lower", &string_lower},
{L"split", &string_split}, {L"upper", &string_upper}, {L"repeat", &string_repeat},
{L"sub", &string_sub}, {L"unescape", &string_unescape}, {NULL, NULL}};
{L"trim", &string_trim},
{L"lower", &string_lower},
{L"upper", &string_upper},
{L"repeat", &string_repeat},
{L"unescape", &string_unescape},
{NULL, NULL}};
/// The string builtin, for manipulating strings. /// The string builtin, for manipulating strings.
int builtin_string(parser_t &parser, io_streams_t &streams, wchar_t **argv) { int builtin_string(parser_t &parser, io_streams_t &streams, wchar_t **argv) {

View File

@@ -1204,34 +1204,41 @@ static int exec_subshell_internal(const wcstring &cmd, wcstring_list_t *lst, boo
if (lst == NULL || io_buffer.get() == NULL) { if (lst == NULL || io_buffer.get() == NULL) {
return subcommand_status; return subcommand_status;
} }
// Walk over all the elements.
for (const auto &elem : io_buffer->buffer().elements()) {
if (elem.is_explicitly_separated()) {
// Just append this one.
lst->push_back(str2wcstring(elem.contents));
continue;
}
const std::string buffer_contents = io_buffer->buffer().newline_serialized(); // Not explicitly separated. We have to split it explicitly.
const char *begin = buffer_contents.data(); assert(!elem.is_explicitly_separated() && "should not be explicitly separated");
const char *end = begin + buffer_contents.size(); const char *begin = elem.contents.data();
if (split_output) { const char *end = begin + elem.contents.size();
const char *cursor = begin; if (split_output) {
while (cursor < end) { const char *cursor = begin;
// Look for the next separator. while (cursor < end) {
const char *stop = (const char *)memchr(cursor, '\n', end - cursor); // Look for the next separator.
const bool hit_separator = (stop != NULL); const char *stop = (const char *)memchr(cursor, '\n', end - cursor);
if (!hit_separator) { const bool hit_separator = (stop != NULL);
// If it's not found, just use the end. if (!hit_separator) {
stop = end; // If it's not found, just use the end.
stop = end;
}
// Stop now points at the first character we do not want to copy.
lst->push_back(str2wcstring(cursor, stop - cursor));
// If we hit a separator, skip over it; otherwise we're at the end.
cursor = stop + (hit_separator ? 1 : 0);
} }
// Stop now points at the first character we do not want to copy. } else {
const wcstring wc = str2wcstring(cursor, stop - cursor); // We're not splitting output, but we still want to trim off a trailing newline.
lst->push_back(wc); if (end != begin && end[-1] == '\n') {
--end;
// If we hit a separator, skip over it; otherwise we're at the end. }
cursor = stop + (hit_separator ? 1 : 0); lst->push_back(str2wcstring(begin, end - begin));
} }
} else {
// We're not splitting output, but we still want to trim off a trailing newline.
if (end != begin && end[-1] == '\n') {
--end;
}
const wcstring wc = str2wcstring(begin, end - begin);
lst->push_back(wc);
} }
return subcommand_status; return subcommand_status;

View File

@@ -36,6 +36,7 @@ enum class separation_type_t {
/// others which must be separated further by the user (e.g. via IFS). /// others which must be separated further by the user (e.g. via IFS).
template <typename StringType> template <typename StringType>
class separated_buffer_t { class separated_buffer_t {
public:
struct element_t { struct element_t {
StringType contents; StringType contents;
separation_type_t separation; separation_type_t separation;
@@ -46,6 +47,7 @@ class separated_buffer_t {
bool is_explicitly_separated() const { return separation == separation_type_t::explicitly; } bool is_explicitly_separated() const { return separation == separation_type_t::explicitly; }
}; };
private:
/// Limit on how much data we'll buffer. Zero means no limit. /// Limit on how much data we'll buffer. Zero means no limit.
size_t buffer_limit_; size_t buffer_limit_;
@@ -236,9 +238,6 @@ class io_buffer_t : public io_pipe_t {
/// Access the underlying buffer. /// Access the underlying buffer.
const separated_buffer_t<std::string> &buffer() const { return buffer_; } const separated_buffer_t<std::string> &buffer() const { return buffer_; }
/// Access the underlying buffer.
separated_buffer_t<std::string> &buffer() { return buffer_; }
/// Function to append to the buffer. /// Function to append to the buffer.
void append(const char *ptr, size_t count) { buffer_.append(ptr, ptr + count); } void append(const char *ptr, size_t count) { buffer_.append(ptr, ptr + count); }
@@ -301,6 +300,8 @@ class output_stream_t {
void append(const wcstring &s) { buffer_.append(s.begin(), s.end()); } void append(const wcstring &s) { buffer_.append(s.begin(), s.end()); }
separated_buffer_t<wcstring> &buffer() { return buffer_; }
const separated_buffer_t<wcstring> &buffer() const { return buffer_; } const separated_buffer_t<wcstring> &buffer() const { return buffer_; }
void append(const wchar_t *s) { append(s, wcslen(s)); } void append(const wchar_t *s) { append(s, wcslen(s)); }

View File

@@ -294,3 +294,9 @@ string repeat -l fakearg
#################### ####################
# Check NUL # Check NUL
####################
# string split0
####################
# string split0 in functions

View File

@@ -340,4 +340,22 @@ printf 'a\0b' | string replace -r b g | string escape
# TODO: These do not yet work! # TODO: These do not yet work!
# printf 'a\0b' | string match '*b' | string escape # printf 'a\0b' | string match '*b' | string escape
logmsg string split0
count (echo -ne 'abcdefghi' | string split0)
count (echo -ne 'abc\x00def\x00ghi\x00' | string split0)
count (echo -ne 'abc\x00def\x00ghi\x00\x00' | string split0)
count (echo -ne 'abc\x00def\x00ghi' | string split0)
count (echo -ne 'abc\ndef\x00ghi\x00' | string split0)
count (echo -ne 'abc\ndef\nghi' | string split0)
logmsg string split0 in functions
# This function outputs some newline-separated content, and some
# explicitly separated content.
function dualsplit
echo alpha
echo beta
echo -ne 'gamma\x00delta' | string split0
end
count (dualsplit)
exit 0 exit 0

View File

@@ -433,3 +433,16 @@ d
a\x00b a\x00b
a\x00g a\x00g
a\x00g a\x00g
####################
# string split0
1
3
4
3
2
1
####################
# string split0 in functions
4