Add string split0

This adds a new `string` subcommand, split0, which splits its input on zero (NUL) bytes.
split0 is special in that its output is marked as explicitly separated, so it is
not split again on newlines when captured by a command substitution.
This commit is contained in:
ridiculousfish
2018-05-29 21:11:50 -07:00
parent f998afaa23
commit d34a300818
6 changed files with 115 additions and 61 deletions

View File

@@ -75,25 +75,29 @@ class arg_iterator_t {
int argidx_;
// If not using argv, a string to store bytes that have been read but not yet returned.
std::string buffer_;
// If set, when reading from a stream, split on zeros instead of newlines.
const bool split0_;
// Backing storage for the next() string.
wcstring storage_;
const io_streams_t &streams_;
/// \return the next argument from stdin
const wchar_t *get_arg_stdin() {
/// Reads the next argument from stdin, returning true if an argument was produced and false if
/// not. On true, the string is stored in storage_.
bool get_arg_stdin() {
assert(string_args_from_stdin(streams_) && "should not be reading from stdin");
// Read in chunks from fd until buffer has a line.
// Read in chunks from fd until buffer has a line (or zero if split0_ is set).
const char sep = split0_ ? '\0' : '\n';
size_t pos;
while ((pos = buffer_.find('\n')) == std::string::npos) {
while ((pos = buffer_.find(sep)) == std::string::npos) {
char buf[STRING_CHUNK_SIZE];
long n = read_blocked(streams_.stdin_fd, buf, STRING_CHUNK_SIZE);
if (n == 0) {
// If we still have buffer contents, flush them,
// in case there was no trailing '\n'.
if (buffer_.empty()) return NULL;
// in case there was no trailing sep.
if (buffer_.empty()) return false;
storage_ = str2wcstring(buffer_);
buffer_.clear();
return storage_.c_str();
return true;
}
if (n == -1) {
// Some error happened. We can't do anything about it,
@@ -101,20 +105,21 @@ class arg_iterator_t {
// (read_blocked already retries for EAGAIN and EINTR)
storage_ = str2wcstring(buffer_);
buffer_.clear();
return NULL;
return false;
}
buffer_.append(buf, n);
}
// Split the buffer on the '\n' and return the first part.
// Split the buffer on the sep and return the first part.
storage_ = str2wcstring(buffer_, pos);
buffer_.erase(0, pos + 1);
return storage_.c_str();
return true;
}
public:
arg_iterator_t(const wchar_t *const *argv, int argidx, const io_streams_t &streams)
: argv_(argv), argidx_(argidx), streams_(streams) {}
arg_iterator_t(const wchar_t *const *argv, int argidx, const io_streams_t &streams,
bool split0 = false)
: argv_(argv), argidx_(argidx), split0_(split0), streams_(streams) {}
const wcstring *nextstr() {
if (string_args_from_stdin(streams_)) {
@@ -1037,7 +1042,8 @@ static int string_replace(parser_t &parser, io_streams_t &streams, int argc, wch
return replacer->replace_count() > 0 ? STATUS_CMD_OK : STATUS_CMD_ERROR;
}
static int string_split(parser_t &parser, io_streams_t &streams, int argc, wchar_t **argv) {
static int string_split_maybe0(parser_t &parser, io_streams_t &streams, int argc, wchar_t **argv,
bool is_split0) {
options_t opts;
opts.quiet_valid = true;
opts.right_valid = true;
@@ -1045,14 +1051,14 @@ static int string_split(parser_t &parser, io_streams_t &streams, int argc, wchar
opts.max = LONG_MAX;
opts.no_empty_valid = true;
int optind;
int retval = parse_opts(&opts, &optind, 1, argc, argv, parser, streams);
int retval = parse_opts(&opts, &optind, is_split0 ? 0 : 1, argc, argv, parser, streams);
if (retval != STATUS_CMD_OK) return retval;
const wcstring sep(opts.arg1);
const wcstring sep = is_split0 ? wcstring(1, L'\0') : wcstring(opts.arg1);
wcstring_list_t splits;
size_t arg_count = 0;
arg_iterator_t aiter(argv, optind, streams);
arg_iterator_t aiter(argv, optind, streams, is_split0);
while (const wcstring *arg = aiter.nextstr()) {
if (opts.right) {
split_about(arg->rbegin(), arg->rend(), sep.rbegin(), sep.rend(), &splits, opts.max, opts.no_empty);
@@ -1070,15 +1076,24 @@ static int string_split(parser_t &parser, io_streams_t &streams, int argc, wchar
std::reverse(splits.begin(), splits.end());
}
const size_t split_count = splits.size();
if (!opts.quiet) {
for (wcstring_list_t::const_iterator si = splits.begin(); si != splits.end(); ++si) {
streams.out.append(*si);
streams.out.append(L'\n');
auto &buff = streams.out.buffer();
for (const wcstring &split : splits) {
buff.append(split, separation_type_t::explicitly);
}
}
// We split something if we have more split values than args.
return splits.size() > arg_count ? STATUS_CMD_OK : STATUS_CMD_ERROR;
return split_count > arg_count ? STATUS_CMD_OK : STATUS_CMD_ERROR;
}
/// Entry point for `string split`: delegates to the shared split implementation with split0
/// behavior turned off, so the separator is taken from the command's argument.
static int string_split(parser_t &parser, io_streams_t &streams, int argc, wchar_t **argv) {
    const bool is_split0 = false;
    return string_split_maybe0(parser, streams, argc, argv, is_split0);
}
/// Entry point for `string split0`: delegates to the shared split implementation with split0
/// behavior turned on, so the separator is a single zero character rather than an argument.
static int string_split0(parser_t &parser, io_streams_t &streams, int argc, wchar_t **argv) {
    const bool is_split0 = true;
    return string_split_maybe0(parser, streams, argc, argv, is_split0);
}
// Helper function to abstract the repeat logic from string_repeat
@@ -1256,19 +1271,13 @@ static const struct string_subcommand {
wchar_t **argv); //!OCLINT(unused param)
}
string_subcommands[] = {{L"escape", &string_escape},
{L"join", &string_join},
{L"length", &string_length},
{L"match", &string_match},
{L"replace", &string_replace},
{L"split", &string_split},
{L"sub", &string_sub},
{L"trim", &string_trim},
{L"lower", &string_lower},
{L"upper", &string_upper},
{L"repeat", &string_repeat},
{L"unescape", &string_unescape},
{NULL, NULL}};
string_subcommands[] = {{L"escape", &string_escape}, {L"join", &string_join},
{L"length", &string_length}, {L"match", &string_match},
{L"replace", &string_replace}, {L"split", &string_split},
{L"split0", &string_split0}, {L"sub", &string_sub},
{L"trim", &string_trim}, {L"lower", &string_lower},
{L"upper", &string_upper}, {L"repeat", &string_repeat},
{L"unescape", &string_unescape}, {NULL, NULL}};
/// The string builtin, for manipulating strings.
int builtin_string(parser_t &parser, io_streams_t &streams, wchar_t **argv) {

View File

@@ -1204,34 +1204,41 @@ static int exec_subshell_internal(const wcstring &cmd, wcstring_list_t *lst, boo
if (lst == NULL || io_buffer.get() == NULL) {
return subcommand_status;
}
// Walk over all the elements.
for (const auto &elem : io_buffer->buffer().elements()) {
if (elem.is_explicitly_separated()) {
// Just append this one.
lst->push_back(str2wcstring(elem.contents));
continue;
}
const std::string buffer_contents = io_buffer->buffer().newline_serialized();
const char *begin = buffer_contents.data();
const char *end = begin + buffer_contents.size();
if (split_output) {
const char *cursor = begin;
while (cursor < end) {
// Look for the next separator.
const char *stop = (const char *)memchr(cursor, '\n', end - cursor);
const bool hit_separator = (stop != NULL);
if (!hit_separator) {
// If it's not found, just use the end.
stop = end;
// Not explicitly separated. We have to split it explicitly.
assert(!elem.is_explicitly_separated() && "should not be explicitly separated");
const char *begin = elem.contents.data();
const char *end = begin + elem.contents.size();
if (split_output) {
const char *cursor = begin;
while (cursor < end) {
// Look for the next separator.
const char *stop = (const char *)memchr(cursor, '\n', end - cursor);
const bool hit_separator = (stop != NULL);
if (!hit_separator) {
// If it's not found, just use the end.
stop = end;
}
// Stop now points at the first character we do not want to copy.
lst->push_back(str2wcstring(cursor, stop - cursor));
// If we hit a separator, skip over it; otherwise we're at the end.
cursor = stop + (hit_separator ? 1 : 0);
}
// Stop now points at the first character we do not want to copy.
const wcstring wc = str2wcstring(cursor, stop - cursor);
lst->push_back(wc);
// If we hit a separator, skip over it; otherwise we're at the end.
cursor = stop + (hit_separator ? 1 : 0);
} else {
// We're not splitting output, but we still want to trim off a trailing newline.
if (end != begin && end[-1] == '\n') {
--end;
}
lst->push_back(str2wcstring(begin, end - begin));
}
} else {
// We're not splitting output, but we still want to trim off a trailing newline.
if (end != begin && end[-1] == '\n') {
--end;
}
const wcstring wc = str2wcstring(begin, end - begin);
lst->push_back(wc);
}
return subcommand_status;

View File

@@ -36,6 +36,7 @@ enum class separation_type_t {
/// others which must be separated further by the user (e.g. via IFS).
template <typename StringType>
class separated_buffer_t {
public:
struct element_t {
StringType contents;
separation_type_t separation;
@@ -46,6 +47,7 @@ class separated_buffer_t {
bool is_explicitly_separated() const { return separation == separation_type_t::explicitly; }
};
private:
/// Limit on how much data we'll buffer. Zero means no limit.
size_t buffer_limit_;
@@ -236,9 +238,6 @@ class io_buffer_t : public io_pipe_t {
/// Access the underlying buffer.
const separated_buffer_t<std::string> &buffer() const { return buffer_; }
/// Access the underlying buffer.
separated_buffer_t<std::string> &buffer() { return buffer_; }
/// Function to append to the buffer.
void append(const char *ptr, size_t count) { buffer_.append(ptr, ptr + count); }
@@ -301,6 +300,8 @@ class output_stream_t {
void append(const wcstring &s) { buffer_.append(s.begin(), s.end()); }
separated_buffer_t<wcstring> &buffer() { return buffer_; }
const separated_buffer_t<wcstring> &buffer() const { return buffer_; }
void append(const wchar_t *s) { append(s, wcslen(s)); }

View File

@@ -294,3 +294,9 @@ string repeat -l fakearg
####################
# Check NUL
####################
# string split0
####################
# string split0 in functions

View File

@@ -340,4 +340,22 @@ printf 'a\0b' | string replace -r b g | string escape
# TODO: These do not yet work!
# printf 'a\0b' | string match '*b' | string escape
# Each `count` prints the number of arguments produced by the command substitution;
# the values are compared against the expected-output file.
logmsg string split0
# No zero byte in the input: expected count 1.
count (echo -ne 'abcdefghi' | string split0)
# Three zero-terminated fields: expected count 3.
count (echo -ne 'abc\x00def\x00ghi\x00' | string split0)
# Same as above plus one more trailing zero byte: expected count 4.
count (echo -ne 'abc\x00def\x00ghi\x00\x00' | string split0)
# Last field has no terminating zero byte: expected count 3.
count (echo -ne 'abc\x00def\x00ghi' | string split0)
# The newline inside the first field does not split it: expected count 2.
count (echo -ne 'abc\ndef\x00ghi\x00' | string split0)
# Only newlines, no zero bytes: expected count 1.
count (echo -ne 'abc\ndef\nghi' | string split0)
logmsg string split0 in functions
# This function outputs some newline-separated content, and some
# explicitly separated content.
function dualsplit
echo alpha
echo beta
echo -ne 'gamma\x00delta' | string split0
end
# alpha and beta (newline-separated) plus gamma and delta (from split0): expected count 4.
count (dualsplit)
exit 0

View File

@@ -433,3 +433,16 @@ d
a\x00b
a\x00g
a\x00g
####################
# string split0
1
3
4
3
2
1
####################
# string split0 in functions
4