diff --git a/src/builtin_read.cpp b/src/builtin_read.cpp index f76c2ad60..56e628f1e 100644 --- a/src/builtin_read.cpp +++ b/src/builtin_read.cpp @@ -594,13 +594,8 @@ maybe_t builtin_read(parser_t &parser, io_streams_t &streams, const wchar_t if (!opts.have_delimiter) { // We're using IFS, so tokenize the buffer using each IFS char. This is for backward // compatibility with old versions of fish. - wcstring_list_t tokens; - - for (wcstring_range loc = wcstring_tok(buff, opts.delimiter); - loc.first != wcstring::npos; loc = wcstring_tok(buff, opts.delimiter, loc)) { - tokens.emplace_back(wcstring(buff, loc.first, loc.second)); - } - parser.set_var_and_fire(*var_ptr++, opts.place, tokens); + wcstring_list_t tokens = split_string_tok(buff, opts.delimiter); + parser.set_var_and_fire(*var_ptr++, opts.place, std::move(tokens)); } else { // We're using a delimiter provided by the user so use the `string split` behavior. wcstring_list_t splits; @@ -614,14 +609,15 @@ maybe_t builtin_read(parser_t &parser, io_streams_t &streams, const wchar_t if (!opts.have_delimiter) { // We're using IFS, so tokenize the buffer using each IFS char. This is for backward // compatibility with old versions of fish. - wcstring_range loc = wcstring_range(0, 0); + // Note the final variable gets any remaining text. + wcstring_list_t var_vals = split_string_tok(buff, opts.delimiter, vars_left()); + size_t val_idx = 0; while (vars_left()) { - wcstring substr; - loc = wcstring_tok(buff, (vars_left() > 1) ? opts.delimiter : wcstring(), loc); - if (loc.first != wcstring::npos) { - substr = wcstring(buff, loc.first, loc.second); + wcstring val; + if (val_idx < var_vals.size()) { + val = std::move(var_vals.at(val_idx++)); } - parser.set_var_and_fire(*var_ptr++, opts.place, substr); + parser.set_var_and_fire(*var_ptr++, opts.place, std::move(val)); } } else { // We're using a delimiter provided by the user so use the `string split` behavior. 
diff --git a/src/fish_tests.cpp b/src/fish_tests.cpp index cd45d4902..07463e916 100644 --- a/src/fish_tests.cpp +++ b/src/fish_tests.cpp @@ -5430,31 +5430,26 @@ static void test_highlighting() { vars.remove(L"VARIABLE_IN_COMMAND2", ENV_DEFAULT); } -static void test_wcstring_tok() { - say(L"Testing wcstring_tok"); - wcstring buff = L"hello world"; - wcstring needle = L" \t\n"; - wcstring_range loc = wcstring_tok(buff, needle); - if (loc.first == wcstring::npos || buff.substr(loc.first, loc.second) != L"hello") { - err(L"Wrong results from first wcstring_tok(): {%zu, %zu}", loc.first, loc.second); - } - loc = wcstring_tok(buff, needle, loc); - if (loc.first == wcstring::npos || buff.substr(loc.first, loc.second) != L"world") { - err(L"Wrong results from second wcstring_tok(): {%zu, %zu}", loc.first, loc.second); - } - loc = wcstring_tok(buff, needle, loc); - if (loc.first != wcstring::npos) { - err(L"Wrong results from third wcstring_tok(): {%zu, %zu}", loc.first, loc.second); - } +static void test_split_string_tok() { + say(L"Testing split_string_tok"); + wcstring_list_t splits; + splits = split_string_tok(L" hello \t world", L" \t\n"); + do_test((splits == wcstring_list_t{L"hello", L"world"})); - buff = L"hello world"; - loc = wcstring_tok(buff, needle); - // loc is "hello" again - loc = wcstring_tok(buff, L"", loc); - if (loc.first == wcstring::npos || buff.substr(loc.first, loc.second) != L"world") { - err(L"Wrong results from wcstring_tok with empty needle: {%zu, %zu}", loc.first, - loc.second); - } + splits = split_string_tok(L" stuff ", wcstring(L" "), 0); + do_test((splits == wcstring_list_t{})); + + splits = split_string_tok(L" stuff ", wcstring(L" "), 1); + do_test((splits == wcstring_list_t{L" stuff "})); + + splits = split_string_tok(L" hello \t world andstuff ", L" \t\n", 3); + do_test((splits == wcstring_list_t{L"hello", L"world", L" andstuff "})); + + // NUL chars are OK. 
+ wcstring nullstr = L" hello X world"; + nullstr.at(nullstr.find(L'X')) = L'\0'; + splits = split_string_tok(nullstr, wcstring(L" \0", 2)); + do_test((splits == wcstring_list_t{L"hello", L"world"})); } static void test_wwrite_to_fd() { @@ -6521,7 +6516,7 @@ int main(int argc, char **argv) { env_stack_t::principal().set_pwd_from_getcwd(); if (should_test_function("utility_functions")) test_utility_functions(); - if (should_test_function("wcstring_tok")) test_wcstring_tok(); + if (should_test_function("string_split")) test_split_string_tok(); if (should_test_function("wwrite_to_fd")) test_wwrite_to_fd(); if (should_test_function("env_vars")) test_env_vars(); if (should_test_function("env")) test_env_snapshot(); diff --git a/src/wcstringutil.cpp b/src/wcstringutil.cpp index df246a503..fd7c18721 100644 --- a/src/wcstringutil.cpp +++ b/src/wcstringutil.cpp @@ -10,31 +10,6 @@ #include "common.h" #include "flog.h" -wcstring_range wcstring_tok(wcstring &str, const wcstring &needle, wcstring_range last) { - using size_type = wcstring::size_type; - size_type pos = last.second == wcstring::npos ? 
wcstring::npos : last.first; - if (pos != wcstring::npos && last.second != wcstring::npos) pos += last.second; - if (pos != wcstring::npos && pos != 0) ++pos; - if (pos == wcstring::npos || pos >= str.size()) { - return std::make_pair(wcstring::npos, wcstring::npos); - } - - if (needle.empty()) { - return std::make_pair(pos, wcstring::npos); - } - - pos = str.find_first_not_of(needle, pos); - if (pos == wcstring::npos) return std::make_pair(wcstring::npos, wcstring::npos); - - size_type next_pos = str.find_first_of(needle, pos); - if (next_pos == wcstring::npos) { - return std::make_pair(pos, wcstring::npos); - } - - str[next_pos] = L'\0'; - return std::make_pair(pos, next_pos - pos); -} - wcstring truncate(const wcstring &input, int max_len, ellipsis_type etype) { if (input.size() <= static_cast(max_len)) { return input; @@ -282,6 +257,33 @@ wcstring_list_t split_string(const wcstring &val, wchar_t sep) { return out; } +wcstring_list_t split_string_tok(const wcstring &val, const wcstring &seps, size_t max_results) { + wcstring_list_t out; + size_t end = val.size(); + size_t pos = 0; + while (pos < end && out.size() + 1 < max_results) { + // Skip leading seps. + pos = val.find_first_not_of(seps, pos); + if (pos == wcstring::npos) break; + + // Find next sep. + size_t next_sep = val.find_first_of(seps, pos); + if (next_sep == wcstring::npos) { + next_sep = end; + } + out.emplace_back(val, pos, next_sep - pos); + // Note we skip exactly one sep here. This is because on the last iteration we retain all + // but the first leading separators. This is historical. 
+ pos = next_sep + 1; + } + if (pos < end && max_results > 0) { + assert(out.size() + 1 == max_results && "Should have split the max"); + out.emplace_back(val, pos); + } + assert(out.size() <= max_results && "Got too many results"); + return out; +} + wcstring join_strings(const wcstring_list_t &vals, wchar_t sep) { if (vals.empty()) return wcstring{}; diff --git a/src/wcstringutil.h b/src/wcstringutil.h index 5a99224fa..7ffb2a79c 100644 --- a/src/wcstringutil.h +++ b/src/wcstringutil.h @@ -120,6 +120,16 @@ inline maybe_t string_fuzzy_match_string(const wcstring &s /// Split a string by a separator character. wcstring_list_t split_string(const wcstring &val, wchar_t sep); +/// Split a string by runs of any of the separator characters provided in \p seps. +/// Note the delimiters are the characters in \p seps, not \p seps itself. +/// \p seps may contain the NUL character. +/// Do not output more than \p max_results results. If we are to output exactly that many, +/// the last output is the remainder of the input, including leading delimiters, +/// except for the first. This is historical behavior. +/// Example: split_string_tok(" a b c ", " ", 3) -> {"a", "b", " c "} +wcstring_list_t split_string_tok(const wcstring &val, const wcstring &seps, + size_t max_results = std::numeric_limits::max()); + /// Join a list of strings by a separator character. wcstring join_strings(const wcstring_list_t &vals, wchar_t sep); @@ -157,19 +167,6 @@ inline bool bool_from_string(const wcstring &x) { return !x.empty() && std::wcschr(L"YTyt1", x.at(0)); } -/// @typedef wcstring_range represents a range in a wcstring. -/// The first element is the location, the second is the count. -typedef std::pair wcstring_range; - -/// wcstring equivalent of wcstok(). Supports NUL. For convenience and wcstok() compatibility, the -/// first character of each token separator is replaced with NUL. -/// @return Returns a pair of (pos, count). -/// This will be (npos, npos) when it's done.
In the form of (pos, npos) -/// when the token is already known to be the final token. -/// @note The final token may not necessarily return (pos, npos). -wcstring_range wcstring_tok(wcstring &str, const wcstring &needle, - wcstring_range last = wcstring_range(0, 0)); - /// Given iterators into a string (forward or reverse), splits the haystack iterators /// about the needle sequence, up to max times. Inserts splits into the output array. /// If the iterators are forward, this does the normal thing.