Remove wcstring_tok

wcstring_tok was a funky function which was confusing and used only in one place. Replace it with split_string_tok, which is somewhat simpler.
2026-05-26 16:01:15 -03:00 · 2021-04-18 14:46:05 -07:00
parent 2fb0a703de
commit 092168485b
4 changed files with 66 additions and 76 deletions
--- a/src/builtin_read.cpp
+++ b/src/builtin_read.cpp
@@ -594,13 +594,8 @@ maybe_t<int> builtin_read(parser_t &parser, io_streams_t &streams, const wchar_t
            if (!opts.have_delimiter) {
                // We're using IFS, so tokenize the buffer using each IFS char. This is for backward
                // compatibility with old versions of fish.
-                wcstring_list_t tokens;
-
-                for (wcstring_range loc = wcstring_tok(buff, opts.delimiter);
-                     loc.first != wcstring::npos; loc = wcstring_tok(buff, opts.delimiter, loc)) {
-                    tokens.emplace_back(wcstring(buff, loc.first, loc.second));
-                }
-                parser.set_var_and_fire(*var_ptr++, opts.place, tokens);
+                wcstring_list_t tokens = split_string_tok(buff, opts.delimiter);
+                parser.set_var_and_fire(*var_ptr++, opts.place, std::move(tokens));
            } else {
                // We're using a delimiter provided by the user so use the `string split` behavior.
                wcstring_list_t splits;
@@ -614,14 +609,15 @@ maybe_t<int> builtin_read(parser_t &parser, io_streams_t &streams, const wchar_t
            if (!opts.have_delimiter) {
                // We're using IFS, so tokenize the buffer using each IFS char. This is for backward
                // compatibility with old versions of fish.
-                wcstring_range loc = wcstring_range(0, 0);
+                // Note the final variable gets any remaining text.
+                wcstring_list_t var_vals = split_string_tok(buff, opts.delimiter, vars_left());
+                size_t val_idx = 0;
                while (vars_left()) {
-                    wcstring substr;
-                    loc = wcstring_tok(buff, (vars_left() > 1) ? opts.delimiter : wcstring(), loc);
-                    if (loc.first != wcstring::npos) {
-                        substr = wcstring(buff, loc.first, loc.second);
+                    wcstring val;
+                    if (val_idx < var_vals.size()) {
+                        val = std::move(var_vals.at(val_idx++));
                    }
-                    parser.set_var_and_fire(*var_ptr++, opts.place, substr);
+                    parser.set_var_and_fire(*var_ptr++, opts.place, std::move(val));
                }
            } else {
                // We're using a delimiter provided by the user so use the `string split` behavior.
--- a/src/fish_tests.cpp
+++ b/src/fish_tests.cpp
@@ -5430,31 +5430,26 @@ static void test_highlighting() {
    vars.remove(L"VARIABLE_IN_COMMAND2", ENV_DEFAULT);
 }

-static void test_wcstring_tok() {
-    say(L"Testing wcstring_tok");
-    wcstring buff = L"hello world";
-    wcstring needle = L" \t\n";
-    wcstring_range loc = wcstring_tok(buff, needle);
-    if (loc.first == wcstring::npos || buff.substr(loc.first, loc.second) != L"hello") {
-        err(L"Wrong results from first wcstring_tok(): {%zu, %zu}", loc.first, loc.second);
-    }
-    loc = wcstring_tok(buff, needle, loc);
-    if (loc.first == wcstring::npos || buff.substr(loc.first, loc.second) != L"world") {
-        err(L"Wrong results from second wcstring_tok(): {%zu, %zu}", loc.first, loc.second);
-    }
-    loc = wcstring_tok(buff, needle, loc);
-    if (loc.first != wcstring::npos) {
-        err(L"Wrong results from third wcstring_tok(): {%zu, %zu}", loc.first, loc.second);
-    }
+static void test_split_string_tok() {
+    say(L"Testing split_string_tok");
+    wcstring_list_t splits;
+    splits = split_string_tok(L" hello \t   world", L" \t\n");
+    do_test((splits == wcstring_list_t{L"hello", L"world"}));

-    buff = L"hello world";
-    loc = wcstring_tok(buff, needle);
-    // loc is "hello" again
-    loc = wcstring_tok(buff, L"", loc);
-    if (loc.first == wcstring::npos || buff.substr(loc.first, loc.second) != L"world") {
-        err(L"Wrong results from wcstring_tok with empty needle: {%zu, %zu}", loc.first,
-            loc.second);
-    }
+    splits = split_string_tok(L" stuff ", wcstring(L" "), 0);
+    do_test((splits == wcstring_list_t{}));
+
+    splits = split_string_tok(L" stuff ", wcstring(L" "), 1);
+    do_test((splits == wcstring_list_t{L" stuff "}));
+
+    splits = split_string_tok(L" hello \t   world  andstuff ", L" \t\n", 3);
+    do_test((splits == wcstring_list_t{L"hello", L"world", L" andstuff "}));
+
+    // NUL chars are OK.
+    wcstring nullstr = L" hello X  world";
+    nullstr.at(nullstr.find(L'X')) = L'\0';
+    splits = split_string_tok(nullstr, wcstring(L" \0", 2));
+    do_test((splits == wcstring_list_t{L"hello", L"world"}));
 }

 static void test_wwrite_to_fd() {
@@ -6521,7 +6516,7 @@ int main(int argc, char **argv) {
    env_stack_t::principal().set_pwd_from_getcwd();

    if (should_test_function("utility_functions")) test_utility_functions();
-    if (should_test_function("wcstring_tok")) test_wcstring_tok();
+    if (should_test_function("string_split")) test_split_string_tok();
    if (should_test_function("wwrite_to_fd")) test_wwrite_to_fd();
    if (should_test_function("env_vars")) test_env_vars();
    if (should_test_function("env")) test_env_snapshot();
--- a/src/wcstringutil.cpp
+++ b/src/wcstringutil.cpp
@@ -10,31 +10,6 @@
 #include "common.h"
 #include "flog.h"

-wcstring_range wcstring_tok(wcstring &str, const wcstring &needle, wcstring_range last) {
-    using size_type = wcstring::size_type;
-    size_type pos = last.second == wcstring::npos ? wcstring::npos : last.first;
-    if (pos != wcstring::npos && last.second != wcstring::npos) pos += last.second;
-    if (pos != wcstring::npos && pos != 0) ++pos;
-    if (pos == wcstring::npos || pos >= str.size()) {
-        return std::make_pair(wcstring::npos, wcstring::npos);
-    }
-
-    if (needle.empty()) {
-        return std::make_pair(pos, wcstring::npos);
-    }
-
-    pos = str.find_first_not_of(needle, pos);
-    if (pos == wcstring::npos) return std::make_pair(wcstring::npos, wcstring::npos);
-
-    size_type next_pos = str.find_first_of(needle, pos);
-    if (next_pos == wcstring::npos) {
-        return std::make_pair(pos, wcstring::npos);
-    }
-
-    str[next_pos] = L'\0';
-    return std::make_pair(pos, next_pos - pos);
-}
-
 wcstring truncate(const wcstring &input, int max_len, ellipsis_type etype) {
    if (input.size() <= static_cast<size_t>(max_len)) {
        return input;
@@ -282,6 +257,33 @@ wcstring_list_t split_string(const wcstring &val, wchar_t sep) {
    return out;
 }

+wcstring_list_t split_string_tok(const wcstring &val, const wcstring &seps, size_t max_results) {
+    wcstring_list_t out;
+    size_t end = val.size();
+    size_t pos = 0;
+    while (pos < end && out.size() + 1 < max_results) {  // leave one slot for the remainder
+        // Skip the run of separators in front of the next token.
+        pos = val.find_first_not_of(seps, pos);
+        if (pos == wcstring::npos) break;  // nothing but separators left
+
+        // The token runs up to the next separator, or to the end of the input.
+        size_t next_sep = val.find_first_of(seps, pos);
+        if (next_sep == wcstring::npos) {
+            next_sep = end;
+        }
+        out.emplace_back(val, pos, next_sep - pos);
+        // Note we skip exactly one sep here. This is because the final result (appended below)
+        // retains all but the first of its leading separators. This is historical.
+        pos = next_sep + 1;
+    }
+    if (pos < end && max_results > 0) {  // unsplit text remains: it becomes the last result
+        assert(out.size() + 1 == max_results && "Should have split the max");
+        out.emplace_back(val, pos);
+    }
+    assert(out.size() <= max_results && "Got too many results");
+    return out;
+}
+
 wcstring join_strings(const wcstring_list_t &vals, wchar_t sep) {
    if (vals.empty()) return wcstring{};

--- a/src/wcstringutil.h
+++ b/src/wcstringutil.h
@@ -120,6 +120,16 @@ inline maybe_t<string_fuzzy_match_t> string_fuzzy_match_string(const wcstring &s
 /// Split a string by a separator character.
 wcstring_list_t split_string(const wcstring &val, wchar_t sep);

+/// Split a string by runs of any of the separator characters provided in \p seps.
+/// Note the delimiters are the characters in \p seps, not \p seps itself.
+/// \p seps may contain the NUL character.
+/// Do not output more than \p max_results results. If we are to output exactly that many,
+/// the last output is the remainder of the input, including leading delimiters,
+/// except for the first. This is historical behavior.
+/// Example: split_string_tok(" a  b   c ", " ", 3) -> {"a", "b", "  c "}
+wcstring_list_t split_string_tok(const wcstring &val, const wcstring &seps,
+                                 size_t max_results = std::numeric_limits<size_t>::max());
+
 /// Join a list of strings by a separator character.
 wcstring join_strings(const wcstring_list_t &vals, wchar_t sep);

@@ -157,19 +167,6 @@ inline bool bool_from_string(const wcstring &x) {
    return !x.empty() && std::wcschr(L"YTyt1", x.at(0));
 }

-/// @typedef wcstring_range represents a range in a wcstring.
-/// The first element is the location, the second is the count.
-typedef std::pair<wcstring::size_type, wcstring::size_type> wcstring_range;
-
-/// wcstring equivalent of wcstok(). Supports NUL. For convenience and wcstok() compatibility, the
-/// first character of each token separator is replaced with NUL.
-/// @return Returns a pair of (pos, count).
-///         This will be (npos, npos) when it's done. In the form of (pos, npos)
-///         when the token is already known to be the final token.
-/// @note The final token may not necessarily return (pos, npos).
-wcstring_range wcstring_tok(wcstring &str, const wcstring &needle,
-                            wcstring_range last = wcstring_range(0, 0));
-
 /// Given iterators into a string (forward or reverse), splits the haystack iterators
 /// about the needle sequence, up to max times. Inserts splits into the output array.
 /// If the iterators are forward, this does the normal thing.