From b13ee818d281b7bdf91ddfdcc504a501ba14b50c Mon Sep 17 00:00:00 2001
From: ridiculousfish <corydoras@ridiculousfish.com>
Date: Mon, 19 Feb 2018 15:10:10 -0800
Subject: [PATCH] Some early cleanup of tokenizer

Prior to this the tokenizer ran "one ahead", where tokenizer_t::next()
would in fact return the last-parsed token. Switch to parsing on demand
instead of running one ahead; this is simpler and prepares for tokenizer
changes.
---
 src/builtin_commandline.cpp |  5 +--
 src/fish_tests.cpp          | 32 +++++++++++++--
 src/tokenizer.cpp           | 79 +++++++++++++------------------------
 src/tokenizer.h             | 30 +++++++-------
 4 files changed, 74 insertions(+), 72 deletions(-)
diff --git a/src/builtin_commandline.cpp b/src/builtin_commandline.cpp
index 27abd183d..2586e5a15 100644
--- a/src/builtin_commandline.cpp
+++ b/src/builtin_commandline.cpp
@@ -137,10 +137,10 @@ static void write_part(const wchar_t *begin, const wchar_t *end, int cut_at_curs
     size_t pos = get_cursor_pos() - (begin - get_buffer());
 
     if (tokenize) {
-        wchar_t *buff = wcsndup(begin, end - begin);
         // fwprintf( stderr, L"Subshell: %ls, end char %lc\n", buff, *end );
         wcstring out;
-        tokenizer_t tok(buff, TOK_ACCEPT_UNFINISHED);
+        wcstring buff(begin, end - begin);
+        tokenizer_t tok(buff.c_str(), TOK_ACCEPT_UNFINISHED);
         tok_t token;
         while (tok.next(&token)) {
             if ((cut_at_cursor) && (token.offset + token.text.size() >= pos)) break;
@@ -154,7 +154,6 @@ static void write_part(const wchar_t *begin, const wchar_t *end, int cut_at_curs
         }
 
         streams.out.append(out);
-        free(buff);
     } else {
         if (cut_at_cursor) {
             streams.out.append(begin, pos);
diff --git a/src/fish_tests.cpp b/src/fish_tests.cpp
index 63e8ef43d..6904a3e2c 100644
--- a/src/fish_tests.cpp
+++ b/src/fish_tests.cpp
@@ -509,6 +509,29 @@ static void test_tokenizer() {
     say(L"Testing tokenizer");
     tok_t token;
 
+    {
+        bool got = false;
+        const wchar_t *str = L"alpha beta";
+        tokenizer_t t(str, 0);
+
+        got = t.next(&token);  // alpha
+        do_test(got);
+        do_test(token.type == TOK_STRING);
+        do_test(token.offset == 0);
+        do_test(token.length == 5);
+        do_test(token.text == L"alpha");
+
+        got = t.next(&token);  // beta
+        do_test(got);
+        do_test(token.type == TOK_STRING);
+        do_test(token.offset == 6);
+        do_test(token.length == 4);
+        do_test(token.text == L"beta");
+
+        got = t.next(&token);
+        do_test(!got);
+    }
+
     const wchar_t *str =
         L"string <redirection  2>&1 'nested \"quoted\" '(string containing subshells "
         L"){and,brackets}$as[$well (as variable arrays)] not_a_redirect^ ^ ^^is_a_redirect "
@@ -524,14 +547,17 @@ static void test_tokenizer() {
         tokenizer_t t(str, 0);
         size_t i = 0;
         while (t.next(&token)) {
-            if (i > sizeof types / sizeof *types) {
+            if (i >= sizeof types / sizeof *types) {
                 err(L"Too many tokens returned from tokenizer");
+                fwprintf(stdout, L"Got excess token type %ld\n", (long)token.type);
                 break;
             }
             if (types[i] != token.type) {
                 err(L"Tokenization error:");
-                fwprintf(stdout, L"Token number %zu of string \n'%ls'\n, got token type %ld\n",
-                         i + 1, str, (long)token.type);
+                fwprintf(stdout,
+                         L"Token number %zu of string \n'%ls'\n, expected type %ld, got token type "
+                         L"%ld\n",
+                         i + 1, str, (long)types[i], (long)token.type);
             }
             i++;
         }
diff --git a/src/tokenizer.cpp b/src/tokenizer.cpp
index dccbd8191..c777e11ca 100644
--- a/src/tokenizer.cpp
+++ b/src/tokenizer.cpp
@@ -45,41 +45,27 @@ void tokenizer_t::call_error(enum tokenizer_error error_type, const wchar_t *whe
                              const wchar_t *error_message) {
     this->last_type = TOK_ERROR;
     this->error = error_type;
-    this->global_error_offset = where ? where - this->orig_buff : 0;
+    this->global_error_offset = where ? where - this->start : 0;
     this->last_token = error_message;
+    this->has_next = false;
 }
 
-tokenizer_t::tokenizer_t(const wchar_t *b, tok_flags_t flags)
-    : buff(b),
-      orig_buff(b),
-      last_type(TOK_NONE),
-      last_pos(0),
-      has_next(false),
-      accept_unfinished(false),
-      show_comments(false),
-      show_blank_lines(false),
-      error(TOK_ERROR_NONE),
-      global_error_offset(-1),
-      squash_errors(false),
-      continue_line_after_comment(false) {
-    assert(b != NULL);
+tokenizer_t::tokenizer_t(const wchar_t *start, tok_flags_t flags) : buff(start), start(start) {
+    assert(start != nullptr && "Invalid start");
 
     this->accept_unfinished = static_cast<bool>(flags & TOK_ACCEPT_UNFINISHED);
     this->show_comments = static_cast<bool>(flags & TOK_SHOW_COMMENTS);
     this->squash_errors = static_cast<bool>(flags & TOK_SQUASH_ERRORS);
     this->show_blank_lines = static_cast<bool>(flags & TOK_SHOW_BLANK_LINES);
-
-    this->has_next = (*b != L'\0');
-    this->tok_next();
 }
 
 bool tokenizer_t::next(struct tok_t *result) {
     assert(result != NULL);
-    if (!this->has_next) {
+    if (!this->tok_next()) {
         return false;
     }
 
-    const size_t current_pos = this->buff - this->orig_buff;
+    const size_t current_pos = this->buff - this->start;
 
     // We want to copy our last_token into result->text. If we just do this naively via =, we are
     // liable to trigger std::string's CoW implementation: result->text's storage will be
@@ -92,7 +78,7 @@ bool tokenizer_t::next(struct tok_t *result) {
     result->type = this->last_type;
     result->offset = this->last_pos;
     result->error = this->last_type == TOK_ERROR ? this->error : TOK_ERROR_NONE;
-    assert(this->buff >= this->orig_buff);
+    assert(this->buff >= this->start);
 
     // Compute error offset.
     result->error_offset = 0;
@@ -101,10 +87,8 @@ bool tokenizer_t::next(struct tok_t *result) {
         result->error_offset = this->global_error_offset - this->last_pos;
     }
 
-    assert(this->buff >= this->orig_buff);
+    assert(this->buff >= this->start);
     result->length = current_pos >= this->last_pos ? current_pos - this->last_pos : 0;
-
-    this->tok_next();
     return true;
 }
 
@@ -149,7 +133,7 @@ void tokenizer_t::read_string() {
     size_t paran_offsets[paran_offsets_max];
     // Where the open bracket is.
     size_t offset_of_bracket = 0;
-    const wchar_t *const start = this->buff;
+    const wchar_t *const buff_start = this->buff;
     bool is_first = true;
 
     enum tok_mode_t {
@@ -186,14 +170,14 @@ void tokenizer_t::read_string() {
                     switch (*this->buff) {
                         case L'(': {
                             paran_count = 1;
-                            paran_offsets[0] = this->buff - this->orig_buff;
+                            paran_offsets[0] = this->buff - this->start;
                             mode = mode_subshell;
                             break;
                         }
                         case L'[': {
-                            if (this->buff != start) {
+                            if (this->buff != buff_start) {
                                 mode = mode_array_brackets;
-                                offset_of_bracket = this->buff - this->orig_buff;
+                                offset_of_bracket = this->buff - this->start;
                             }
                             break;
                         }
@@ -247,7 +231,7 @@ void tokenizer_t::read_string() {
                         }
                         case L'(': {
                             if (paran_count < paran_offsets_max) {
-                                paran_offsets[paran_count] = this->buff - this->orig_buff;
+                                paran_offsets[paran_count] = this->buff - this->start;
                             }
                             paran_count++;
                             break;
@@ -277,7 +261,7 @@ void tokenizer_t::read_string() {
                     switch (*this->buff) {
                         case L'(': {
                             paran_count = 1;
-                            paran_offsets[0] = this->buff - this->orig_buff;
+                            paran_offsets[0] = this->buff - this->start;
                             mode = mode_array_brackets_and_subshell;
                             break;
                         }
@@ -315,13 +299,13 @@ void tokenizer_t::read_string() {
                 }
 
                 TOK_CALL_ERROR(this, TOK_UNTERMINATED_SUBSHELL, PARAN_ERROR,
-                               this->orig_buff + offset_of_open_paran);
+                               this->start + offset_of_open_paran);
                 break;
             }
             case mode_array_brackets:
             case mode_array_brackets_and_subshell: {
                 TOK_CALL_ERROR(this, TOK_UNTERMINATED_SLICE, SQUARE_BRACKET_ERROR,
-                               this->orig_buff + offset_of_bracket);
+                               this->start + offset_of_bracket);
                 break;
             }
             default: {
@@ -332,9 +316,9 @@ void tokenizer_t::read_string() {
         return;
     }
 
-    len = this->buff - start;
+    len = this->buff - buff_start;
 
-    this->last_token.assign(start, len);
+    this->last_token.assign(buff_start, len);
     this->last_type = TOK_STRING;
 }
 
@@ -483,16 +467,9 @@ int oflags_for_redirection_type(enum token_type type) {
 /// to be whitespace.
 static bool my_iswspace(wchar_t c) { return c != L'\n' && iswspace(c); }
 
-void tokenizer_t::tok_next() {
-    if (this->last_type == TOK_ERROR) {
-        this->has_next = false;
-        return;
-    }
-
+bool tokenizer_t::tok_next() {
     if (!this->has_next) {
-        // fwprintf(stdout, L"EOL\n" );
-        this->last_type = TOK_END;
-        return;
+        return false;
     }
 
     while (1) {
@@ -508,29 +485,28 @@ void tokenizer_t::tok_next() {
 
     while (*this->buff == L'#') {
         if (this->show_comments) {
-            this->last_pos = this->buff - this->orig_buff;
+            this->last_pos = this->buff - this->start;
             this->read_comment();
 
             if (this->buff[0] == L'\n' && this->continue_line_after_comment) this->buff++;
-            return;
+            return true;
         }
 
-        while (*(this->buff) != L'\n' && *(this->buff) != L'\0') this->buff++;
+        while (*this->buff != L'\n' && *this->buff != L'\0') this->buff++;
         if (this->buff[0] == L'\n' && this->continue_line_after_comment) this->buff++;
-        while (my_iswspace(*(this->buff))) this->buff++;
+        while (my_iswspace(*this->buff)) this->buff++;
     }
 
     this->continue_line_after_comment = false;
 
-    this->last_pos = this->buff - this->orig_buff;
+    this->last_pos = this->buff - this->start;
 
     switch (*this->buff) {
         case L'\0': {
             this->last_type = TOK_END;
-            // fwprintf( stderr, L"End of string\n" );
             this->has_next = false;
             this->last_token.clear();
-            break;
+            return false;
         }
         case L'\r':  // carriage-return
         case L'\n':  // newline
@@ -604,11 +580,12 @@ void tokenizer_t::tok_next() {
             break;
         }
     }
+    return true;
 }
 
 wcstring tok_first(const wcstring &str) {
     wcstring result;
-    tokenizer_t t(str.c_str(), TOK_SQUASH_ERRORS);
+    tokenizer_t t(str.data(), TOK_SQUASH_ERRORS);
     tok_t token;
     if (t.next(&token) && token.type == TOK_STRING) {
         result = std::move(token.text);
diff --git a/src/tokenizer.h b/src/tokenizer.h
index c87a696b2..f870aaff9 100644
--- a/src/tokenizer.h
+++ b/src/tokenizer.h
@@ -71,41 +71,41 @@ struct tok_t {
 /// The tokenizer struct.
 class tokenizer_t {
     // No copying, etc.
-    tokenizer_t(const tokenizer_t &);
-    void operator=(const tokenizer_t &);
+    tokenizer_t(const tokenizer_t &) = delete;
+    void operator=(const tokenizer_t &) = delete;
 
     /// A pointer into the original string, showing where the next token begins.
     const wchar_t *buff;
-    /// A copy of the original string.
-    const wchar_t *orig_buff;
+    /// The start of the original string.
+    const wchar_t *const start;
     /// The last token.
     wcstring last_token;
     /// Type of last token.
-    enum token_type last_type;
+    enum token_type last_type { TOK_NONE };
     /// Offset of last token.
-    size_t last_pos;
+    size_t last_pos{0};
     /// Whether there are more tokens.
-    bool has_next;
+    bool has_next{true};
     /// Whether incomplete tokens are accepted.
-    bool accept_unfinished;
+    bool accept_unfinished{false};
     /// Whether comments should be returned.
-    bool show_comments;
+    bool show_comments{false};
     /// Whether all blank lines are returned.
-    bool show_blank_lines;
+    bool show_blank_lines{false};
     /// Last error.
-    tokenizer_error error;
+    tokenizer_error error{TOK_ERROR_NONE};
     /// Last error offset, in "global" coordinates (relative to orig_buff).
-    size_t global_error_offset;
+    size_t global_error_offset{size_t(-1)};
     /// Whether we are squashing errors.
-    bool squash_errors;
+    bool squash_errors{false};
     /// Whether to continue the previous line after the comment.
-    bool continue_line_after_comment;
+    bool continue_line_after_comment{false};
 
     void call_error(enum tokenizer_error error_type, const wchar_t *where,
                     const wchar_t *error_message);
     void read_string();
     void read_comment();
-    void tok_next();
+    bool tok_next();
 
    public:
     /// Constructor for a tokenizer. b is the string that is to be tokenized. It is not copied, and