Switch tokenizer_error back to just an error code

Rather than representing tokenizer errors as pointers to heap-allocated objects, switch tokenizer_error back to a plain error code value. This makes the code easier to reason about, because errors are now immutable values instead of mutable objects, and it avoids allocation during startup.
2026-05-28 09:31:16 -03:00 · 2018-09-27 21:25:49 -04:00
parent f28f9792b3
commit cc99e8d510
5 changed files with 112 additions and 81 deletions
--- a/src/fish_tests.cpp
+++ b/src/fish_tests.cpp
@@ -575,7 +575,7 @@ static void test_tokenizer() {
        tokenizer_t t(L"abc\\", 0);
        do_test(t.next(&token));
        do_test(token.type == TOK_ERROR);
-        do_test(token.error == TOK_UNTERMINATED_ESCAPE);
+        do_test(token.error == tokenizer_error_t::unterminated_escape);
        do_test(token.error_offset == 3);
    }

@@ -584,7 +584,7 @@ static void test_tokenizer() {
        do_test(t.next(&token));
        do_test(t.next(&token));
        do_test(token.type == TOK_ERROR);
-        do_test(token.error == TOK_CLOSING_UNOPENED_SUBSHELL);
+        do_test(token.error == tokenizer_error_t::closing_unopened_subshell);
        do_test(token.error_offset == 4);
    }

@@ -593,7 +593,7 @@ static void test_tokenizer() {
        do_test(t.next(&token));
        do_test(t.next(&token));
        do_test(token.type == TOK_ERROR);
-        do_test(token.error == TOK_UNTERMINATED_SUBSHELL);
+        do_test(token.error == tokenizer_error_t::unterminated_subshell);
        do_test(token.error_offset == 4);
    }

@@ -602,7 +602,7 @@ static void test_tokenizer() {
        do_test(t.next(&token));
        do_test(t.next(&token));
        do_test(token.type == TOK_ERROR);
-        do_test(token.error == TOK_UNTERMINATED_SLICE);
+        do_test(token.error == tokenizer_error_t::unterminated_slice);
        do_test(token.error_offset == 4);
    }

@@ -1735,7 +1735,9 @@ static void test_abbreviations() {
    env_push(true);

    const std::vector<std::pair<const wcstring, const wcstring>> abbreviations = {
-        {L"gc", L"git checkout"}, {L"foo", L"bar"}, {L"gx", L"git checkout"},
+        {L"gc", L"git checkout"},
+        {L"foo", L"bar"},
+        {L"gx", L"git checkout"},
    };
    for (auto it : abbreviations) {
        int ret = env_set_one(L"_fish_abbr_" + it.first, ENV_LOCAL, it.second);
--- a/src/parse_constants.h
+++ b/src/parse_constants.h
@@ -169,7 +169,6 @@ enum parse_error_code_t {
    parse_error_tokenizer_unterminated_subshell,
    parse_error_tokenizer_unterminated_slice,
    parse_error_tokenizer_unterminated_escape,
-    parse_error_tokenizer_nested_slice,
    parse_error_tokenizer_other,

    parse_error_unbalancing_end,   // end outside of block
--- a/src/parse_tree.cpp
+++ b/src/parse_tree.cpp
@@ -27,6 +27,23 @@ static bool production_is_empty(const production_element_t *production) {
    return *production == token_type_invalid;
 }

+static parse_error_code_t parse_error_from_tokenizer_error(tokenizer_error_t err) {
+    // Tokenizer errors without a dedicated parse error code map to the generic "other" code.
+    parse_error_code_t code = parse_error_tokenizer_other;
+    if (err == tokenizer_error_t::none) {
+        code = parse_error_none;
+    } else if (err == tokenizer_error_t::unterminated_quote) {
+        code = parse_error_tokenizer_unterminated_quote;
+    } else if (err == tokenizer_error_t::unterminated_subshell) {
+        code = parse_error_tokenizer_unterminated_subshell;
+    } else if (err == tokenizer_error_t::unterminated_slice) {
+        code = parse_error_tokenizer_unterminated_slice;
+    } else if (err == tokenizer_error_t::unterminated_escape) {
+        code = parse_error_tokenizer_unterminated_escape;
+    }
+    return code;
+}
+
 /// Returns a string description of this parse error.
 wcstring parse_error_t::describe_with_prefix(const wcstring &src, const wcstring &prefix,
                                             bool is_interactive, bool skip_caret) const {
@@ -671,10 +688,10 @@ void parse_ll_t::parse_error_failed_production(struct parse_stack_element_t &sta
 }

 void parse_ll_t::report_tokenizer_error(const tok_t &tok) {
-    parse_error_code_t parse_error_code = tok.error->parser_error;
+    parse_error_code_t parse_error_code = parse_error_from_tokenizer_error(tok.error);
    this->parse_error_at_location(tok.offset, tok.length, tok.offset + tok.error_offset,
                                  parse_error_code, L"%ls",
-                                  tok.error->Message());
+                                  tokenizer_get_error_message(tok.error).c_str());
 }

 void parse_ll_t::parse_error_unexpected_token(const wchar_t *expected, parse_token_t token) {
@@ -794,8 +811,7 @@ bool parse_ll_t::top_node_handle_terminal_types(const parse_token_t &token) {
        node.keyword = token.keyword;
        node.source_start = token.source_start;
        node.source_length = token.source_length;
-        if (token.preceding_escaped_nl)
-            node.flags |= parse_node_flag_preceding_escaped_nl;
+        if (token.preceding_escaped_nl) node.flags |= parse_node_flag_preceding_escaped_nl;
    } else {
        // Failure
        if (stack_top.type == parse_token_type_string && token.type == parse_token_type_string) {
@@ -863,8 +879,7 @@ void parse_ll_t::accept_tokens(parse_token_t token1, parse_token_t token2) {
        special_node.parent = symbol_stack.back().node_idx;
        special_node.source_start = token1.source_start;
        special_node.source_length = token1.source_length;
-        if (token1.preceding_escaped_nl)
-            special_node.flags |= parse_node_flag_preceding_escaped_nl;
+        if (token1.preceding_escaped_nl) special_node.flags |= parse_node_flag_preceding_escaped_nl;
        nodes.push_back(special_node);

        // Mark special flags.
@@ -1206,4 +1221,3 @@ const parse_node_t *parse_node_tree_t::find_last_node_of_type(parse_token_type_t
    }
    return result;
 }
-
--- a/src/tokenizer.cpp
+++ b/src/tokenizer.cpp
@@ -17,31 +17,46 @@
 #include "tokenizer.h"
 #include "wutil.h"  // IWYU pragma: keep

-tokenizer_error *TOK_ERROR_NONE = new tokenizer_error(L"");
-tokenizer_error *TOK_UNTERMINATED_QUOTE = new tokenizer_error((L"Unexpected end of string, quotes are not balanced"), parse_error_tokenizer_unterminated_quote);
-tokenizer_error *TOK_UNTERMINATED_SUBSHELL = new tokenizer_error((L"Unexpected end of string, expecting ')'"), parse_error_tokenizer_unterminated_subshell);
-tokenizer_error *TOK_UNTERMINATED_SLICE = new tokenizer_error((L"Unexpected end of string, square brackets do not match"), parse_error_tokenizer_unterminated_slice);
-tokenizer_error *TOK_UNTERMINATED_ESCAPE = new tokenizer_error((L"Unexpected end of string, incomplete escape sequence"), parse_error_tokenizer_unterminated_escape);
-tokenizer_error *TOK_INVALID_REDIRECT = new tokenizer_error((L"Invalid input/output redirection"));
-tokenizer_error *TOK_INVALID_PIPE = new tokenizer_error((L"Cannot use stdin (fd 0) as pipe output"));
-tokenizer_error *TOK_CLOSING_UNOPENED_SUBSHELL = new tokenizer_error((L"Unexpected ')' for unopened parenthesis"));
-tokenizer_error *TOK_ILLEGAL_SLICE = new tokenizer_error((L"Unexpected '[' at this location"));
-tokenizer_error *TOK_CLOSING_UNOPENED_BRACE = new tokenizer_error((L"Unexpected '}' for unopened brace expansion"));
-tokenizer_error *TOK_UNTERMINATED_BRACE = new tokenizer_error((L"Unexpected end of string, incomplete parameter expansion"));
-tokenizer_error *TOK_EXPECTED_PCLOSE_FOUND_BCLOSE = new tokenizer_error((L"Unexpected '}' found, expecting ')'"));
-tokenizer_error *TOK_EXPECTED_BCLOSE_FOUND_PCLOSE = new tokenizer_error((L"Unexpected ')' found, expecting '}'"));
-
-const wchar_t *tokenizer_error::Message() const {
-    return _(_message);
+wcstring tokenizer_get_error_message(tokenizer_error_t err) {
+    switch (err) {  // deliberately no default case, so -Wswitch can flag a new enumerator
+        case tokenizer_error_t::none:
+            return L"";
+        case tokenizer_error_t::unterminated_quote:
+            return _(L"Unexpected end of string, quotes are not balanced");
+        case tokenizer_error_t::unterminated_subshell:
+            return _(L"Unexpected end of string, expecting ')'");
+        case tokenizer_error_t::unterminated_slice:
+            return _(L"Unexpected end of string, square brackets do not match");
+        case tokenizer_error_t::unterminated_escape:
+            return _(L"Unexpected end of string, incomplete escape sequence");
+        case tokenizer_error_t::invalid_redirect:
+            return _(L"Invalid input/output redirection");
+        case tokenizer_error_t::invalid_pipe:
+            return _(L"Cannot use stdin (fd 0) as pipe output");
+        case tokenizer_error_t::closing_unopened_subshell:
+            return _(L"Unexpected ')' for unopened parenthesis");
+        case tokenizer_error_t::illegal_slice:
+            return _(L"Unexpected '[' at this location");
+        case tokenizer_error_t::closing_unopened_brace:
+            return _(L"Unexpected '}' for unopened brace expansion");
+        case tokenizer_error_t::unterminated_brace:
+            return _(L"Unexpected end of string, incomplete parameter expansion");
+        case tokenizer_error_t::expected_pclose_found_bclose:
+            return _(L"Unexpected '}' found, expecting ')'");
+        case tokenizer_error_t::expected_bclose_found_pclose:
+            return _(L"Unexpected ')' found, expecting '}'");
+    }
+    assert(0 && "Unexpected tokenizer error");  // only reachable with an out-of-range value
+    return wcstring();  // not NULL: building a wcstring from a null pointer is undefined
 }

 // Whether carets redirect stderr.
 static bool caret_redirs() { return !feature_test(features_t::stderr_nocaret); }

 /// Return an error token and mark that we no longer have a next token.
-tok_t tokenizer_t::call_error(tokenizer_error *error_type, const wchar_t *token_start,
+tok_t tokenizer_t::call_error(tokenizer_error_t error_type, const wchar_t *token_start,
                              const wchar_t *error_loc) {
-    assert(error_type != TOK_ERROR_NONE && "TOK_ERROR_NONE passed to call_error");
+    assert(error_type != tokenizer_error_t::none && "tokenizer_error_t::none passed to call_error");
    assert(error_loc >= token_start && "Invalid error location");
    assert(this->buff >= token_start && "Invalid buff location");

@@ -160,11 +175,13 @@ tok_t tokenizer_t::read_string() {
            mode |= tok_modes::curly_braces;
        } else if (c == L')') {
            if (expecting.size() > 0 && expecting.back() == L'}') {
-                return this->call_error(TOK_EXPECTED_BCLOSE_FOUND_PCLOSE, this->start, this->buff);
+                return this->call_error(tokenizer_error_t::expected_bclose_found_pclose,
+                                        this->start, this->buff);
            }
            switch (paran_offsets.size()) {
                case 0:
-                    return this->call_error(TOK_CLOSING_UNOPENED_SUBSHELL, this->start, this->buff);
+                    return this->call_error(tokenizer_error_t::closing_unopened_subshell,
+                                            this->start, this->buff);
                case 1:
                    mode &= ~(tok_modes::subshell);
                default:
@@ -173,11 +190,13 @@ tok_t tokenizer_t::read_string() {
            expecting.pop_back();
        } else if (c == L'}') {
            if (expecting.size() > 0 && expecting.back() == L')') {
-                return this->call_error(TOK_EXPECTED_PCLOSE_FOUND_BCLOSE, this->start, this->buff);
+                return this->call_error(tokenizer_error_t::expected_pclose_found_bclose,
+                                        this->start, this->buff);
            }
            switch (brace_offsets.size()) {
                case 0:
-                    return this->call_error(TOK_CLOSING_UNOPENED_BRACE, this->start, this->buff);
+                    return this->call_error(tokenizer_error_t::closing_unopened_brace, this->start,
+                                            this->buff);
                case 1:
                    mode &= ~(tok_modes::curly_braces);
                default:
@@ -188,17 +207,18 @@ tok_t tokenizer_t::read_string() {
            if (this->buff != buff_start) {
                if ((mode & tok_modes::array_brackets) == tok_modes::array_brackets) {
                    // Nested brackets should not overwrite the existing slice_offset
-                    //mqudsi: TOK_ILLEGAL_SLICE is the right error here, but the shell
-                    //prints an error message with the caret pointing at token_start,
-                    //not err_loc, making the TOK_ILLEGAL_SLICE message misleading.
+                    // mqudsi: TOK_ILLEGAL_SLICE is the right error here, but the shell
+                    // prints an error message with the caret pointing at token_start,
+                    // not err_loc, making the TOK_ILLEGAL_SLICE message misleading.
                    // return call_error(TOK_ILLEGAL_SLICE, buff_start, this->buff);
-                    return this->call_error(TOK_UNTERMINATED_SLICE, this->start, this->buff);
+                    return this->call_error(tokenizer_error_t::unterminated_slice, this->start,
+                                            this->buff);
                }
                slice_offset = this->buff - this->start;
                mode |= tok_modes::array_brackets;
-            }
-            else {
-                // This is actually allowed so the test operator `[` can be used as the head of a command
+            } else {
+                // This is actually allowed so the test operator `[` can be used as the head of a
+                // command
            }
        }
        // Only exit bracket mode if we are in bracket mode.
@@ -214,7 +234,8 @@ tok_t tokenizer_t::read_string() {
                const wchar_t *error_loc = this->buff;
                this->buff += wcslen(this->buff);
                if ((!this->accept_unfinished)) {
-                    return this->call_error(TOK_UNTERMINATED_QUOTE, buff_start, error_loc);
+                    return this->call_error(tokenizer_error_t::unterminated_quote, buff_start,
+                                            error_loc);
                }
                break;
            }
@@ -238,23 +259,23 @@ tok_t tokenizer_t::read_string() {
    if ((!this->accept_unfinished) && (mode != tok_modes::regular_text)) {
        tok_t error;
        if ((mode & tok_modes::char_escape) == tok_modes::char_escape) {
-            error = this->call_error(TOK_UNTERMINATED_ESCAPE, buff_start,
-                    this->buff - 1);
+            error = this->call_error(tokenizer_error_t::unterminated_escape, buff_start,
+                                     this->buff - 1);
        } else if ((mode & tok_modes::array_brackets) == tok_modes::array_brackets) {
-            error = this->call_error(TOK_UNTERMINATED_SLICE, buff_start,
-                    this->start + slice_offset);
+            error = this->call_error(tokenizer_error_t::unterminated_slice, buff_start,
+                                     this->start + slice_offset);
        } else if ((mode & tok_modes::subshell) == tok_modes::subshell) {
            assert(paran_offsets.size() > 0);
            size_t offset_of_open_paran = paran_offsets.back();

-            error = this->call_error(TOK_UNTERMINATED_SUBSHELL, buff_start,
-                    this->start + offset_of_open_paran);
+            error = this->call_error(tokenizer_error_t::unterminated_subshell, buff_start,
+                                     this->start + offset_of_open_paran);
        } else if ((mode & tok_modes::curly_braces) == tok_modes::curly_braces) {
            assert(brace_offsets.size() > 0);
            size_t offset_of_open_brace = brace_offsets.back();

-            error = this->call_error(TOK_UNTERMINATED_BRACE, buff_start,
-                    this->start + offset_of_open_brace);
+            error = this->call_error(tokenizer_error_t::unterminated_brace, buff_start,
+                                     this->start + offset_of_open_brace);
        }
        return error;
    }
@@ -513,7 +534,8 @@ maybe_t<tok_t> tokenizer_t::tok_next() {
            // here is that we must never parse these as a string; a failed redirection is an error!
            auto redir_or_pipe = read_redirection_or_fd_pipe(this->buff);
            if (!redir_or_pipe || redir_or_pipe->fd < 0) {
-                return this->call_error(TOK_INVALID_REDIRECT, this->buff, this->buff);
+                return this->call_error(tokenizer_error_t::invalid_redirect, this->buff,
+                                        this->buff);
            }
            result.type = redir_or_pipe->type;
            result.redirected_fd = redir_or_pipe->fd;
@@ -534,7 +556,8 @@ maybe_t<tok_t> tokenizer_t::tok_next() {
                // that fd 0 may be -1, indicating overflow; but we don't treat that as a tokenizer
                // error.
                if (redir_or_pipe->type == TOK_PIPE && redir_or_pipe->fd == 0) {
-                    return this->call_error(TOK_INVALID_PIPE, error_location, error_location);
+                    return this->call_error(tokenizer_error_t::invalid_pipe, error_location,
+                                            error_location);
                }
                result.type = redir_or_pipe->type;
                result.redirected_fd = redir_or_pipe->fd;
--- a/src/tokenizer.h
+++ b/src/tokenizer.h
@@ -23,29 +23,6 @@ enum token_type {
    TOK_COMMENT      /// comment token
 };

-struct tokenizer_error {
-private:
-    const wchar_t *_message;
-public:
-    const wchar_t *Message() const;
-    enum parse_error_code_t parser_error; //the parser error associated with this tokenizer error
-    tokenizer_error(const wchar_t *msg, enum parse_error_code_t perr = parse_error_tokenizer_other)
-        : _message(msg), parser_error(perr) {}
-    tokenizer_error(const tokenizer_error&) = delete;
-};
-
-extern tokenizer_error *TOK_ERROR_NONE;
-extern tokenizer_error *TOK_UNTERMINATED_QUOTE;
-extern tokenizer_error *TOK_UNTERMINATED_SUBSHELL;
-extern tokenizer_error *TOK_UNTERMINATED_SLICE;
-extern tokenizer_error *TOK_UNTERMINATED_ESCAPE;
-extern tokenizer_error *TOK_UNTERMINATED_BRACE;
-extern tokenizer_error *TOK_INVALID_REDIRECT;
-extern tokenizer_error *TOK_INVALID_PIPE;
-extern tokenizer_error *TOK_CLOSING_UNOPENED_SUBSHELL;
-extern tokenizer_error *TOK_CLOSING_UNOPENED_BRACE;
-extern tokenizer_error *TOK_ILLEGAL_SLICE;
-
 enum class redirection_type_t {
    overwrite,  // normal redirection: > file.txt
    append,     // appending redirection: >> file.txt
@@ -67,6 +44,25 @@ enum class redirection_type_t {

 typedef unsigned int tok_flags_t;

+enum class tokenizer_error_t {
+    none,                          // no error
+    unterminated_quote,            // input ended with unbalanced quotes
+    unterminated_subshell,         // input ended while expecting ')'
+    unterminated_slice,            // input ended with unmatched square brackets
+    unterminated_escape,           // input ended in an incomplete escape sequence
+    invalid_redirect,              // invalid input/output redirection
+    invalid_pipe,                  // stdin (fd 0) used as pipe output
+    closing_unopened_subshell,     // ')' without an open parenthesis
+    illegal_slice,                 // '[' at a location where it is not expected
+    closing_unopened_brace,        // '}' without an open brace expansion
+    unterminated_brace,            // input ended in an incomplete parameter expansion
+    expected_pclose_found_bclose,  // found '}' while expecting ')'
+    expected_bclose_found_pclose,  // found ')' while expecting '}'
+};
+
+/// Get the error message for an error \p err.
+wcstring tokenizer_get_error_message(tokenizer_error_t err);
+
 struct tok_t {
    // The type of the token.
    token_type type{TOK_NONE};
@@ -80,7 +76,7 @@ struct tok_t {
    maybe_t<int> redirected_fd{};

    // If an error, this is the error code.
-    tokenizer_error *error { TOK_ERROR_NONE };
+    tokenizer_error_t error{tokenizer_error_t::none};

    // Whether the token was preceded by an escaped newline.
    bool preceding_escaped_nl{false};
@@ -113,7 +109,7 @@ class tokenizer_t {
    /// Whether to continue the previous line after the comment.
    bool continue_line_after_comment{false};

-    tok_t call_error(tokenizer_error *error_type, const wchar_t *token_start,
+    tok_t call_error(tokenizer_error_t error_type, const wchar_t *token_start,
                     const wchar_t *error_loc);
    tok_t read_string();
    maybe_t<tok_t> tok_next();
@@ -156,9 +152,6 @@ int fd_redirected_by_pipe(const wcstring &str);
 /// Helper function to return oflags (as in open(2)) for a redirection type.
 int oflags_for_redirection_type(redirection_type_t type);

-/// Returns an error message for an error code.
-wcstring error_message_for_code(tokenizer_error err);
-
 enum move_word_style_t {
    move_word_style_punctuation,      // stop at punctuation
    move_word_style_path_components,  // stops at path components