More work on new parser

2026-06-03 15:01:16 -03:00 · 2013-08-08 15:06:46 -07:00
parent 6a6593335d
commit 8e07e55c1f
9 changed files with 708 additions and 32 deletions
--- a/parse_tree.cpp
+++ b/parse_tree.cpp
@@ -101,6 +101,11 @@ wcstring token_type_description(parse_token_type_t type)
            return L"arguments_or_redirections_list";
        case symbol_argument_or_redirection:
            return L"argument_or_redirection";
+        case symbol_argument:
+            return L"symbol_argument";
+        case symbol_redirection:
+            return L"symbol_redirection";
+

        case parse_token_type_string:
            return L"token_string";
@@ -116,6 +121,14 @@ wcstring token_type_description(parse_token_type_t type)
            return L"token_terminate";
        case symbol_optional_background:
            return L"optional_background";
+        
+        case parse_special_type_parse_error:
+            return L"parse_error";
+        case parse_special_type_tokenizer_error:
+            return L"tokenizer_error";
+        case parse_special_type_comment:
+            return L"comment";
+
    }
    return format_string(L"Unknown token type %ld", static_cast<long>(type));
 }
@@ -216,6 +229,14 @@ static parse_token_t parse_token_from_tokenizer_token(enum token_type tokenizer_
        case TOK_REDIRECT_NOCLOB:
            result.type = parse_token_type_redirection;
            break;
+            
+        case TOK_ERROR:
+            result.type = parse_special_type_tokenizer_error;
+            break;
+            
+        case TOK_COMMENT:
+            result.type = parse_special_type_comment;
+            break;


        default:
@@ -248,9 +269,16 @@ static void dump_tree_recursive(const parse_node_tree_t &nodes, const wcstring &
    }
    if (node.type == parse_token_type_string)
    {
-        result->append(L": \"");
-        result->append(src, node.source_start, node.source_length);
-        result->append(L"\"");
+        if (node.source_start == -1)
+        {
+            append_format(*result, L" (no source)");
+        }
+        else
+        {
+            result->append(L": \"");
+            result->append(src, node.source_start, node.source_length);
+            result->append(L"\"");
+        }
    }
    result->push_back(L'\n');
    ++*line;
@@ -311,20 +339,24 @@ class parse_ll_t
    // Constructor
    parse_ll_t() : fatal_errored(false)
    {
-        // initial node
-        symbol_stack.push_back(parse_stack_element_t(symbol_job_list, 0)); // goal token
-        nodes.push_back(parse_node_t(symbol_job_list));
+        this->reset();
    }

    bool top_node_match_token(parse_token_t token);

    void accept_token(parse_token_t token, const wcstring &src);
+    
+    // Clear the parse symbol stack (but not the node tree). Add a new job_list_t goal node.
+    void reset(void);

    void parse_error(const wchar_t *expected, parse_token_t token);
    void parse_error(parse_token_t token, const wchar_t *format, ...);
    void append_error_callout(wcstring &error_message, parse_token_t token);

    void dump_stack(void) const;
+    
+    // Figure out the ranges of intermediate nodes
+    void determine_node_ranges();

    // Get the node corresponding to the top element of the stack
    parse_node_t &node_for_top_symbol()
@@ -453,9 +485,41 @@ void parse_ll_t::dump_stack(void) const
    }
 }

+// Give each node a source range equal to the union of the ranges of its children
+// Terminal nodes already have source ranges (and no children)
+// Since children always appear after their parents, we can implement this very simply by walking backwards
+void parse_ll_t::determine_node_ranges(void)
+{
+    const size_t source_start_invalid = -1;
+    size_t idx = nodes.size();
+    while (idx--)
+    {
+        parse_node_t *parent = &nodes.at(idx);
+        
+        // Skip nodes that already have a source range. These are terminal nodes.
+        if (parent->source_start != source_start_invalid)
+            continue;
+        
+        // Ok, this node needs a source range. Get all of its children, and then set its range.
+        size_t min_start = source_start_invalid, max_end = 0; //note source_start_invalid is huge
+        for (node_offset_t i=0; i < parent->child_count; i++)
+        {
+            const parse_node_t &child = nodes.at(parent->child_offset(i));
+            min_start = std::min(min_start, child.source_start);
+            max_end = std::max(max_end, child.source_start + child.source_length);
+        }
+        
+        if (min_start != source_start_invalid) {
+            assert(max_end >= min_start);
+            parent->source_start = min_start;
+            parent->source_length = max_end - min_start;
+        }
+    }
+}
+
 void parse_ll_t::parse_error(parse_token_t token, const wchar_t *fmt, ...)
 {
-    this->dump_stack();
+    //this->dump_stack();
    parse_error_t err;
    
    va_list va;
@@ -481,8 +545,27 @@ void parse_ll_t::parse_error(const wchar_t *expected, parse_token_t token)
    fatal_errored = true;
 }

+void parse_ll_t::reset(void)
+{
+    // add a new job_list node and then reset our symbol list to point at it
+    node_offset_t where = nodes.size();
+    nodes.push_back(parse_node_t(symbol_job_list));
+    
+    symbol_stack.clear();
+    symbol_stack.push_back(parse_stack_element_t(symbol_job_list, where)); // goal token
+    this->fatal_errored = false;
+}
+
+
 bool parse_ll_t::top_node_match_token(parse_token_t token)
 {
+    if (symbol_stack.empty())
+    {
+        // This can come about with an unbalanced 'end' or 'else', which causes us to terminate the outermost job list.
+        this->fatal_errored = true;
+        return false;
+    }
+    
    PARSE_ASSERT(! symbol_stack.empty());
    PARSE_ASSERT(token.type >= FIRST_PARSE_TOKEN_TYPE);
    bool result = false;
@@ -520,10 +603,23 @@ void parse_ll_t::accept_token(parse_token_t token, const wcstring &src)
        fprintf(stderr, "Accept token %ls\n", token.describe().c_str());
    }
    PARSE_ASSERT(token.type >= FIRST_PARSE_TOKEN_TYPE);
-    PARSE_ASSERT(! symbol_stack.empty());
+    
    bool consumed = false;
+    
+    // Handle special types specially. Note that these are the only types that can be pushed if the symbol stack is empty.
+    if (token.type == parse_special_type_parse_error || token.type == parse_special_type_tokenizer_error || token.type == parse_special_type_comment)
+    {
+        parse_node_t err_node(token.type);
+        err_node.source_start = token.source_start;
+        err_node.source_length = token.source_length;
+        nodes.push_back(err_node);
+        consumed = true;
+    }
+
    while (! consumed && ! this->fatal_errored)
    {
+        PARSE_ASSERT(! symbol_stack.empty());
+        
        if (top_node_match_token(token))
        {
            if (logit)
@@ -534,6 +630,10 @@ void parse_ll_t::accept_token(parse_token_t token, const wcstring &src)
            break;
        }
        
+        // top_node_match_token may indicate an error if our stack is empty
+        if (this->fatal_errored)
+            break;
+        
        // Get the production for the top of the stack
        parse_stack_element_t &stack_elem = symbol_stack.back();
        parse_node_t &node = nodes.at(stack_elem.node_idx);
@@ -548,6 +648,12 @@ void parse_ll_t::accept_token(parse_token_t token, const wcstring &src)
            // Manipulate the symbol stack.
            // Note that stack_elem is invalidated by popping the stack.
            symbol_stack_pop_push_production(production);
+            
+            // If we end up with an empty stack, something bad happened, like an unbalanced end
+            if (symbol_stack.empty())
+            {
+                this->parse_error(token, L"All symbols removed from symbol stack. Likely unbalanced else or end?", stack_elem.describe().c_str(), token.describe().c_str());
+            }
        }
    }
 }
@@ -556,6 +662,11 @@ parse_t::parse_t() : parser(new parse_ll_t())
 {
 }

+parse_t::~parse_t()
+{
+    delete parser;
+}
+
 static parse_keyword_t keyword_for_token(token_type tok, const wchar_t *tok_txt)
 {
    parse_keyword_t result = parse_keyword_none;
@@ -597,21 +708,20 @@ static parse_keyword_t keyword_for_token(token_type tok, const wchar_t *tok_txt)
    return result;
 }

-bool parse_t::parse(const wcstring &str, parse_node_tree_t *output, parse_error_list_t *errors)
+bool parse_t::parse(const wcstring &str, parse_tree_flags_t parse_flags, parse_node_tree_t *output, parse_error_list_t *errors, bool log_it)
 {
-    tokenizer_t tok = tokenizer_t(str.c_str(), 0);
+    tok_flags_t tok_options = TOK_SQUASH_ERRORS;
+    if (parse_flags & parse_flag_include_comments)
+        tok_options |= TOK_SHOW_COMMENTS;
+        
+    tokenizer_t tok = tokenizer_t(str.c_str(), tok_options);
    for (; tok_has_next(&tok) && ! this->parser->fatal_errored; tok_next(&tok))
    {
        token_type tok_type = static_cast<token_type>(tok_last_type(&tok));
        const wchar_t *tok_txt = tok_last(&tok);
        int tok_start = tok_get_pos(&tok);
        size_t tok_extent = tok_get_extent(&tok);
-
-        if (tok_type == TOK_ERROR)
-        {
-            fprintf(stderr, "Tokenizer error\n");
-            break;
-        }
+        assert(tok_extent < 10000000); //paranoia

        parse_token_t token = parse_token_from_tokenizer_token(tok_type);
        token.tokenizer_type = tok_type;
@@ -621,12 +731,31 @@ bool parse_t::parse(const wcstring &str, parse_node_tree_t *output, parse_error_
        this->parser->accept_token(token, str);
        
        if (this->parser->fatal_errored)
-            break;
+        {
+            if (parse_flags & parse_flag_continue_after_error)
+            {
+                /* Mark an error and then keep going */
+                token.type = parse_special_type_parse_error;
+                token.keyword = parse_keyword_none;
+                this->parser->accept_token(token, str);
+                this->parser->reset();
+            }
+            else
+            {
+                /* Bail out */
+                break;
+            }
+        }
    }

+    // Teach each node where its source range is
+    this->parser->determine_node_ranges();
+
+#if 0
    wcstring result = dump_tree(this->parser->nodes, str);
    fprintf(stderr, "Tree (%ld nodes):\n%ls", this->parser->nodes.size(), result.c_str());
    fprintf(stderr, "%lu nodes, node size %lu, %lu bytes\n", this->parser->nodes.size(), sizeof(parse_node_t), this->parser->nodes.size() * sizeof(parse_node_t));
+#endif

    if (output != NULL)
    {
@@ -642,3 +771,40 @@ bool parse_t::parse(const wcstring &str, parse_node_tree_t *output, parse_error_

    return ! this->parser->fatal_errored;
 }
+
+const parse_node_t *parse_node_tree_t::get_child(const parse_node_t &parent, node_offset_t which, parse_token_type_t expected_type) const
+{
+    const parse_node_t *result = NULL;
+    PARSE_ASSERT(which < parent.child_count);
+    node_offset_t child_offset = parent.child_offset(which);
+    if (child_offset < this->size())
+    {
+        result = &this->at(child_offset);
+    }
+    
+    // If we are given an expected type, then the node must be null or that type
+    if (result != NULL)
+    {
+        assert(expected_type == token_type_invalid || expected_type == result->type);
+    }
+    
+    return result;
+}
+
+static void find_nodes_recursive(const parse_node_tree_t &tree, const parse_node_t &parent, parse_token_type_t type, parse_node_tree_t::parse_node_list_t *result)
+{
+    if (parent.type == type) result->push_back(&parent);
+    for (size_t i=0; i < parent.child_count; i++)
+    {
+        const parse_node_t *child = tree.get_child(parent, i);
+        assert(child != NULL);
+        find_nodes_recursive(tree, *child, type, result);
+    }
+}
+
+parse_node_tree_t::parse_node_list_t parse_node_tree_t::find_nodes(const parse_node_t &parent, parse_token_type_t type) const
+{
+    parse_node_list_t result;
+    find_nodes_recursive(*this, parent, type, &result);
+    return result;
+}