From b13ee818d281b7bdf91ddfdcc504a501ba14b50c Mon Sep 17 00:00:00 2001 From: ridiculousfish Date: Mon, 19 Feb 2018 15:10:10 -0800 Subject: [PATCH] Some early cleanup of tokenizer Prior to this the tokenizer ran "one ahead", where tokenizer_t::next() would in fact return the last-parsed token. Switch to parsing on demand instead of running one ahead; this is simpler and prepares for tokenizer changes. --- src/builtin_commandline.cpp | 5 +-- src/fish_tests.cpp | 32 +++++++++++++-- src/tokenizer.cpp | 79 +++++++++++++------------------------ src/tokenizer.h | 30 +++++++------- 4 files changed, 74 insertions(+), 72 deletions(-) diff --git a/src/builtin_commandline.cpp b/src/builtin_commandline.cpp index 27abd183d..2586e5a15 100644 --- a/src/builtin_commandline.cpp +++ b/src/builtin_commandline.cpp @@ -137,10 +137,10 @@ static void write_part(const wchar_t *begin, const wchar_t *end, int cut_at_curs size_t pos = get_cursor_pos() - (begin - get_buffer()); if (tokenize) { - wchar_t *buff = wcsndup(begin, end - begin); // fwprintf( stderr, L"Subshell: %ls, end char %lc\n", buff, *end ); wcstring out; - tokenizer_t tok(buff, TOK_ACCEPT_UNFINISHED); + wcstring buff(begin, end - begin); + tokenizer_t tok(buff.c_str(), TOK_ACCEPT_UNFINISHED); tok_t token; while (tok.next(&token)) { if ((cut_at_cursor) && (token.offset + token.text.size() >= pos)) break; @@ -154,7 +154,6 @@ static void write_part(const wchar_t *begin, const wchar_t *end, int cut_at_curs } streams.out.append(out); - free(buff); } else { if (cut_at_cursor) { streams.out.append(begin, pos); diff --git a/src/fish_tests.cpp b/src/fish_tests.cpp index 63e8ef43d..6904a3e2c 100644 --- a/src/fish_tests.cpp +++ b/src/fish_tests.cpp @@ -509,6 +509,29 @@ static void test_tokenizer() { say(L"Testing tokenizer"); tok_t token; + { + bool got = false; + const wchar_t *str = L"alpha beta"; + tokenizer_t t(str, 0); + + got = t.next(&token); // alpha + do_test(got); + do_test(token.type == TOK_STRING); + do_test(token.offset == 0); + do_test(token.length == 5); + do_test(token.text == L"alpha"); + + got = t.next(&token); // beta + do_test(got); + do_test(token.type == TOK_STRING); + do_test(token.offset == 6); + do_test(token.length == 4); + do_test(token.text == L"beta"); + + got = t.next(&token); + do_test(!got); + } + const wchar_t *str = L"string &1 'nested \"quoted\" '(string containing subshells " L"){and,brackets}$as[$well (as variable arrays)] not_a_redirect^ ^ ^^is_a_redirect " @@ -524,14 +547,17 @@ static void test_tokenizer() { tokenizer_t t(str, 0); size_t i = 0; while (t.next(&token)) { - if (i > sizeof types / sizeof *types) { + if (i >= sizeof types / sizeof *types) { err(L"Too many tokens returned from tokenizer"); + fwprintf(stdout, L"Got excess token type %ld\n", (long)token.type); break; } if (types[i] != token.type) { err(L"Tokenization error:"); - fwprintf(stdout, L"Token number %zu of string \n'%ls'\n, got token type %ld\n", - i + 1, str, (long)token.type); + fwprintf(stdout, + L"Token number %zu of string \n'%ls'\n, expected type %ld, got token type " + L"%ld\n", + i + 1, str, (long)types[i], (long)token.type); } i++; } diff --git a/src/tokenizer.cpp b/src/tokenizer.cpp index dccbd8191..c777e11ca 100644 --- a/src/tokenizer.cpp +++ b/src/tokenizer.cpp @@ -45,41 +45,27 @@ void tokenizer_t::call_error(enum tokenizer_error error_type, const wchar_t *whe const wchar_t *error_message) { this->last_type = TOK_ERROR; this->error = error_type; - this->global_error_offset = where ? where - this->orig_buff : 0; + this->global_error_offset = where ? where - this->start : 0; this->last_token = error_message; + this->has_next = false; } -tokenizer_t::tokenizer_t(const wchar_t *b, tok_flags_t flags) - : buff(b), - orig_buff(b), - last_type(TOK_NONE), - last_pos(0), - has_next(false), - accept_unfinished(false), - show_comments(false), - show_blank_lines(false), - error(TOK_ERROR_NONE), - global_error_offset(-1), - squash_errors(false), - continue_line_after_comment(false) { - assert(b != NULL); +tokenizer_t::tokenizer_t(const wchar_t *start, tok_flags_t flags) : buff(start), start(start) { + assert(start != nullptr && "Invalid start"); this->accept_unfinished = static_cast(flags & TOK_ACCEPT_UNFINISHED); this->show_comments = static_cast(flags & TOK_SHOW_COMMENTS); this->squash_errors = static_cast(flags & TOK_SQUASH_ERRORS); this->show_blank_lines = static_cast(flags & TOK_SHOW_BLANK_LINES); - - this->has_next = (*b != L'\0'); - this->tok_next(); } bool tokenizer_t::next(struct tok_t *result) { assert(result != NULL); - if (!this->has_next) { + if (!this->tok_next()) { return false; } - const size_t current_pos = this->buff - this->orig_buff; + const size_t current_pos = this->buff - this->start; // We want to copy our last_token into result->text. If we just do this naively via =, we are // liable to trigger std::string's CoW implementation: result->text's storage will be @@ -92,7 +78,7 @@ bool tokenizer_t::next(struct tok_t *result) { result->type = this->last_type; result->offset = this->last_pos; result->error = this->last_type == TOK_ERROR ? this->error : TOK_ERROR_NONE; - assert(this->buff >= this->orig_buff); + assert(this->buff >= this->start); // Compute error offset. result->error_offset = 0; @@ -101,10 +87,8 @@ bool tokenizer_t::next(struct tok_t *result) { result->error_offset = this->global_error_offset - this->last_pos; } - assert(this->buff >= this->orig_buff); + assert(this->buff >= this->start); result->length = current_pos >= this->last_pos ? current_pos - this->last_pos : 0; - - this->tok_next(); return true; } @@ -149,7 +133,7 @@ void tokenizer_t::read_string() { size_t paran_offsets[paran_offsets_max]; // Where the open bracket is. size_t offset_of_bracket = 0; - const wchar_t *const start = this->buff; + const wchar_t *const buff_start = this->buff; bool is_first = true; enum tok_mode_t { @@ -186,14 +170,14 @@ void tokenizer_t::read_string() { switch (*this->buff) { case L'(': { paran_count = 1; - paran_offsets[0] = this->buff - this->orig_buff; + paran_offsets[0] = this->buff - this->start; mode = mode_subshell; break; } case L'[': { - if (this->buff != start) { + if (this->buff != buff_start) { mode = mode_array_brackets; - offset_of_bracket = this->buff - this->orig_buff; + offset_of_bracket = this->buff - this->start; } break; } @@ -247,7 +231,7 @@ void tokenizer_t::read_string() { } case L'(': { if (paran_count < paran_offsets_max) { - paran_offsets[paran_count] = this->buff - this->orig_buff; + paran_offsets[paran_count] = this->buff - this->start; } paran_count++; break; @@ -277,7 +261,7 @@ void tokenizer_t::read_string() { switch (*this->buff) { case L'(': { paran_count = 1; - paran_offsets[0] = this->buff - this->orig_buff; + paran_offsets[0] = this->buff - this->start; mode = mode_array_brackets_and_subshell; break; } @@ -315,13 +299,13 @@ void tokenizer_t::read_string() { } TOK_CALL_ERROR(this, TOK_UNTERMINATED_SUBSHELL, PARAN_ERROR, - this->orig_buff + offset_of_open_paran); + this->start + offset_of_open_paran); break; } case mode_array_brackets: case mode_array_brackets_and_subshell: { TOK_CALL_ERROR(this, TOK_UNTERMINATED_SLICE, SQUARE_BRACKET_ERROR, - this->orig_buff + offset_of_bracket); + this->start + offset_of_bracket); break; } default: { @@ -332,9 +316,9 @@ void tokenizer_t::read_string() { return; } - len = this->buff - start; + len = this->buff - buff_start; - this->last_token.assign(start, len); + this->last_token.assign(buff_start, len); this->last_type = TOK_STRING; } @@ -483,16 +467,9 @@ int oflags_for_redirection_type(enum token_type type) { /// to be whitespace. static bool my_iswspace(wchar_t c) { return c != L'\n' && iswspace(c); } -void tokenizer_t::tok_next() { - if (this->last_type == TOK_ERROR) { - this->has_next = false; - return; - } - +bool tokenizer_t::tok_next() { if (!this->has_next) { - // fwprintf(stdout, L"EOL\n" ); - this->last_type = TOK_END; - return; + return false; } while (1) { @@ -508,29 +485,28 @@ void tokenizer_t::tok_next() { while (*this->buff == L'#') { if (this->show_comments) { - this->last_pos = this->buff - this->orig_buff; + this->last_pos = this->buff - this->start; this->read_comment(); if (this->buff[0] == L'\n' && this->continue_line_after_comment) this->buff++; - return; + return true; } - while (*(this->buff) != L'\n' && *(this->buff) != L'\0') this->buff++; + while (*this->buff != L'\n' && *this->buff != L'\0') this->buff++; if (this->buff[0] == L'\n' && this->continue_line_after_comment) this->buff++; - while (my_iswspace(*(this->buff))) this->buff++; + while (my_iswspace(*this->buff)) this->buff++; } this->continue_line_after_comment = false; - this->last_pos = this->buff - this->orig_buff; + this->last_pos = this->buff - this->start; switch (*this->buff) { case L'\0': { this->last_type = TOK_END; - // fwprintf( stderr, L"End of string\n" ); this->has_next = false; this->last_token.clear(); - break; + return false; } case L'\r': // carriage-return case L'\n': // newline @@ -604,11 +580,12 @@ void tokenizer_t::tok_next() { break; } } + return true; } wcstring tok_first(const wcstring &str) { wcstring result; - tokenizer_t t(str.c_str(), TOK_SQUASH_ERRORS); + tokenizer_t t(str.data(), TOK_SQUASH_ERRORS); tok_t token; if (t.next(&token) && token.type == TOK_STRING) { result = std::move(token.text); diff --git a/src/tokenizer.h b/src/tokenizer.h index c87a696b2..f870aaff9 100644 --- a/src/tokenizer.h +++ b/src/tokenizer.h @@ -71,41 +71,41 @@ struct tok_t { /// The tokenizer struct. class tokenizer_t { // No copying, etc. - tokenizer_t(const tokenizer_t &); - void operator=(const tokenizer_t &); + tokenizer_t(const tokenizer_t &) = delete; + void operator=(const tokenizer_t &) = delete; /// A pointer into the original string, showing where the next token begins. const wchar_t *buff; - /// A copy of the original string. - const wchar_t *orig_buff; + /// The start of the original string. + const wchar_t *const start; /// The last token. wcstring last_token; /// Type of last token. - enum token_type last_type; + enum token_type last_type { TOK_NONE }; /// Offset of last token. - size_t last_pos; + size_t last_pos{0}; /// Whether there are more tokens. - bool has_next; + bool has_next{true}; /// Whether incomplete tokens are accepted. - bool accept_unfinished; + bool accept_unfinished{false}; /// Whether comments should be returned. - bool show_comments; + bool show_comments{false}; /// Whether all blank lines are returned. - bool show_blank_lines; + bool show_blank_lines{false}; /// Last error. - tokenizer_error error; + tokenizer_error error{TOK_ERROR_NONE}; /// Last error offset, in "global" coordinates (relative to orig_buff). - size_t global_error_offset; + size_t global_error_offset{size_t(-1)}; /// Whether we are squashing errors. - bool squash_errors; + bool squash_errors{false}; /// Whether to continue the previous line after the comment. - bool continue_line_after_comment; + bool continue_line_after_comment{false}; void call_error(enum tokenizer_error error_type, const wchar_t *where, const wchar_t *error_message); void read_string(); void read_comment(); - void tok_next(); + bool tok_next(); public: /// Constructor for a tokenizer. b is the string that is to be tokenized. It is not copied, and