From 1a65e18ba8cfbfede331b04d8a091be8490c5a80 Mon Sep 17 00:00:00 2001 From: ridiculousfish Date: Sun, 13 Oct 2019 16:06:16 -0700 Subject: [PATCH] Clean up some tokenization Remove TOK_NONE Turn token_type into an enum class Make next() turn a maybe_t instead of a bool --- src/builtin_commandline.cpp | 9 ++- src/fish_test_helper.cpp | 8 +++ src/fish_tests.cpp | 96 +++++++++++++++-------------- src/parse_tree.cpp | 55 ++++++++--------- src/parse_util.cpp | 79 ++++++++++++------------ src/parse_util.h | 2 +- src/reader.cpp | 15 +++-- src/tokenizer.cpp | 116 ++++++++++++++++++------------------ src/tokenizer.h | 31 +++++----- 9 files changed, 210 insertions(+), 201 deletions(-) diff --git a/src/builtin_commandline.cpp b/src/builtin_commandline.cpp index 0c849612b..cf309b498 100644 --- a/src/builtin_commandline.cpp +++ b/src/builtin_commandline.cpp @@ -100,12 +100,11 @@ static void write_part(const wchar_t *begin, const wchar_t *end, int cut_at_curs wcstring out; wcstring buff(begin, end - begin); tokenizer_t tok(buff.c_str(), TOK_ACCEPT_UNFINISHED); - tok_t token; - while (tok.next(&token)) { - if ((cut_at_cursor) && (token.offset + token.length >= pos)) break; + while (auto token = tok.next()) { + if ((cut_at_cursor) && (token->offset + token->length >= pos)) break; - if (token.type == TOK_STRING) { - wcstring tmp = tok.text_of(token); + if (token->type == token_type_t::string) { + wcstring tmp = tok.text_of(*token); unescape_string_in_place(&tmp, UNESCAPE_INCOMPLETE); out.append(tmp); out.push_back(L'\n'); diff --git a/src/fish_test_helper.cpp b/src/fish_test_helper.cpp index 5ee10407e..dd70b3ed6 100644 --- a/src/fish_test_helper.cpp +++ b/src/fish_test_helper.cpp @@ -39,6 +39,12 @@ static void sigint_parent() { fprintf(stderr, "Sent SIGINT to %d\n", parent); } +static void print_stdout_stderr() { + fprintf(stdout, "stdout\n"); + fprintf(stderr, "stderr\n"); + fflush(nullptr); +} + int main(int argc, char *argv[]) { if (argc <= 1) { fprintf(stderr, "No commands given.\n"); @@ -51,6 +57,8 @@ int main(int argc, char *argv[]) { report_foreground(); } else if (!strcmp(argv[i], "sigint_parent")) { sigint_parent(); + } else if (!strcmp(argv[i], "print_stdout_stderr")) { + print_stdout_stderr(); } else { fprintf(stderr, "%s: Unknown command: %s\n", argv[0], argv[i]); return EXIT_FAILURE; diff --git a/src/fish_tests.cpp b/src/fish_tests.cpp index e5837704e..2a566fd23 100644 --- a/src/fish_tests.cpp +++ b/src/fish_tests.cpp @@ -564,29 +564,27 @@ static void test_convert_nulls() { /// Test the tokenizer. static void test_tokenizer() { say(L"Testing tokenizer"); - tok_t token; - { - bool got = false; const wchar_t *str = L"alpha beta"; tokenizer_t t(str, 0); + maybe_t token{}; - got = t.next(&token); // alpha - do_test(got); - do_test(token.type == TOK_STRING); - do_test(token.offset == 0); - do_test(token.length == 5); - do_test(t.text_of(token) == L"alpha"); + token = t.next(); // alpha + do_test(token.has_value()); + do_test(token->type == token_type_t::string); + do_test(token->offset == 0); + do_test(token->length == 5); + do_test(t.text_of(*token) == L"alpha"); - got = t.next(&token); // beta - do_test(got); - do_test(token.type == TOK_STRING); - do_test(token.offset == 6); - do_test(token.length == 4); - do_test(t.text_of(token) == L"beta"); + token = t.next(); // beta + do_test(token.has_value()); + do_test(token->type == token_type_t::string); + do_test(token->offset == 6); + do_test(token->length == 4); + do_test(t.text_of(*token) == L"beta"); - got = t.next(&token); - do_test(!got); + token = t.next(); + do_test(!token.has_value()); } const wchar_t *str = @@ -595,30 +593,31 @@ static void test_tokenizer() { L"&&& ||| " L"&& || & |" L"Compress_Newlines\n \n\t\n \nInto_Just_One"; - const int types[] = {TOK_STRING, TOK_REDIRECT, TOK_STRING, TOK_REDIRECT, TOK_STRING, - TOK_STRING, TOK_STRING, TOK_REDIRECT, TOK_REDIRECT, TOK_STRING, - TOK_ANDAND, TOK_BACKGROUND, TOK_OROR, TOK_PIPE, TOK_ANDAND, - TOK_OROR, TOK_BACKGROUND, TOK_PIPE, TOK_STRING, TOK_END, - TOK_STRING}; + using tt = token_type_t; + const token_type_t types[] = { + tt::string, tt::redirect, tt::string, tt::redirect, tt::string, tt::string, + tt::string, tt::redirect, tt::redirect, tt::string, tt::andand, tt::background, + tt::oror, tt::pipe, tt::andand, tt::oror, tt::background, tt::pipe, + tt::string, tt::end, tt::string}; say(L"Test correct tokenization"); { tokenizer_t t(str, 0); size_t i = 0; - while (t.next(&token)) { + while (auto token = t.next()) { if (i >= sizeof types / sizeof *types) { err(L"Too many tokens returned from tokenizer"); - std::fwprintf(stdout, L"Got excess token type %ld\n", (long)token.type); + std::fwprintf(stdout, L"Got excess token type %ld\n", (long)token->type); break; } - if (types[i] != token.type) { + if (types[i] != token->type) { err(L"Tokenization error:"); std::fwprintf( stdout, L"Token number %zu of string \n'%ls'\n, expected type %ld, got token type " L"%ld\n", - i + 1, str, (long)types[i], (long)token.type); + i + 1, str, (long)types[i], (long)token->type); } i++; } @@ -630,37 +629,44 @@ static void test_tokenizer() { // Test some errors. { tokenizer_t t(L"abc\\", 0); - do_test(t.next(&token)); - do_test(token.type == TOK_ERROR); - do_test(token.error == tokenizer_error_t::unterminated_escape); - do_test(token.error_offset == 3); + auto token = t.next(); + do_test(token.has_value()); + do_test(token->type == token_type_t::error); + do_test(token->error == tokenizer_error_t::unterminated_escape); + do_test(token->error_offset == 3); } { tokenizer_t t(L"abc )defg(hij", 0); - do_test(t.next(&token)); - do_test(t.next(&token)); - do_test(token.type == TOK_ERROR); - do_test(token.error == tokenizer_error_t::closing_unopened_subshell); - do_test(token.error_offset == 4); + auto token = t.next(); + do_test(token.has_value()); + token = t.next(); + do_test(token.has_value()); + do_test(token->type == token_type_t::error); + do_test(token->error == tokenizer_error_t::closing_unopened_subshell); + do_test(token->error_offset == 4); } { tokenizer_t t(L"abc defg(hij (klm)", 0); - do_test(t.next(&token)); - do_test(t.next(&token)); - do_test(token.type == TOK_ERROR); - do_test(token.error == tokenizer_error_t::unterminated_subshell); - do_test(token.error_offset == 4); + auto token = t.next(); + do_test(token.has_value()); + token = t.next(); + do_test(token.has_value()); + do_test(token->type == token_type_t::error); + do_test(token->error == tokenizer_error_t::unterminated_subshell); + do_test(token->error_offset == 4); } { tokenizer_t t(L"abc defg[hij (klm)", 0); - do_test(t.next(&token)); - do_test(t.next(&token)); - do_test(token.type == TOK_ERROR); - do_test(token.error == tokenizer_error_t::unterminated_slice); - do_test(token.error_offset == 4); + auto token = t.next(); + do_test(token.has_value()); + token = t.next(); + do_test(token.has_value()); + do_test(token->type == token_type_t::error); + do_test(token->error == tokenizer_error_t::unterminated_slice); + do_test(token->error_offset == 4); } // Test redirection_type_for_string. diff --git a/src/parse_tree.cpp b/src/parse_tree.cpp index d4e261416..6b216a135 100644 --- a/src/parse_tree.cpp +++ b/src/parse_tree.cpp @@ -18,6 +18,7 @@ #include "flog.h" #include "parse_constants.h" #include "parse_productions.h" +#include "parse_tree.h" #include "proc.h" #include "tnode.h" #include "tokenizer.h" @@ -235,28 +236,25 @@ wcstring parse_token_t::user_presentable_description() const { /// Convert from tokenizer_t's token type to a parse_token_t type. static inline parse_token_type_t parse_token_type_from_tokenizer_token( - enum token_type tokenizer_token_type) { + enum token_type_t tokenizer_token_type) { switch (tokenizer_token_type) { - case TOK_NONE: - DIE("TOK_NONE passed to parse_token_type_from_tokenizer_token"); - return token_type_invalid; - case TOK_STRING: + case token_type_t::string: return parse_token_type_string; - case TOK_PIPE: + case token_type_t::pipe: return parse_token_type_pipe; - case TOK_ANDAND: + case token_type_t::andand: return parse_token_type_andand; - case TOK_OROR: + case token_type_t::oror: return parse_token_type_oror; - case TOK_END: + case token_type_t::end: return parse_token_type_end; - case TOK_BACKGROUND: + case token_type_t::background: return parse_token_type_background; - case TOK_REDIRECT: + case token_type_t::redirect: return parse_token_type_redirection; - case TOK_ERROR: + case token_type_t::error: return parse_special_type_tokenizer_error; - case TOK_COMMENT: + case token_type_t::comment: return parse_special_type_comment; } FLOGF(error, L"Bad token type %d passed to %s", (int)tokenizer_token_type, __FUNCTION__); @@ -960,9 +958,9 @@ static bool is_keyword_char(wchar_t c) { } /// Given a token, returns the keyword it matches, or parse_keyword_none. -static parse_keyword_t keyword_for_token(token_type tok, const wcstring &token) { +static parse_keyword_t keyword_for_token(token_type_t tok, const wcstring &token) { /* Only strings can be keywords */ - if (tok != TOK_STRING) { + if (tok != token_type_t::string) { return parse_keyword_none; } @@ -1009,32 +1007,35 @@ static inline bool is_help_argument(const wcstring &txt) { } /// Return a new parse token, advancing the tokenizer. -static inline parse_token_t next_parse_token(tokenizer_t *tok, tok_t *token, wcstring *storage) { - if (!tok->next(token)) { +static inline parse_token_t next_parse_token(tokenizer_t *tok, maybe_t *out_token, + wcstring *storage) { + *out_token = tok->next(); + if (!out_token->has_value()) { return kTerminalToken; } + const tok_t &token = **out_token; // Set the type, keyword, and whether there's a dash prefix. Note that this is quite sketchy, // because it ignores quotes. This is the historical behavior. For example, `builtin --names` // lists builtins, but `builtin "--names"` attempts to run --names as a command. Amazingly as of // this writing (10/12/13) nobody seems to have noticed this. Squint at it really hard and it // even starts to look like a feature. - parse_token_t result{parse_token_type_from_tokenizer_token(token->type)}; - const wcstring &text = tok->copy_text_of(*token, storage); - result.keyword = keyword_for_token(token->type, text); + parse_token_t result{parse_token_type_from_tokenizer_token(token.type)}; + const wcstring &text = tok->copy_text_of(token, storage); + result.keyword = keyword_for_token(token.type, text); result.has_dash_prefix = !text.empty() && text.at(0) == L'-'; result.is_help_argument = result.has_dash_prefix && is_help_argument(text); result.is_newline = (result.type == parse_token_type_end && text == L"\n"); - result.preceding_escaped_nl = token->preceding_escaped_nl; + result.preceding_escaped_nl = token.preceding_escaped_nl; // These assertions are totally bogus. Basically our tokenizer works in size_t but we work in // uint32_t to save some space. If we have a source file larger than 4 GB, we'll probably just // crash. - assert(token->offset < SOURCE_OFFSET_INVALID); - result.source_start = (source_offset_t)token->offset; + assert(token.offset < SOURCE_OFFSET_INVALID); + result.source_start = (source_offset_t)token.offset; - assert(token->length <= SOURCE_OFFSET_INVALID); - result.source_length = (source_offset_t)token->length; + assert(token.length <= SOURCE_OFFSET_INVALID); + result.source_length = (source_offset_t)token.length; return result; } @@ -1063,7 +1064,7 @@ bool parse_tree_from_string(const wcstring &str, parse_tree_flags_t parse_flags, parse_token_t queue[2] = {kInvalidToken, kInvalidToken}; // Loop until we have a terminal token. - tok_t tokenizer_token; + maybe_t tokenizer_token{}; for (size_t token_count = 0; queue[0].type != parse_token_type_terminate; token_count++) { // Push a new token onto the queue. queue[0] = queue[1]; @@ -1084,7 +1085,7 @@ bool parse_tree_from_string(const wcstring &str, parse_tree_flags_t parse_flags, // Handle tokenizer errors. This is a hack because really the parser should report this for // itself; but it has no way of getting the tokenizer message. if (queue[1].type == parse_special_type_tokenizer_error) { - parser.report_tokenizer_error(tokenizer_token); + parser.report_tokenizer_error(*tokenizer_token); } if (!parser.has_fatal_error()) { diff --git a/src/parse_util.cpp b/src/parse_util.cpp index 388baae62..a3c9e2616 100644 --- a/src/parse_util.cpp +++ b/src/parse_util.cpp @@ -20,6 +20,7 @@ #include "fallback.h" // IWYU pragma: keep #include "future_feature_flags.h" #include "parse_constants.h" +#include "parse_util.h" #include "parser.h" #include "tnode.h" #include "tokenizer.h" @@ -310,32 +311,32 @@ static void job_or_process_extent(const wchar_t *buff, size_t cursor_pos, const assert(buffcpy != NULL); tokenizer_t tok(buffcpy, TOK_ACCEPT_UNFINISHED); - tok_t token; - while (tok.next(&token) && !finished) { - size_t tok_begin = token.offset; + for (maybe_t token = tok.next(); token && !finished; token = tok.next()) + while ((token = tok.next()) && !finished) { + size_t tok_begin = token->offset; - switch (token.type) { - case TOK_PIPE: { - if (!process) { + switch (token->type) { + case token_type_t::pipe: { + if (!process) { + break; + } + } + /* FALLTHROUGH */ + case token_type_t::end: + case token_type_t::background: { + if (tok_begin >= pos) { + finished = 1; + if (b) *b = (wchar_t *)begin + tok_begin; + } else { + if (a) *a = (wchar_t *)begin + tok_begin + 1; + } + break; + } + default: { break; } } - /* FALLTHROUGH */ - case TOK_END: - case TOK_BACKGROUND: { - if (tok_begin >= pos) { - finished = 1; - if (b) *b = (wchar_t *)begin + tok_begin; - } else { - if (a) *a = (wchar_t *)begin + tok_begin + 1; - } - break; - } - default: { - break; - } } - } free(buffcpy); } @@ -380,14 +381,13 @@ void parse_util_token_extent(const wchar_t *buff, size_t cursor_pos, const wchar const wcstring buffcpy = wcstring(cmdsubst_begin, cmdsubst_end - cmdsubst_begin); tokenizer_t tok(buffcpy.c_str(), TOK_ACCEPT_UNFINISHED); - tok_t token; - while (tok.next(&token)) { - size_t tok_begin = token.offset; + while (maybe_t token = tok.next()) { + size_t tok_begin = token->offset; size_t tok_end = tok_begin; // Calculate end of token. - if (token.type == TOK_STRING) { - tok_end += token.length; + if (token->type == token_type_t::string) { + tok_end += token->length; } // Cursor was before beginning of this token, means that the cursor is between two tokens, @@ -399,16 +399,16 @@ void parse_util_token_extent(const wchar_t *buff, size_t cursor_pos, const wchar // If cursor is inside the token, this is the token we are looking for. If so, set a and b // and break. - if (token.type == TOK_STRING && tok_end >= offset_within_cmdsubst) { - a = cmdsubst_begin + token.offset; - b = a + token.length; + if (token->type == token_type_t::string && tok_end >= offset_within_cmdsubst) { + a = cmdsubst_begin + token->offset; + b = a + token->length; break; } // Remember previous string token. - if (token.type == TOK_STRING) { - pa = cmdsubst_begin + token.offset; - pb = pa + token.length; + if (token->type == token_type_t::string) { + pa = cmdsubst_begin + token->offset; + pb = pa + token->length; } } @@ -482,21 +482,20 @@ static wchar_t get_quote(const wcstring &cmd_str, size_t len) { } void parse_util_get_parameter_info(const wcstring &cmd, const size_t pos, wchar_t *quote, - size_t *offset, enum token_type *out_type) { + size_t *offset, token_type_t *out_type) { size_t prev_pos = 0; wchar_t last_quote = L'\0'; tokenizer_t tok(cmd.c_str(), TOK_ACCEPT_UNFINISHED); - tok_t token; - while (tok.next(&token)) { - if (token.offset > pos) break; + while (auto token = tok.next()) { + if (token->offset > pos) break; - if (token.type == TOK_STRING) - last_quote = get_quote(tok.text_of(token), pos - token.offset); + if (token->type == token_type_t::string) + last_quote = get_quote(tok.text_of(*token), pos - token->offset); - if (out_type != NULL) *out_type = token.type; + if (out_type != NULL) *out_type = token->type; - prev_pos = token.offset; + prev_pos = token->offset; } wchar_t *cmd_tmp = wcsdup(cmd.c_str()); diff --git a/src/parse_util.h b/src/parse_util.h index e862b796f..a3e7a849e 100644 --- a/src/parse_util.h +++ b/src/parse_util.h @@ -110,7 +110,7 @@ bool parse_util_argument_is_help(const wchar_t *s); /// \param offset If not NULL, get_param will store the offset to the beginning of the parameter. /// \param out_type If not NULL, get_param will store the token type. void parse_util_get_parameter_info(const wcstring &cmd, const size_t pos, wchar_t *quote, - size_t *offset, enum token_type *out_type); + size_t *offset, token_type_t *out_type); /// Attempts to escape the string 'cmd' using the given quote type, as determined by the quote /// character. The quote can be a single quote or double quote, or L'\0' to indicate no quoting (and diff --git a/src/reader.cpp b/src/reader.cpp index 5377e0647..13713bd92 100644 --- a/src/reader.cpp +++ b/src/reader.cpp @@ -199,12 +199,11 @@ class reader_history_search_t { } else if (mode_ == token) { const wcstring &needle = search_string(); tokenizer_t tok(text.c_str(), TOK_ACCEPT_UNFINISHED); - tok_t token; wcstring_list_t local_tokens; - while (tok.next(&token)) { - if (token.type != TOK_STRING) continue; - wcstring text = tok.text_of(token); + while (auto token = tok.next()) { + if (token->type != token_type_t::string) continue; + wcstring text = tok.text_of(*token); if (text.find(needle) != wcstring::npos) { local_tokens.emplace_back(std::move(text)); } @@ -2346,11 +2345,11 @@ static wchar_t unescaped_quote(const wcstring &str, size_t pos) { /// Returns true if the last token is a comment. static bool text_ends_in_comment(const wcstring &text) { tokenizer_t tok(text.c_str(), TOK_ACCEPT_UNFINISHED | TOK_SHOW_COMMENTS); - tok_t token; - while (tok.next(&token)) { - ; // pass + bool is_comment = false; + while (auto token = tok.next()) { + is_comment = token->type == token_type_t::comment; } - return token.type == TOK_COMMENT; + return is_comment; } /// \return true if an event is a normal character that should be inserted into the buffer. diff --git a/src/tokenizer.cpp b/src/tokenizer.cpp index 14287bcf2..5e9235e0a 100644 --- a/src/tokenizer.cpp +++ b/src/tokenizer.cpp @@ -16,6 +16,7 @@ #include "common.h" #include "fallback.h" // IWYU pragma: keep #include "future_feature_flags.h" +#include "tokenizer.h" #include "wutil.h" // IWYU pragma: keep // _(s) is already wgettext(s).c_str(), so let's not convert back to wcstring @@ -64,8 +65,7 @@ tok_t tokenizer_t::call_error(tokenizer_error_t error_type, const wchar_t *token this->has_next = false; - tok_t result; - result.type = TOK_ERROR; + tok_t result{token_type_t::error}; result.error = error_type; result.offset = token_start - this->start; result.length = this->buff - token_start; @@ -81,15 +81,7 @@ tokenizer_t::tokenizer_t(const wchar_t *start, tok_flags_t flags) : buff(start), this->show_blank_lines = static_cast(flags & TOK_SHOW_BLANK_LINES); } -bool tokenizer_t::next(struct tok_t *result) { - assert(result != NULL); - maybe_t tok = this->tok_next(); - if (!tok) { - return false; - } - *result = std::move(*tok); - return true; -} +tok_t::tok_t(token_type_t type) : type(type) {} /// Tests if this character can be a part of a string. The redirect ^ is allowed unless it's the /// first character. Hash (#) starts a comment if it's the first character in a token; otherwise it @@ -252,31 +244,30 @@ tok_t tokenizer_t::read_string() { } if ((!this->accept_unfinished) && (mode != tok_modes::regular_text)) { - tok_t error; - if ((mode & tok_modes::char_escape) == tok_modes::char_escape) { - error = this->call_error(tokenizer_error_t::unterminated_escape, buff_start, - this->buff - 1); - } else if ((mode & tok_modes::array_brackets) == tok_modes::array_brackets) { - error = this->call_error(tokenizer_error_t::unterminated_slice, buff_start, - this->start + slice_offset); - } else if ((mode & tok_modes::subshell) == tok_modes::subshell) { + if (mode & tok_modes::char_escape) { + return this->call_error(tokenizer_error_t::unterminated_escape, buff_start, + this->buff - 1); + } else if (mode & tok_modes::array_brackets) { + return this->call_error(tokenizer_error_t::unterminated_slice, buff_start, + this->start + slice_offset); + } else if (mode & tok_modes::subshell) { assert(paran_offsets.size() > 0); size_t offset_of_open_paran = paran_offsets.back(); - error = this->call_error(tokenizer_error_t::unterminated_subshell, buff_start, - this->start + offset_of_open_paran); - } else if ((mode & tok_modes::curly_braces) == tok_modes::curly_braces) { + return this->call_error(tokenizer_error_t::unterminated_subshell, buff_start, + this->start + offset_of_open_paran); + } else if (mode & tok_modes::curly_braces) { assert(brace_offsets.size() > 0); size_t offset_of_open_brace = brace_offsets.back(); - error = this->call_error(tokenizer_error_t::unterminated_brace, buff_start, - this->start + offset_of_open_brace); + return this->call_error(tokenizer_error_t::unterminated_brace, buff_start, + this->start + offset_of_open_brace); + } else { + DIE("Unknown non-regular-text mode"); } - return error; } - tok_t result; - result.type = TOK_STRING; + tok_t result(token_type_t::string); result.offset = buff_start - this->start; result.length = this->buff - buff_start; return result; @@ -289,7 +280,7 @@ struct parsed_redir_or_pipe_t { size_t consumed{0}; // The token type, always either TOK_PIPE or TOK_REDIRECT. - token_type type{TOK_REDIRECT}; + token_type_t type{token_type_t::redirect}; // The redirection mode if the type is TOK_REDIRECT. redirection_type_t redirection_mode{redirection_type_t::overwrite}; @@ -373,7 +364,7 @@ static maybe_t read_redirection_or_fd_pipe(const wchar_t } else if (opt_char == L'|') { // So the string looked like '2>|'. This is not a redirection - it's a pipe! That gets // handled elsewhere. - result.type = TOK_PIPE; + result.type = token_type_t::pipe; idx++; } @@ -384,7 +375,7 @@ static maybe_t read_redirection_or_fd_pipe(const wchar_t maybe_t redirection_type_for_string(const wcstring &str, int *out_fd) { auto v = read_redirection_or_fd_pipe(str.c_str()); // Redirections only, no pipes. - if (!v || v->type != TOK_REDIRECT || v->fd < 0) return none(); + if (!v || v->type != token_type_t::redirect || v->fd < 0) return none(); if (out_fd) *out_fd = v->fd; return v->redirection_mode; } @@ -395,7 +386,7 @@ int fd_redirected_by_pipe(const wcstring &str) { return STDOUT_FILENO; } auto v = read_redirection_or_fd_pipe(str.c_str()); - return (v && v->type == TOK_PIPE) ? v->fd : -1; + return (v && v->type == token_type_t::pipe) ? v->fd : -1; } int oflags_for_redirection_type(redirection_type_t type) { @@ -434,7 +425,7 @@ static bool iswspace_not_nl(wchar_t c) { } } -maybe_t tokenizer_t::tok_next() { +maybe_t tokenizer_t::next() { if (!this->has_next) { return none(); } @@ -464,8 +455,7 @@ maybe_t tokenizer_t::tok_next() { // Maybe return the comment. if (this->show_comments) { - tok_t result; - result.type = TOK_COMMENT; + tok_t result(token_type_t::comment); result.offset = comment_start - this->start; result.length = comment_len; result.preceding_escaped_nl = preceding_escaped_nl; @@ -476,10 +466,9 @@ maybe_t tokenizer_t::tok_next() { // We made it past the comments and ate any trailing newlines we wanted to ignore. this->continue_line_after_comment = false; - size_t start_pos = this->buff - this->start; + const size_t start_pos = this->buff - this->start; - tok_t result; - result.offset = start_pos; + maybe_t result{}; switch (*this->buff) { case L'\0': { this->has_next = false; @@ -488,8 +477,9 @@ maybe_t tokenizer_t::tok_next() { case L'\r': // carriage-return case L'\n': // newline case L';': { - result.type = TOK_END; - result.length = 1; + result.emplace(token_type_t::end); + result->offset = start_pos; + result->length = 1; this->buff++; // Hack: when we get a newline, swallow as many as we can. This compresses multiple // subsequent newlines into a single one. @@ -503,25 +493,29 @@ maybe_t tokenizer_t::tok_next() { } case L'&': { if (this->buff[1] == L'&') { - result.type = TOK_ANDAND; - result.length = 2; + result.emplace(token_type_t::andand); + result->offset = start_pos; + result->length = 2; this->buff += 2; } else { - result.type = TOK_BACKGROUND; - result.length = 1; + result.emplace(token_type_t::background); + result->offset = start_pos; + result->length = 1; this->buff++; } break; } case L'|': { if (this->buff[1] == L'|') { - result.type = TOK_OROR; - result.length = 2; + result.emplace(token_type_t::oror); + result->offset = start_pos; + result->length = 2; this->buff += 2; } else { - result.type = TOK_PIPE; - result.redirected_fd = 1; - result.length = 1; + result.emplace(token_type_t::pipe); + result->redirected_fd = 1; + result->offset = start_pos; + result->length = 1; this->buff++; } break; @@ -535,9 +529,10 @@ maybe_t tokenizer_t::tok_next() { return this->call_error(tokenizer_error_t::invalid_redirect, this->buff, this->buff); } - result.type = redir_or_pipe->type; - result.redirected_fd = redir_or_pipe->fd; - result.length = redir_or_pipe->consumed; + result.emplace(redir_or_pipe->type); + result->offset = start_pos; + result->redirected_fd = redir_or_pipe->fd; + result->length = redir_or_pipe->consumed; this->buff += redir_or_pipe->consumed; break; } @@ -553,13 +548,14 @@ maybe_t tokenizer_t::tok_next() { // It looks like a redirection or a pipe. But we don't support piping fd 0. Note // that fd 0 may be -1, indicating overflow; but we don't treat that as a tokenizer // error. - if (redir_or_pipe->type == TOK_PIPE && redir_or_pipe->fd == 0) { + if (redir_or_pipe->type == token_type_t::pipe && redir_or_pipe->fd == 0) { return this->call_error(tokenizer_error_t::invalid_pipe, error_location, error_location); } - result.type = redir_or_pipe->type; - result.redirected_fd = redir_or_pipe->fd; - result.length = redir_or_pipe->consumed; + result.emplace(redir_or_pipe->type); + result->redirected_fd = redir_or_pipe->fd; + result->offset = start_pos; + result->length = redir_or_pipe->consumed; this->buff += redir_or_pipe->consumed; } else { // Not a redirection or pipe, so just a string. @@ -568,15 +564,17 @@ maybe_t tokenizer_t::tok_next() { break; } } - result.preceding_escaped_nl = preceding_escaped_nl; + assert(result.has_value() && "Should have a token"); + result->preceding_escaped_nl = preceding_escaped_nl; return result; } wcstring tok_first(const wcstring &str) { tokenizer_t t(str.c_str(), 0); - tok_t token; - if (t.next(&token) && token.type == TOK_STRING) { - return t.text_of(token); + if (auto token = t.next()) { + if (token->type == token_type_t::string) { + return t.text_of(*token); + } } return {}; } diff --git a/src/tokenizer.h b/src/tokenizer.h index 0d0527e86..35b3aeb3e 100644 --- a/src/tokenizer.h +++ b/src/tokenizer.h @@ -10,17 +10,16 @@ #include "parse_constants.h" /// Token types. -enum token_type { - TOK_NONE, /// Tokenizer not yet constructed - TOK_ERROR, /// Error reading token - TOK_STRING, /// String token - TOK_PIPE, /// Pipe token - TOK_ANDAND, /// && token - TOK_OROR, /// || token - TOK_END, /// End token (semicolon or newline, not literal end) - TOK_REDIRECT, /// redirection token - TOK_BACKGROUND, /// send job to bg token - TOK_COMMENT /// comment token +enum class token_type_t { + error, /// Error reading token + string, /// String token + pipe, /// Pipe token + andand, /// && token + oror, /// || token + end, /// End token (semicolon or newline, not literal end) + redirect, /// redirection token + background, /// send job to bg token + comment, /// comment token }; enum class redirection_type_t { @@ -65,7 +64,7 @@ const wchar_t *tokenizer_get_error_message(tokenizer_error_t err); struct tok_t { // The type of the token. - token_type type{TOK_NONE}; + token_type_t type; // Offset of the token. size_t offset{0}; @@ -85,7 +84,8 @@ struct tok_t { // at 'offset'. size_t error_offset{size_t(-1)}; - tok_t() = default; + // Construct from a token type. + explicit tok_t(token_type_t type); }; /// The tokenizer struct. @@ -112,7 +112,6 @@ class tokenizer_t { tok_t call_error(tokenizer_error_t error_type, const wchar_t *token_start, const wchar_t *error_loc); tok_t read_string(); - maybe_t tok_next(); public: /// Constructor for a tokenizer. b is the string that is to be tokenized. It is not copied, and @@ -124,8 +123,8 @@ class tokenizer_t { /// token. Setting TOK_SHOW_COMMENTS will return comments as tokens tokenizer_t(const wchar_t *b, tok_flags_t flags); - /// Returns the next token by reference. Returns true if we got one, false if we're at the end. - bool next(struct tok_t *result); + /// Returns the next token, or none() if we are at the end. + maybe_t next(); /// Returns the text of a token, as a string. wcstring text_of(const tok_t &tok) const { return wcstring(start + tok.offset, tok.length); }