Clean up some tokenization

Remove TOK_NONE
Turn token_type into an enum class
Make next() return a maybe_t<tok_t> instead of a bool
This commit is contained in:
ridiculousfish
2019-10-13 16:06:16 -07:00
parent 82eca4bc86
commit 1a65e18ba8
9 changed files with 210 additions and 201 deletions

View File

@@ -100,12 +100,11 @@ static void write_part(const wchar_t *begin, const wchar_t *end, int cut_at_curs
wcstring out;
wcstring buff(begin, end - begin);
tokenizer_t tok(buff.c_str(), TOK_ACCEPT_UNFINISHED);
tok_t token;
while (tok.next(&token)) {
if ((cut_at_cursor) && (token.offset + token.length >= pos)) break;
while (auto token = tok.next()) {
if ((cut_at_cursor) && (token->offset + token->length >= pos)) break;
if (token.type == TOK_STRING) {
wcstring tmp = tok.text_of(token);
if (token->type == token_type_t::string) {
wcstring tmp = tok.text_of(*token);
unescape_string_in_place(&tmp, UNESCAPE_INCOMPLETE);
out.append(tmp);
out.push_back(L'\n');

View File

@@ -39,6 +39,12 @@ static void sigint_parent() {
fprintf(stderr, "Sent SIGINT to %d\n", parent);
}
/// Emit one marker line on each standard stream ("stdout" on stdout, "stderr" on stderr), then
/// flush so neither line is left sitting in a stdio buffer.
static void print_stdout_stderr() {
    fputs("stdout\n", stdout);
    fputs("stderr\n", stderr);
    // A null stream argument asks fflush to flush every open output stream.
    fflush(nullptr);
}
int main(int argc, char *argv[]) {
if (argc <= 1) {
fprintf(stderr, "No commands given.\n");
@@ -51,6 +57,8 @@ int main(int argc, char *argv[]) {
report_foreground();
} else if (!strcmp(argv[i], "sigint_parent")) {
sigint_parent();
} else if (!strcmp(argv[i], "print_stdout_stderr")) {
print_stdout_stderr();
} else {
fprintf(stderr, "%s: Unknown command: %s\n", argv[0], argv[i]);
return EXIT_FAILURE;

View File

@@ -564,29 +564,27 @@ static void test_convert_nulls() {
/// Test the tokenizer.
static void test_tokenizer() {
say(L"Testing tokenizer");
tok_t token;
{
bool got = false;
const wchar_t *str = L"alpha beta";
tokenizer_t t(str, 0);
maybe_t<tok_t> token{};
got = t.next(&token); // alpha
do_test(got);
do_test(token.type == TOK_STRING);
do_test(token.offset == 0);
do_test(token.length == 5);
do_test(t.text_of(token) == L"alpha");
token = t.next(); // alpha
do_test(token.has_value());
do_test(token->type == token_type_t::string);
do_test(token->offset == 0);
do_test(token->length == 5);
do_test(t.text_of(*token) == L"alpha");
got = t.next(&token); // beta
do_test(got);
do_test(token.type == TOK_STRING);
do_test(token.offset == 6);
do_test(token.length == 4);
do_test(t.text_of(token) == L"beta");
token = t.next(); // beta
do_test(token.has_value());
do_test(token->type == token_type_t::string);
do_test(token->offset == 6);
do_test(token->length == 4);
do_test(t.text_of(*token) == L"beta");
got = t.next(&token);
do_test(!got);
token = t.next();
do_test(!token.has_value());
}
const wchar_t *str =
@@ -595,30 +593,31 @@ static void test_tokenizer() {
L"&&& ||| "
L"&& || & |"
L"Compress_Newlines\n \n\t\n \nInto_Just_One";
const int types[] = {TOK_STRING, TOK_REDIRECT, TOK_STRING, TOK_REDIRECT, TOK_STRING,
TOK_STRING, TOK_STRING, TOK_REDIRECT, TOK_REDIRECT, TOK_STRING,
TOK_ANDAND, TOK_BACKGROUND, TOK_OROR, TOK_PIPE, TOK_ANDAND,
TOK_OROR, TOK_BACKGROUND, TOK_PIPE, TOK_STRING, TOK_END,
TOK_STRING};
using tt = token_type_t;
const token_type_t types[] = {
tt::string, tt::redirect, tt::string, tt::redirect, tt::string, tt::string,
tt::string, tt::redirect, tt::redirect, tt::string, tt::andand, tt::background,
tt::oror, tt::pipe, tt::andand, tt::oror, tt::background, tt::pipe,
tt::string, tt::end, tt::string};
say(L"Test correct tokenization");
{
tokenizer_t t(str, 0);
size_t i = 0;
while (t.next(&token)) {
while (auto token = t.next()) {
if (i >= sizeof types / sizeof *types) {
err(L"Too many tokens returned from tokenizer");
std::fwprintf(stdout, L"Got excess token type %ld\n", (long)token.type);
std::fwprintf(stdout, L"Got excess token type %ld\n", (long)token->type);
break;
}
if (types[i] != token.type) {
if (types[i] != token->type) {
err(L"Tokenization error:");
std::fwprintf(
stdout,
L"Token number %zu of string \n'%ls'\n, expected type %ld, got token type "
L"%ld\n",
i + 1, str, (long)types[i], (long)token.type);
i + 1, str, (long)types[i], (long)token->type);
}
i++;
}
@@ -630,37 +629,44 @@ static void test_tokenizer() {
// Test some errors.
{
tokenizer_t t(L"abc\\", 0);
do_test(t.next(&token));
do_test(token.type == TOK_ERROR);
do_test(token.error == tokenizer_error_t::unterminated_escape);
do_test(token.error_offset == 3);
auto token = t.next();
do_test(token.has_value());
do_test(token->type == token_type_t::error);
do_test(token->error == tokenizer_error_t::unterminated_escape);
do_test(token->error_offset == 3);
}
{
tokenizer_t t(L"abc )defg(hij", 0);
do_test(t.next(&token));
do_test(t.next(&token));
do_test(token.type == TOK_ERROR);
do_test(token.error == tokenizer_error_t::closing_unopened_subshell);
do_test(token.error_offset == 4);
auto token = t.next();
do_test(token.has_value());
token = t.next();
do_test(token.has_value());
do_test(token->type == token_type_t::error);
do_test(token->error == tokenizer_error_t::closing_unopened_subshell);
do_test(token->error_offset == 4);
}
{
tokenizer_t t(L"abc defg(hij (klm)", 0);
do_test(t.next(&token));
do_test(t.next(&token));
do_test(token.type == TOK_ERROR);
do_test(token.error == tokenizer_error_t::unterminated_subshell);
do_test(token.error_offset == 4);
auto token = t.next();
do_test(token.has_value());
token = t.next();
do_test(token.has_value());
do_test(token->type == token_type_t::error);
do_test(token->error == tokenizer_error_t::unterminated_subshell);
do_test(token->error_offset == 4);
}
{
tokenizer_t t(L"abc defg[hij (klm)", 0);
do_test(t.next(&token));
do_test(t.next(&token));
do_test(token.type == TOK_ERROR);
do_test(token.error == tokenizer_error_t::unterminated_slice);
do_test(token.error_offset == 4);
auto token = t.next();
do_test(token.has_value());
token = t.next();
do_test(token.has_value());
do_test(token->type == token_type_t::error);
do_test(token->error == tokenizer_error_t::unterminated_slice);
do_test(token->error_offset == 4);
}
// Test redirection_type_for_string.

View File

@@ -18,6 +18,7 @@
#include "flog.h"
#include "parse_constants.h"
#include "parse_productions.h"
#include "parse_tree.h"
#include "proc.h"
#include "tnode.h"
#include "tokenizer.h"
@@ -235,28 +236,25 @@ wcstring parse_token_t::user_presentable_description() const {
/// Convert from tokenizer_t's token type to a parse_token_t type.
static inline parse_token_type_t parse_token_type_from_tokenizer_token(
enum token_type tokenizer_token_type) {
enum token_type_t tokenizer_token_type) {
switch (tokenizer_token_type) {
case TOK_NONE:
DIE("TOK_NONE passed to parse_token_type_from_tokenizer_token");
return token_type_invalid;
case TOK_STRING:
case token_type_t::string:
return parse_token_type_string;
case TOK_PIPE:
case token_type_t::pipe:
return parse_token_type_pipe;
case TOK_ANDAND:
case token_type_t::andand:
return parse_token_type_andand;
case TOK_OROR:
case token_type_t::oror:
return parse_token_type_oror;
case TOK_END:
case token_type_t::end:
return parse_token_type_end;
case TOK_BACKGROUND:
case token_type_t::background:
return parse_token_type_background;
case TOK_REDIRECT:
case token_type_t::redirect:
return parse_token_type_redirection;
case TOK_ERROR:
case token_type_t::error:
return parse_special_type_tokenizer_error;
case TOK_COMMENT:
case token_type_t::comment:
return parse_special_type_comment;
}
FLOGF(error, L"Bad token type %d passed to %s", (int)tokenizer_token_type, __FUNCTION__);
@@ -960,9 +958,9 @@ static bool is_keyword_char(wchar_t c) {
}
/// Given a token, returns the keyword it matches, or parse_keyword_none.
static parse_keyword_t keyword_for_token(token_type tok, const wcstring &token) {
static parse_keyword_t keyword_for_token(token_type_t tok, const wcstring &token) {
/* Only strings can be keywords */
if (tok != TOK_STRING) {
if (tok != token_type_t::string) {
return parse_keyword_none;
}
@@ -1009,32 +1007,35 @@ static inline bool is_help_argument(const wcstring &txt) {
}
/// Return a new parse token, advancing the tokenizer.
static inline parse_token_t next_parse_token(tokenizer_t *tok, tok_t *token, wcstring *storage) {
if (!tok->next(token)) {
static inline parse_token_t next_parse_token(tokenizer_t *tok, maybe_t<tok_t> *out_token,
wcstring *storage) {
*out_token = tok->next();
if (!out_token->has_value()) {
return kTerminalToken;
}
const tok_t &token = **out_token;
// Set the type, keyword, and whether there's a dash prefix. Note that this is quite sketchy,
// because it ignores quotes. This is the historical behavior. For example, `builtin --names`
// lists builtins, but `builtin "--names"` attempts to run --names as a command. Amazingly as of
// this writing (10/12/13) nobody seems to have noticed this. Squint at it really hard and it
// even starts to look like a feature.
parse_token_t result{parse_token_type_from_tokenizer_token(token->type)};
const wcstring &text = tok->copy_text_of(*token, storage);
result.keyword = keyword_for_token(token->type, text);
parse_token_t result{parse_token_type_from_tokenizer_token(token.type)};
const wcstring &text = tok->copy_text_of(token, storage);
result.keyword = keyword_for_token(token.type, text);
result.has_dash_prefix = !text.empty() && text.at(0) == L'-';
result.is_help_argument = result.has_dash_prefix && is_help_argument(text);
result.is_newline = (result.type == parse_token_type_end && text == L"\n");
result.preceding_escaped_nl = token->preceding_escaped_nl;
result.preceding_escaped_nl = token.preceding_escaped_nl;
// These assertions are totally bogus. Basically our tokenizer works in size_t but we work in
// uint32_t to save some space. If we have a source file larger than 4 GB, we'll probably just
// crash.
assert(token->offset < SOURCE_OFFSET_INVALID);
result.source_start = (source_offset_t)token->offset;
assert(token.offset < SOURCE_OFFSET_INVALID);
result.source_start = (source_offset_t)token.offset;
assert(token->length <= SOURCE_OFFSET_INVALID);
result.source_length = (source_offset_t)token->length;
assert(token.length <= SOURCE_OFFSET_INVALID);
result.source_length = (source_offset_t)token.length;
return result;
}
@@ -1063,7 +1064,7 @@ bool parse_tree_from_string(const wcstring &str, parse_tree_flags_t parse_flags,
parse_token_t queue[2] = {kInvalidToken, kInvalidToken};
// Loop until we have a terminal token.
tok_t tokenizer_token;
maybe_t<tok_t> tokenizer_token{};
for (size_t token_count = 0; queue[0].type != parse_token_type_terminate; token_count++) {
// Push a new token onto the queue.
queue[0] = queue[1];
@@ -1084,7 +1085,7 @@ bool parse_tree_from_string(const wcstring &str, parse_tree_flags_t parse_flags,
// Handle tokenizer errors. This is a hack because really the parser should report this for
// itself; but it has no way of getting the tokenizer message.
if (queue[1].type == parse_special_type_tokenizer_error) {
parser.report_tokenizer_error(tokenizer_token);
parser.report_tokenizer_error(*tokenizer_token);
}
if (!parser.has_fatal_error()) {

View File

@@ -20,6 +20,7 @@
#include "fallback.h" // IWYU pragma: keep
#include "future_feature_flags.h"
#include "parse_constants.h"
#include "parse_util.h"
#include "parser.h"
#include "tnode.h"
#include "tokenizer.h"
@@ -310,32 +311,32 @@ static void job_or_process_extent(const wchar_t *buff, size_t cursor_pos, const
assert(buffcpy != NULL);
tokenizer_t tok(buffcpy, TOK_ACCEPT_UNFINISHED);
tok_t token;
while (tok.next(&token) && !finished) {
size_t tok_begin = token.offset;
for (maybe_t<tok_t> token = tok.next(); token && !finished; token = tok.next())
while ((token = tok.next()) && !finished) {
size_t tok_begin = token->offset;
switch (token.type) {
case TOK_PIPE: {
if (!process) {
switch (token->type) {
case token_type_t::pipe: {
if (!process) {
break;
}
}
/* FALLTHROUGH */
case token_type_t::end:
case token_type_t::background: {
if (tok_begin >= pos) {
finished = 1;
if (b) *b = (wchar_t *)begin + tok_begin;
} else {
if (a) *a = (wchar_t *)begin + tok_begin + 1;
}
break;
}
default: {
break;
}
}
/* FALLTHROUGH */
case TOK_END:
case TOK_BACKGROUND: {
if (tok_begin >= pos) {
finished = 1;
if (b) *b = (wchar_t *)begin + tok_begin;
} else {
if (a) *a = (wchar_t *)begin + tok_begin + 1;
}
break;
}
default: {
break;
}
}
}
free(buffcpy);
}
@@ -380,14 +381,13 @@ void parse_util_token_extent(const wchar_t *buff, size_t cursor_pos, const wchar
const wcstring buffcpy = wcstring(cmdsubst_begin, cmdsubst_end - cmdsubst_begin);
tokenizer_t tok(buffcpy.c_str(), TOK_ACCEPT_UNFINISHED);
tok_t token;
while (tok.next(&token)) {
size_t tok_begin = token.offset;
while (maybe_t<tok_t> token = tok.next()) {
size_t tok_begin = token->offset;
size_t tok_end = tok_begin;
// Calculate end of token.
if (token.type == TOK_STRING) {
tok_end += token.length;
if (token->type == token_type_t::string) {
tok_end += token->length;
}
// Cursor was before beginning of this token, means that the cursor is between two tokens,
@@ -399,16 +399,16 @@ void parse_util_token_extent(const wchar_t *buff, size_t cursor_pos, const wchar
// If cursor is inside the token, this is the token we are looking for. If so, set a and b
// and break.
if (token.type == TOK_STRING && tok_end >= offset_within_cmdsubst) {
a = cmdsubst_begin + token.offset;
b = a + token.length;
if (token->type == token_type_t::string && tok_end >= offset_within_cmdsubst) {
a = cmdsubst_begin + token->offset;
b = a + token->length;
break;
}
// Remember previous string token.
if (token.type == TOK_STRING) {
pa = cmdsubst_begin + token.offset;
pb = pa + token.length;
if (token->type == token_type_t::string) {
pa = cmdsubst_begin + token->offset;
pb = pa + token->length;
}
}
@@ -482,21 +482,20 @@ static wchar_t get_quote(const wcstring &cmd_str, size_t len) {
}
void parse_util_get_parameter_info(const wcstring &cmd, const size_t pos, wchar_t *quote,
size_t *offset, enum token_type *out_type) {
size_t *offset, token_type_t *out_type) {
size_t prev_pos = 0;
wchar_t last_quote = L'\0';
tokenizer_t tok(cmd.c_str(), TOK_ACCEPT_UNFINISHED);
tok_t token;
while (tok.next(&token)) {
if (token.offset > pos) break;
while (auto token = tok.next()) {
if (token->offset > pos) break;
if (token.type == TOK_STRING)
last_quote = get_quote(tok.text_of(token), pos - token.offset);
if (token->type == token_type_t::string)
last_quote = get_quote(tok.text_of(*token), pos - token->offset);
if (out_type != NULL) *out_type = token.type;
if (out_type != NULL) *out_type = token->type;
prev_pos = token.offset;
prev_pos = token->offset;
}
wchar_t *cmd_tmp = wcsdup(cmd.c_str());

View File

@@ -110,7 +110,7 @@ bool parse_util_argument_is_help(const wchar_t *s);
/// \param offset If not NULL, get_param will store the offset to the beginning of the parameter.
/// \param out_type If not NULL, get_param will store the token type.
void parse_util_get_parameter_info(const wcstring &cmd, const size_t pos, wchar_t *quote,
size_t *offset, enum token_type *out_type);
size_t *offset, token_type_t *out_type);
/// Attempts to escape the string 'cmd' using the given quote type, as determined by the quote
/// character. The quote can be a single quote or double quote, or L'\0' to indicate no quoting (and

View File

@@ -199,12 +199,11 @@ class reader_history_search_t {
} else if (mode_ == token) {
const wcstring &needle = search_string();
tokenizer_t tok(text.c_str(), TOK_ACCEPT_UNFINISHED);
tok_t token;
wcstring_list_t local_tokens;
while (tok.next(&token)) {
if (token.type != TOK_STRING) continue;
wcstring text = tok.text_of(token);
while (auto token = tok.next()) {
if (token->type != token_type_t::string) continue;
wcstring text = tok.text_of(*token);
if (text.find(needle) != wcstring::npos) {
local_tokens.emplace_back(std::move(text));
}
@@ -2346,11 +2345,11 @@ static wchar_t unescaped_quote(const wcstring &str, size_t pos) {
/// Returns true if the last token is a comment.
static bool text_ends_in_comment(const wcstring &text) {
tokenizer_t tok(text.c_str(), TOK_ACCEPT_UNFINISHED | TOK_SHOW_COMMENTS);
tok_t token;
while (tok.next(&token)) {
; // pass
bool is_comment = false;
while (auto token = tok.next()) {
is_comment = token->type == token_type_t::comment;
}
return token.type == TOK_COMMENT;
return is_comment;
}
/// \return true if an event is a normal character that should be inserted into the buffer.

View File

@@ -16,6 +16,7 @@
#include "common.h"
#include "fallback.h" // IWYU pragma: keep
#include "future_feature_flags.h"
#include "tokenizer.h"
#include "wutil.h" // IWYU pragma: keep
// _(s) is already wgettext(s).c_str(), so let's not convert back to wcstring
@@ -64,8 +65,7 @@ tok_t tokenizer_t::call_error(tokenizer_error_t error_type, const wchar_t *token
this->has_next = false;
tok_t result;
result.type = TOK_ERROR;
tok_t result{token_type_t::error};
result.error = error_type;
result.offset = token_start - this->start;
result.length = this->buff - token_start;
@@ -81,15 +81,7 @@ tokenizer_t::tokenizer_t(const wchar_t *start, tok_flags_t flags) : buff(start),
this->show_blank_lines = static_cast<bool>(flags & TOK_SHOW_BLANK_LINES);
}
bool tokenizer_t::next(struct tok_t *result) {
assert(result != NULL);
maybe_t<tok_t> tok = this->tok_next();
if (!tok) {
return false;
}
*result = std::move(*tok);
return true;
}
tok_t::tok_t(token_type_t type) : type(type) {}
/// Tests if this character can be a part of a string. The redirect ^ is allowed unless it's the
/// first character. Hash (#) starts a comment if it's the first character in a token; otherwise it
@@ -252,31 +244,30 @@ tok_t tokenizer_t::read_string() {
}
if ((!this->accept_unfinished) && (mode != tok_modes::regular_text)) {
tok_t error;
if ((mode & tok_modes::char_escape) == tok_modes::char_escape) {
error = this->call_error(tokenizer_error_t::unterminated_escape, buff_start,
this->buff - 1);
} else if ((mode & tok_modes::array_brackets) == tok_modes::array_brackets) {
error = this->call_error(tokenizer_error_t::unterminated_slice, buff_start,
this->start + slice_offset);
} else if ((mode & tok_modes::subshell) == tok_modes::subshell) {
if (mode & tok_modes::char_escape) {
return this->call_error(tokenizer_error_t::unterminated_escape, buff_start,
this->buff - 1);
} else if (mode & tok_modes::array_brackets) {
return this->call_error(tokenizer_error_t::unterminated_slice, buff_start,
this->start + slice_offset);
} else if (mode & tok_modes::subshell) {
assert(paran_offsets.size() > 0);
size_t offset_of_open_paran = paran_offsets.back();
error = this->call_error(tokenizer_error_t::unterminated_subshell, buff_start,
this->start + offset_of_open_paran);
} else if ((mode & tok_modes::curly_braces) == tok_modes::curly_braces) {
return this->call_error(tokenizer_error_t::unterminated_subshell, buff_start,
this->start + offset_of_open_paran);
} else if (mode & tok_modes::curly_braces) {
assert(brace_offsets.size() > 0);
size_t offset_of_open_brace = brace_offsets.back();
error = this->call_error(tokenizer_error_t::unterminated_brace, buff_start,
this->start + offset_of_open_brace);
return this->call_error(tokenizer_error_t::unterminated_brace, buff_start,
this->start + offset_of_open_brace);
} else {
DIE("Unknown non-regular-text mode");
}
return error;
}
tok_t result;
result.type = TOK_STRING;
tok_t result(token_type_t::string);
result.offset = buff_start - this->start;
result.length = this->buff - buff_start;
return result;
@@ -289,7 +280,7 @@ struct parsed_redir_or_pipe_t {
size_t consumed{0};
// The token type, always either token_type_t::pipe or token_type_t::redirect.
token_type type{TOK_REDIRECT};
token_type_t type{token_type_t::redirect};
// The redirection mode if the type is token_type_t::redirect.
redirection_type_t redirection_mode{redirection_type_t::overwrite};
@@ -373,7 +364,7 @@ static maybe_t<parsed_redir_or_pipe_t> read_redirection_or_fd_pipe(const wchar_t
} else if (opt_char == L'|') {
// So the string looked like '2>|'. This is not a redirection - it's a pipe! That gets
// handled elsewhere.
result.type = TOK_PIPE;
result.type = token_type_t::pipe;
idx++;
}
@@ -384,7 +375,7 @@ static maybe_t<parsed_redir_or_pipe_t> read_redirection_or_fd_pipe(const wchar_t
maybe_t<redirection_type_t> redirection_type_for_string(const wcstring &str, int *out_fd) {
auto v = read_redirection_or_fd_pipe(str.c_str());
// Redirections only, no pipes.
if (!v || v->type != TOK_REDIRECT || v->fd < 0) return none();
if (!v || v->type != token_type_t::redirect || v->fd < 0) return none();
if (out_fd) *out_fd = v->fd;
return v->redirection_mode;
}
@@ -395,7 +386,7 @@ int fd_redirected_by_pipe(const wcstring &str) {
return STDOUT_FILENO;
}
auto v = read_redirection_or_fd_pipe(str.c_str());
return (v && v->type == TOK_PIPE) ? v->fd : -1;
return (v && v->type == token_type_t::pipe) ? v->fd : -1;
}
int oflags_for_redirection_type(redirection_type_t type) {
@@ -434,7 +425,7 @@ static bool iswspace_not_nl(wchar_t c) {
}
}
maybe_t<tok_t> tokenizer_t::tok_next() {
maybe_t<tok_t> tokenizer_t::next() {
if (!this->has_next) {
return none();
}
@@ -464,8 +455,7 @@ maybe_t<tok_t> tokenizer_t::tok_next() {
// Maybe return the comment.
if (this->show_comments) {
tok_t result;
result.type = TOK_COMMENT;
tok_t result(token_type_t::comment);
result.offset = comment_start - this->start;
result.length = comment_len;
result.preceding_escaped_nl = preceding_escaped_nl;
@@ -476,10 +466,9 @@ maybe_t<tok_t> tokenizer_t::tok_next() {
// We made it past the comments and ate any trailing newlines we wanted to ignore.
this->continue_line_after_comment = false;
size_t start_pos = this->buff - this->start;
const size_t start_pos = this->buff - this->start;
tok_t result;
result.offset = start_pos;
maybe_t<tok_t> result{};
switch (*this->buff) {
case L'\0': {
this->has_next = false;
@@ -488,8 +477,9 @@ maybe_t<tok_t> tokenizer_t::tok_next() {
case L'\r': // carriage-return
case L'\n': // newline
case L';': {
result.type = TOK_END;
result.length = 1;
result.emplace(token_type_t::end);
result->offset = start_pos;
result->length = 1;
this->buff++;
// Hack: when we get a newline, swallow as many as we can. This compresses multiple
// subsequent newlines into a single one.
@@ -503,25 +493,29 @@ maybe_t<tok_t> tokenizer_t::tok_next() {
}
case L'&': {
if (this->buff[1] == L'&') {
result.type = TOK_ANDAND;
result.length = 2;
result.emplace(token_type_t::andand);
result->offset = start_pos;
result->length = 2;
this->buff += 2;
} else {
result.type = TOK_BACKGROUND;
result.length = 1;
result.emplace(token_type_t::background);
result->offset = start_pos;
result->length = 1;
this->buff++;
}
break;
}
case L'|': {
if (this->buff[1] == L'|') {
result.type = TOK_OROR;
result.length = 2;
result.emplace(token_type_t::oror);
result->offset = start_pos;
result->length = 2;
this->buff += 2;
} else {
result.type = TOK_PIPE;
result.redirected_fd = 1;
result.length = 1;
result.emplace(token_type_t::pipe);
result->redirected_fd = 1;
result->offset = start_pos;
result->length = 1;
this->buff++;
}
break;
@@ -535,9 +529,10 @@ maybe_t<tok_t> tokenizer_t::tok_next() {
return this->call_error(tokenizer_error_t::invalid_redirect, this->buff,
this->buff);
}
result.type = redir_or_pipe->type;
result.redirected_fd = redir_or_pipe->fd;
result.length = redir_or_pipe->consumed;
result.emplace(redir_or_pipe->type);
result->offset = start_pos;
result->redirected_fd = redir_or_pipe->fd;
result->length = redir_or_pipe->consumed;
this->buff += redir_or_pipe->consumed;
break;
}
@@ -553,13 +548,14 @@ maybe_t<tok_t> tokenizer_t::tok_next() {
// It looks like a redirection or a pipe. But we don't support piping fd 0. Note
// that fd 0 may be -1, indicating overflow; but we don't treat that as a tokenizer
// error.
if (redir_or_pipe->type == TOK_PIPE && redir_or_pipe->fd == 0) {
if (redir_or_pipe->type == token_type_t::pipe && redir_or_pipe->fd == 0) {
return this->call_error(tokenizer_error_t::invalid_pipe, error_location,
error_location);
}
result.type = redir_or_pipe->type;
result.redirected_fd = redir_or_pipe->fd;
result.length = redir_or_pipe->consumed;
result.emplace(redir_or_pipe->type);
result->redirected_fd = redir_or_pipe->fd;
result->offset = start_pos;
result->length = redir_or_pipe->consumed;
this->buff += redir_or_pipe->consumed;
} else {
// Not a redirection or pipe, so just a string.
@@ -568,15 +564,17 @@ maybe_t<tok_t> tokenizer_t::tok_next() {
break;
}
}
result.preceding_escaped_nl = preceding_escaped_nl;
assert(result.has_value() && "Should have a token");
result->preceding_escaped_nl = preceding_escaped_nl;
return result;
}
wcstring tok_first(const wcstring &str) {
tokenizer_t t(str.c_str(), 0);
tok_t token;
if (t.next(&token) && token.type == TOK_STRING) {
return t.text_of(token);
if (auto token = t.next()) {
if (token->type == token_type_t::string) {
return t.text_of(*token);
}
}
return {};
}

View File

@@ -10,17 +10,16 @@
#include "parse_constants.h"
/// Token types.
enum token_type {
TOK_NONE, /// Tokenizer not yet constructed
TOK_ERROR, /// Error reading token
TOK_STRING, /// String token
TOK_PIPE, /// Pipe token
TOK_ANDAND, /// && token
TOK_OROR, /// || token
TOK_END, /// End token (semicolon or newline, not literal end)
TOK_REDIRECT, /// redirection token
TOK_BACKGROUND, /// send job to bg token
TOK_COMMENT /// comment token
enum class token_type_t {
error, /// Error reading token
string, /// String token
pipe, /// Pipe token
andand, /// && token
oror, /// || token
end, /// End token (semicolon or newline, not literal end)
redirect, /// redirection token
background, /// send job to bg token
comment, /// comment token
};
enum class redirection_type_t {
@@ -65,7 +64,7 @@ const wchar_t *tokenizer_get_error_message(tokenizer_error_t err);
struct tok_t {
// The type of the token.
token_type type{TOK_NONE};
token_type_t type;
// Offset of the token.
size_t offset{0};
@@ -85,7 +84,8 @@ struct tok_t {
// at 'offset'.
size_t error_offset{size_t(-1)};
tok_t() = default;
// Construct from a token type.
explicit tok_t(token_type_t type);
};
/// The tokenizer struct.
@@ -112,7 +112,6 @@ class tokenizer_t {
tok_t call_error(tokenizer_error_t error_type, const wchar_t *token_start,
const wchar_t *error_loc);
tok_t read_string();
maybe_t<tok_t> tok_next();
public:
/// Constructor for a tokenizer. b is the string that is to be tokenized. It is not copied, and
@@ -124,8 +123,8 @@ class tokenizer_t {
/// token. Setting TOK_SHOW_COMMENTS will return comments as tokens
tokenizer_t(const wchar_t *b, tok_flags_t flags);
/// Returns the next token by reference. Returns true if we got one, false if we're at the end.
bool next(struct tok_t *result);
/// Returns the next token, or none() if we are at the end.
maybe_t<tok_t> next();
/// Returns the text of a token, as a string.
wcstring text_of(const tok_t &tok) const { return wcstring(start + tok.offset, tok.length); }