mirror of
https://github.com/fish-shell/fish-shell.git
synced 2026-05-23 04:51:16 -03:00
Clean up some tokenization
Remove TOK_NONE. Turn token_type into an enum class. Make next() return a maybe_t<tok_t> instead of a bool.
This commit is contained in:
@@ -100,12 +100,11 @@ static void write_part(const wchar_t *begin, const wchar_t *end, int cut_at_curs
|
||||
wcstring out;
|
||||
wcstring buff(begin, end - begin);
|
||||
tokenizer_t tok(buff.c_str(), TOK_ACCEPT_UNFINISHED);
|
||||
tok_t token;
|
||||
while (tok.next(&token)) {
|
||||
if ((cut_at_cursor) && (token.offset + token.length >= pos)) break;
|
||||
while (auto token = tok.next()) {
|
||||
if ((cut_at_cursor) && (token->offset + token->length >= pos)) break;
|
||||
|
||||
if (token.type == TOK_STRING) {
|
||||
wcstring tmp = tok.text_of(token);
|
||||
if (token->type == token_type_t::string) {
|
||||
wcstring tmp = tok.text_of(*token);
|
||||
unescape_string_in_place(&tmp, UNESCAPE_INCOMPLETE);
|
||||
out.append(tmp);
|
||||
out.push_back(L'\n');
|
||||
|
||||
@@ -39,6 +39,12 @@ static void sigint_parent() {
|
||||
fprintf(stderr, "Sent SIGINT to %d\n", parent);
|
||||
}
|
||||
|
||||
/// Write the line "stdout" to stdout and the line "stderr" to stderr, then flush.
static void print_stdout_stderr() {
|
||||
fprintf(stdout, "stdout\n");
|
||||
fprintf(stderr, "stderr\n");
|
||||
// A null stream argument asks fflush to flush every open output stream.
fflush(nullptr);
|
||||
}
|
||||
|
||||
int main(int argc, char *argv[]) {
|
||||
if (argc <= 1) {
|
||||
fprintf(stderr, "No commands given.\n");
|
||||
@@ -51,6 +57,8 @@ int main(int argc, char *argv[]) {
|
||||
report_foreground();
|
||||
} else if (!strcmp(argv[i], "sigint_parent")) {
|
||||
sigint_parent();
|
||||
} else if (!strcmp(argv[i], "print_stdout_stderr")) {
|
||||
print_stdout_stderr();
|
||||
} else {
|
||||
fprintf(stderr, "%s: Unknown command: %s\n", argv[0], argv[i]);
|
||||
return EXIT_FAILURE;
|
||||
|
||||
@@ -564,29 +564,27 @@ static void test_convert_nulls() {
|
||||
/// Test the tokenizer.
|
||||
static void test_tokenizer() {
|
||||
say(L"Testing tokenizer");
|
||||
tok_t token;
|
||||
|
||||
{
|
||||
bool got = false;
|
||||
const wchar_t *str = L"alpha beta";
|
||||
tokenizer_t t(str, 0);
|
||||
maybe_t<tok_t> token{};
|
||||
|
||||
got = t.next(&token); // alpha
|
||||
do_test(got);
|
||||
do_test(token.type == TOK_STRING);
|
||||
do_test(token.offset == 0);
|
||||
do_test(token.length == 5);
|
||||
do_test(t.text_of(token) == L"alpha");
|
||||
token = t.next(); // alpha
|
||||
do_test(token.has_value());
|
||||
do_test(token->type == token_type_t::string);
|
||||
do_test(token->offset == 0);
|
||||
do_test(token->length == 5);
|
||||
do_test(t.text_of(*token) == L"alpha");
|
||||
|
||||
got = t.next(&token); // beta
|
||||
do_test(got);
|
||||
do_test(token.type == TOK_STRING);
|
||||
do_test(token.offset == 6);
|
||||
do_test(token.length == 4);
|
||||
do_test(t.text_of(token) == L"beta");
|
||||
token = t.next(); // beta
|
||||
do_test(token.has_value());
|
||||
do_test(token->type == token_type_t::string);
|
||||
do_test(token->offset == 6);
|
||||
do_test(token->length == 4);
|
||||
do_test(t.text_of(*token) == L"beta");
|
||||
|
||||
got = t.next(&token);
|
||||
do_test(!got);
|
||||
token = t.next();
|
||||
do_test(!token.has_value());
|
||||
}
|
||||
|
||||
const wchar_t *str =
|
||||
@@ -595,30 +593,31 @@ static void test_tokenizer() {
|
||||
L"&&& ||| "
|
||||
L"&& || & |"
|
||||
L"Compress_Newlines\n \n\t\n \nInto_Just_One";
|
||||
const int types[] = {TOK_STRING, TOK_REDIRECT, TOK_STRING, TOK_REDIRECT, TOK_STRING,
|
||||
TOK_STRING, TOK_STRING, TOK_REDIRECT, TOK_REDIRECT, TOK_STRING,
|
||||
TOK_ANDAND, TOK_BACKGROUND, TOK_OROR, TOK_PIPE, TOK_ANDAND,
|
||||
TOK_OROR, TOK_BACKGROUND, TOK_PIPE, TOK_STRING, TOK_END,
|
||||
TOK_STRING};
|
||||
using tt = token_type_t;
|
||||
const token_type_t types[] = {
|
||||
tt::string, tt::redirect, tt::string, tt::redirect, tt::string, tt::string,
|
||||
tt::string, tt::redirect, tt::redirect, tt::string, tt::andand, tt::background,
|
||||
tt::oror, tt::pipe, tt::andand, tt::oror, tt::background, tt::pipe,
|
||||
tt::string, tt::end, tt::string};
|
||||
|
||||
say(L"Test correct tokenization");
|
||||
|
||||
{
|
||||
tokenizer_t t(str, 0);
|
||||
size_t i = 0;
|
||||
while (t.next(&token)) {
|
||||
while (auto token = t.next()) {
|
||||
if (i >= sizeof types / sizeof *types) {
|
||||
err(L"Too many tokens returned from tokenizer");
|
||||
std::fwprintf(stdout, L"Got excess token type %ld\n", (long)token.type);
|
||||
std::fwprintf(stdout, L"Got excess token type %ld\n", (long)token->type);
|
||||
break;
|
||||
}
|
||||
if (types[i] != token.type) {
|
||||
if (types[i] != token->type) {
|
||||
err(L"Tokenization error:");
|
||||
std::fwprintf(
|
||||
stdout,
|
||||
L"Token number %zu of string \n'%ls'\n, expected type %ld, got token type "
|
||||
L"%ld\n",
|
||||
i + 1, str, (long)types[i], (long)token.type);
|
||||
i + 1, str, (long)types[i], (long)token->type);
|
||||
}
|
||||
i++;
|
||||
}
|
||||
@@ -630,37 +629,44 @@ static void test_tokenizer() {
|
||||
// Test some errors.
|
||||
{
|
||||
tokenizer_t t(L"abc\\", 0);
|
||||
do_test(t.next(&token));
|
||||
do_test(token.type == TOK_ERROR);
|
||||
do_test(token.error == tokenizer_error_t::unterminated_escape);
|
||||
do_test(token.error_offset == 3);
|
||||
auto token = t.next();
|
||||
do_test(token.has_value());
|
||||
do_test(token->type == token_type_t::error);
|
||||
do_test(token->error == tokenizer_error_t::unterminated_escape);
|
||||
do_test(token->error_offset == 3);
|
||||
}
|
||||
|
||||
{
|
||||
tokenizer_t t(L"abc )defg(hij", 0);
|
||||
do_test(t.next(&token));
|
||||
do_test(t.next(&token));
|
||||
do_test(token.type == TOK_ERROR);
|
||||
do_test(token.error == tokenizer_error_t::closing_unopened_subshell);
|
||||
do_test(token.error_offset == 4);
|
||||
auto token = t.next();
|
||||
do_test(token.has_value());
|
||||
token = t.next();
|
||||
do_test(token.has_value());
|
||||
do_test(token->type == token_type_t::error);
|
||||
do_test(token->error == tokenizer_error_t::closing_unopened_subshell);
|
||||
do_test(token->error_offset == 4);
|
||||
}
|
||||
|
||||
{
|
||||
tokenizer_t t(L"abc defg(hij (klm)", 0);
|
||||
do_test(t.next(&token));
|
||||
do_test(t.next(&token));
|
||||
do_test(token.type == TOK_ERROR);
|
||||
do_test(token.error == tokenizer_error_t::unterminated_subshell);
|
||||
do_test(token.error_offset == 4);
|
||||
auto token = t.next();
|
||||
do_test(token.has_value());
|
||||
token = t.next();
|
||||
do_test(token.has_value());
|
||||
do_test(token->type == token_type_t::error);
|
||||
do_test(token->error == tokenizer_error_t::unterminated_subshell);
|
||||
do_test(token->error_offset == 4);
|
||||
}
|
||||
|
||||
{
|
||||
tokenizer_t t(L"abc defg[hij (klm)", 0);
|
||||
do_test(t.next(&token));
|
||||
do_test(t.next(&token));
|
||||
do_test(token.type == TOK_ERROR);
|
||||
do_test(token.error == tokenizer_error_t::unterminated_slice);
|
||||
do_test(token.error_offset == 4);
|
||||
auto token = t.next();
|
||||
do_test(token.has_value());
|
||||
token = t.next();
|
||||
do_test(token.has_value());
|
||||
do_test(token->type == token_type_t::error);
|
||||
do_test(token->error == tokenizer_error_t::unterminated_slice);
|
||||
do_test(token->error_offset == 4);
|
||||
}
|
||||
|
||||
// Test redirection_type_for_string.
|
||||
|
||||
@@ -18,6 +18,7 @@
|
||||
#include "flog.h"
|
||||
#include "parse_constants.h"
|
||||
#include "parse_productions.h"
|
||||
#include "parse_tree.h"
|
||||
#include "proc.h"
|
||||
#include "tnode.h"
|
||||
#include "tokenizer.h"
|
||||
@@ -235,28 +236,25 @@ wcstring parse_token_t::user_presentable_description() const {
|
||||
|
||||
/// Convert from tokenizer_t's token type to a parse_token_t type.
|
||||
static inline parse_token_type_t parse_token_type_from_tokenizer_token(
|
||||
enum token_type tokenizer_token_type) {
|
||||
enum token_type_t tokenizer_token_type) {
|
||||
switch (tokenizer_token_type) {
|
||||
case TOK_NONE:
|
||||
DIE("TOK_NONE passed to parse_token_type_from_tokenizer_token");
|
||||
return token_type_invalid;
|
||||
case TOK_STRING:
|
||||
case token_type_t::string:
|
||||
return parse_token_type_string;
|
||||
case TOK_PIPE:
|
||||
case token_type_t::pipe:
|
||||
return parse_token_type_pipe;
|
||||
case TOK_ANDAND:
|
||||
case token_type_t::andand:
|
||||
return parse_token_type_andand;
|
||||
case TOK_OROR:
|
||||
case token_type_t::oror:
|
||||
return parse_token_type_oror;
|
||||
case TOK_END:
|
||||
case token_type_t::end:
|
||||
return parse_token_type_end;
|
||||
case TOK_BACKGROUND:
|
||||
case token_type_t::background:
|
||||
return parse_token_type_background;
|
||||
case TOK_REDIRECT:
|
||||
case token_type_t::redirect:
|
||||
return parse_token_type_redirection;
|
||||
case TOK_ERROR:
|
||||
case token_type_t::error:
|
||||
return parse_special_type_tokenizer_error;
|
||||
case TOK_COMMENT:
|
||||
case token_type_t::comment:
|
||||
return parse_special_type_comment;
|
||||
}
|
||||
FLOGF(error, L"Bad token type %d passed to %s", (int)tokenizer_token_type, __FUNCTION__);
|
||||
@@ -960,9 +958,9 @@ static bool is_keyword_char(wchar_t c) {
|
||||
}
|
||||
|
||||
/// Given a token, returns the keyword it matches, or parse_keyword_none.
|
||||
static parse_keyword_t keyword_for_token(token_type tok, const wcstring &token) {
|
||||
static parse_keyword_t keyword_for_token(token_type_t tok, const wcstring &token) {
|
||||
/* Only strings can be keywords */
|
||||
if (tok != TOK_STRING) {
|
||||
if (tok != token_type_t::string) {
|
||||
return parse_keyword_none;
|
||||
}
|
||||
|
||||
@@ -1009,32 +1007,35 @@ static inline bool is_help_argument(const wcstring &txt) {
|
||||
}
|
||||
|
||||
/// Return a new parse token, advancing the tokenizer.
|
||||
static inline parse_token_t next_parse_token(tokenizer_t *tok, tok_t *token, wcstring *storage) {
|
||||
if (!tok->next(token)) {
|
||||
static inline parse_token_t next_parse_token(tokenizer_t *tok, maybe_t<tok_t> *out_token,
|
||||
wcstring *storage) {
|
||||
*out_token = tok->next();
|
||||
if (!out_token->has_value()) {
|
||||
return kTerminalToken;
|
||||
}
|
||||
const tok_t &token = **out_token;
|
||||
|
||||
// Set the type, keyword, and whether there's a dash prefix. Note that this is quite sketchy,
|
||||
// because it ignores quotes. This is the historical behavior. For example, `builtin --names`
|
||||
// lists builtins, but `builtin "--names"` attempts to run --names as a command. Amazingly as of
|
||||
// this writing (10/12/13) nobody seems to have noticed this. Squint at it really hard and it
|
||||
// even starts to look like a feature.
|
||||
parse_token_t result{parse_token_type_from_tokenizer_token(token->type)};
|
||||
const wcstring &text = tok->copy_text_of(*token, storage);
|
||||
result.keyword = keyword_for_token(token->type, text);
|
||||
parse_token_t result{parse_token_type_from_tokenizer_token(token.type)};
|
||||
const wcstring &text = tok->copy_text_of(token, storage);
|
||||
result.keyword = keyword_for_token(token.type, text);
|
||||
result.has_dash_prefix = !text.empty() && text.at(0) == L'-';
|
||||
result.is_help_argument = result.has_dash_prefix && is_help_argument(text);
|
||||
result.is_newline = (result.type == parse_token_type_end && text == L"\n");
|
||||
result.preceding_escaped_nl = token->preceding_escaped_nl;
|
||||
result.preceding_escaped_nl = token.preceding_escaped_nl;
|
||||
|
||||
// These assertions are totally bogus. Basically our tokenizer works in size_t but we work in
|
||||
// uint32_t to save some space. If we have a source file larger than 4 GB, we'll probably just
|
||||
// crash.
|
||||
assert(token->offset < SOURCE_OFFSET_INVALID);
|
||||
result.source_start = (source_offset_t)token->offset;
|
||||
assert(token.offset < SOURCE_OFFSET_INVALID);
|
||||
result.source_start = (source_offset_t)token.offset;
|
||||
|
||||
assert(token->length <= SOURCE_OFFSET_INVALID);
|
||||
result.source_length = (source_offset_t)token->length;
|
||||
assert(token.length <= SOURCE_OFFSET_INVALID);
|
||||
result.source_length = (source_offset_t)token.length;
|
||||
|
||||
return result;
|
||||
}
|
||||
@@ -1063,7 +1064,7 @@ bool parse_tree_from_string(const wcstring &str, parse_tree_flags_t parse_flags,
|
||||
parse_token_t queue[2] = {kInvalidToken, kInvalidToken};
|
||||
|
||||
// Loop until we have a terminal token.
|
||||
tok_t tokenizer_token;
|
||||
maybe_t<tok_t> tokenizer_token{};
|
||||
for (size_t token_count = 0; queue[0].type != parse_token_type_terminate; token_count++) {
|
||||
// Push a new token onto the queue.
|
||||
queue[0] = queue[1];
|
||||
@@ -1084,7 +1085,7 @@ bool parse_tree_from_string(const wcstring &str, parse_tree_flags_t parse_flags,
|
||||
// Handle tokenizer errors. This is a hack because really the parser should report this for
|
||||
// itself; but it has no way of getting the tokenizer message.
|
||||
if (queue[1].type == parse_special_type_tokenizer_error) {
|
||||
parser.report_tokenizer_error(tokenizer_token);
|
||||
parser.report_tokenizer_error(*tokenizer_token);
|
||||
}
|
||||
|
||||
if (!parser.has_fatal_error()) {
|
||||
|
||||
@@ -20,6 +20,7 @@
|
||||
#include "fallback.h" // IWYU pragma: keep
|
||||
#include "future_feature_flags.h"
|
||||
#include "parse_constants.h"
|
||||
#include "parse_util.h"
|
||||
#include "parser.h"
|
||||
#include "tnode.h"
|
||||
#include "tokenizer.h"
|
||||
@@ -310,32 +311,32 @@ static void job_or_process_extent(const wchar_t *buff, size_t cursor_pos, const
|
||||
assert(buffcpy != NULL);
|
||||
|
||||
tokenizer_t tok(buffcpy, TOK_ACCEPT_UNFINISHED);
|
||||
tok_t token;
|
||||
while (tok.next(&token) && !finished) {
|
||||
size_t tok_begin = token.offset;
|
||||
for (maybe_t<tok_t> token = tok.next(); token && !finished; token = tok.next())
|
||||
while ((token = tok.next()) && !finished) {
|
||||
size_t tok_begin = token->offset;
|
||||
|
||||
switch (token.type) {
|
||||
case TOK_PIPE: {
|
||||
if (!process) {
|
||||
switch (token->type) {
|
||||
case token_type_t::pipe: {
|
||||
if (!process) {
|
||||
break;
|
||||
}
|
||||
}
|
||||
/* FALLTHROUGH */
|
||||
case token_type_t::end:
|
||||
case token_type_t::background: {
|
||||
if (tok_begin >= pos) {
|
||||
finished = 1;
|
||||
if (b) *b = (wchar_t *)begin + tok_begin;
|
||||
} else {
|
||||
if (a) *a = (wchar_t *)begin + tok_begin + 1;
|
||||
}
|
||||
break;
|
||||
}
|
||||
default: {
|
||||
break;
|
||||
}
|
||||
}
|
||||
/* FALLTHROUGH */
|
||||
case TOK_END:
|
||||
case TOK_BACKGROUND: {
|
||||
if (tok_begin >= pos) {
|
||||
finished = 1;
|
||||
if (b) *b = (wchar_t *)begin + tok_begin;
|
||||
} else {
|
||||
if (a) *a = (wchar_t *)begin + tok_begin + 1;
|
||||
}
|
||||
break;
|
||||
}
|
||||
default: {
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
free(buffcpy);
|
||||
}
|
||||
@@ -380,14 +381,13 @@ void parse_util_token_extent(const wchar_t *buff, size_t cursor_pos, const wchar
|
||||
const wcstring buffcpy = wcstring(cmdsubst_begin, cmdsubst_end - cmdsubst_begin);
|
||||
|
||||
tokenizer_t tok(buffcpy.c_str(), TOK_ACCEPT_UNFINISHED);
|
||||
tok_t token;
|
||||
while (tok.next(&token)) {
|
||||
size_t tok_begin = token.offset;
|
||||
while (maybe_t<tok_t> token = tok.next()) {
|
||||
size_t tok_begin = token->offset;
|
||||
size_t tok_end = tok_begin;
|
||||
|
||||
// Calculate end of token.
|
||||
if (token.type == TOK_STRING) {
|
||||
tok_end += token.length;
|
||||
if (token->type == token_type_t::string) {
|
||||
tok_end += token->length;
|
||||
}
|
||||
|
||||
// Cursor was before beginning of this token, means that the cursor is between two tokens,
|
||||
@@ -399,16 +399,16 @@ void parse_util_token_extent(const wchar_t *buff, size_t cursor_pos, const wchar
|
||||
|
||||
// If cursor is inside the token, this is the token we are looking for. If so, set a and b
|
||||
// and break.
|
||||
if (token.type == TOK_STRING && tok_end >= offset_within_cmdsubst) {
|
||||
a = cmdsubst_begin + token.offset;
|
||||
b = a + token.length;
|
||||
if (token->type == token_type_t::string && tok_end >= offset_within_cmdsubst) {
|
||||
a = cmdsubst_begin + token->offset;
|
||||
b = a + token->length;
|
||||
break;
|
||||
}
|
||||
|
||||
// Remember previous string token.
|
||||
if (token.type == TOK_STRING) {
|
||||
pa = cmdsubst_begin + token.offset;
|
||||
pb = pa + token.length;
|
||||
if (token->type == token_type_t::string) {
|
||||
pa = cmdsubst_begin + token->offset;
|
||||
pb = pa + token->length;
|
||||
}
|
||||
}
|
||||
|
||||
@@ -482,21 +482,20 @@ static wchar_t get_quote(const wcstring &cmd_str, size_t len) {
|
||||
}
|
||||
|
||||
void parse_util_get_parameter_info(const wcstring &cmd, const size_t pos, wchar_t *quote,
|
||||
size_t *offset, enum token_type *out_type) {
|
||||
size_t *offset, token_type_t *out_type) {
|
||||
size_t prev_pos = 0;
|
||||
wchar_t last_quote = L'\0';
|
||||
|
||||
tokenizer_t tok(cmd.c_str(), TOK_ACCEPT_UNFINISHED);
|
||||
tok_t token;
|
||||
while (tok.next(&token)) {
|
||||
if (token.offset > pos) break;
|
||||
while (auto token = tok.next()) {
|
||||
if (token->offset > pos) break;
|
||||
|
||||
if (token.type == TOK_STRING)
|
||||
last_quote = get_quote(tok.text_of(token), pos - token.offset);
|
||||
if (token->type == token_type_t::string)
|
||||
last_quote = get_quote(tok.text_of(*token), pos - token->offset);
|
||||
|
||||
if (out_type != NULL) *out_type = token.type;
|
||||
if (out_type != NULL) *out_type = token->type;
|
||||
|
||||
prev_pos = token.offset;
|
||||
prev_pos = token->offset;
|
||||
}
|
||||
|
||||
wchar_t *cmd_tmp = wcsdup(cmd.c_str());
|
||||
|
||||
@@ -110,7 +110,7 @@ bool parse_util_argument_is_help(const wchar_t *s);
|
||||
/// \param offset If not NULL, get_param will store the offset to the beginning of the parameter.
|
||||
/// \param out_type If not NULL, get_param will store the token type.
|
||||
void parse_util_get_parameter_info(const wcstring &cmd, const size_t pos, wchar_t *quote,
|
||||
size_t *offset, enum token_type *out_type);
|
||||
size_t *offset, token_type_t *out_type);
|
||||
|
||||
/// Attempts to escape the string 'cmd' using the given quote type, as determined by the quote
|
||||
/// character. The quote can be a single quote or double quote, or L'\0' to indicate no quoting (and
|
||||
|
||||
@@ -199,12 +199,11 @@ class reader_history_search_t {
|
||||
} else if (mode_ == token) {
|
||||
const wcstring &needle = search_string();
|
||||
tokenizer_t tok(text.c_str(), TOK_ACCEPT_UNFINISHED);
|
||||
tok_t token;
|
||||
|
||||
wcstring_list_t local_tokens;
|
||||
while (tok.next(&token)) {
|
||||
if (token.type != TOK_STRING) continue;
|
||||
wcstring text = tok.text_of(token);
|
||||
while (auto token = tok.next()) {
|
||||
if (token->type != token_type_t::string) continue;
|
||||
wcstring text = tok.text_of(*token);
|
||||
if (text.find(needle) != wcstring::npos) {
|
||||
local_tokens.emplace_back(std::move(text));
|
||||
}
|
||||
@@ -2346,11 +2345,11 @@ static wchar_t unescaped_quote(const wcstring &str, size_t pos) {
|
||||
/// Returns true if the last token is a comment.
|
||||
static bool text_ends_in_comment(const wcstring &text) {
|
||||
tokenizer_t tok(text.c_str(), TOK_ACCEPT_UNFINISHED | TOK_SHOW_COMMENTS);
|
||||
tok_t token;
|
||||
while (tok.next(&token)) {
|
||||
; // pass
|
||||
bool is_comment = false;
|
||||
while (auto token = tok.next()) {
|
||||
is_comment = token->type == token_type_t::comment;
|
||||
}
|
||||
return token.type == TOK_COMMENT;
|
||||
return is_comment;
|
||||
}
|
||||
|
||||
/// \return true if an event is a normal character that should be inserted into the buffer.
|
||||
|
||||
@@ -16,6 +16,7 @@
|
||||
#include "common.h"
|
||||
#include "fallback.h" // IWYU pragma: keep
|
||||
#include "future_feature_flags.h"
|
||||
#include "tokenizer.h"
|
||||
#include "wutil.h" // IWYU pragma: keep
|
||||
|
||||
// _(s) is already wgettext(s).c_str(), so let's not convert back to wcstring
|
||||
@@ -64,8 +65,7 @@ tok_t tokenizer_t::call_error(tokenizer_error_t error_type, const wchar_t *token
|
||||
|
||||
this->has_next = false;
|
||||
|
||||
tok_t result;
|
||||
result.type = TOK_ERROR;
|
||||
tok_t result{token_type_t::error};
|
||||
result.error = error_type;
|
||||
result.offset = token_start - this->start;
|
||||
result.length = this->buff - token_start;
|
||||
@@ -81,15 +81,7 @@ tokenizer_t::tokenizer_t(const wchar_t *start, tok_flags_t flags) : buff(start),
|
||||
this->show_blank_lines = static_cast<bool>(flags & TOK_SHOW_BLANK_LINES);
|
||||
}
|
||||
|
||||
bool tokenizer_t::next(struct tok_t *result) {
|
||||
assert(result != NULL);
|
||||
maybe_t<tok_t> tok = this->tok_next();
|
||||
if (!tok) {
|
||||
return false;
|
||||
}
|
||||
*result = std::move(*tok);
|
||||
return true;
|
||||
}
|
||||
tok_t::tok_t(token_type_t type) : type(type) {}
|
||||
|
||||
/// Tests if this character can be a part of a string. The redirect ^ is allowed unless it's the
|
||||
/// first character. Hash (#) starts a comment if it's the first character in a token; otherwise it
|
||||
@@ -252,31 +244,30 @@ tok_t tokenizer_t::read_string() {
|
||||
}
|
||||
|
||||
if ((!this->accept_unfinished) && (mode != tok_modes::regular_text)) {
|
||||
tok_t error;
|
||||
if ((mode & tok_modes::char_escape) == tok_modes::char_escape) {
|
||||
error = this->call_error(tokenizer_error_t::unterminated_escape, buff_start,
|
||||
this->buff - 1);
|
||||
} else if ((mode & tok_modes::array_brackets) == tok_modes::array_brackets) {
|
||||
error = this->call_error(tokenizer_error_t::unterminated_slice, buff_start,
|
||||
this->start + slice_offset);
|
||||
} else if ((mode & tok_modes::subshell) == tok_modes::subshell) {
|
||||
if (mode & tok_modes::char_escape) {
|
||||
return this->call_error(tokenizer_error_t::unterminated_escape, buff_start,
|
||||
this->buff - 1);
|
||||
} else if (mode & tok_modes::array_brackets) {
|
||||
return this->call_error(tokenizer_error_t::unterminated_slice, buff_start,
|
||||
this->start + slice_offset);
|
||||
} else if (mode & tok_modes::subshell) {
|
||||
assert(paran_offsets.size() > 0);
|
||||
size_t offset_of_open_paran = paran_offsets.back();
|
||||
|
||||
error = this->call_error(tokenizer_error_t::unterminated_subshell, buff_start,
|
||||
this->start + offset_of_open_paran);
|
||||
} else if ((mode & tok_modes::curly_braces) == tok_modes::curly_braces) {
|
||||
return this->call_error(tokenizer_error_t::unterminated_subshell, buff_start,
|
||||
this->start + offset_of_open_paran);
|
||||
} else if (mode & tok_modes::curly_braces) {
|
||||
assert(brace_offsets.size() > 0);
|
||||
size_t offset_of_open_brace = brace_offsets.back();
|
||||
|
||||
error = this->call_error(tokenizer_error_t::unterminated_brace, buff_start,
|
||||
this->start + offset_of_open_brace);
|
||||
return this->call_error(tokenizer_error_t::unterminated_brace, buff_start,
|
||||
this->start + offset_of_open_brace);
|
||||
} else {
|
||||
DIE("Unknown non-regular-text mode");
|
||||
}
|
||||
return error;
|
||||
}
|
||||
|
||||
tok_t result;
|
||||
result.type = TOK_STRING;
|
||||
tok_t result(token_type_t::string);
|
||||
result.offset = buff_start - this->start;
|
||||
result.length = this->buff - buff_start;
|
||||
return result;
|
||||
@@ -289,7 +280,7 @@ struct parsed_redir_or_pipe_t {
|
||||
size_t consumed{0};
|
||||
|
||||
// The token type, always either token_type_t::pipe or token_type_t::redirect.
|
||||
token_type type{TOK_REDIRECT};
|
||||
token_type_t type{token_type_t::redirect};
|
||||
|
||||
// The redirection mode if the type is token_type_t::redirect.
|
||||
redirection_type_t redirection_mode{redirection_type_t::overwrite};
|
||||
@@ -373,7 +364,7 @@ static maybe_t<parsed_redir_or_pipe_t> read_redirection_or_fd_pipe(const wchar_t
|
||||
} else if (opt_char == L'|') {
|
||||
// So the string looked like '2>|'. This is not a redirection - it's a pipe! That gets
|
||||
// handled elsewhere.
|
||||
result.type = TOK_PIPE;
|
||||
result.type = token_type_t::pipe;
|
||||
idx++;
|
||||
}
|
||||
|
||||
@@ -384,7 +375,7 @@ static maybe_t<parsed_redir_or_pipe_t> read_redirection_or_fd_pipe(const wchar_t
|
||||
maybe_t<redirection_type_t> redirection_type_for_string(const wcstring &str, int *out_fd) {
|
||||
auto v = read_redirection_or_fd_pipe(str.c_str());
|
||||
// Redirections only, no pipes.
|
||||
if (!v || v->type != TOK_REDIRECT || v->fd < 0) return none();
|
||||
if (!v || v->type != token_type_t::redirect || v->fd < 0) return none();
|
||||
if (out_fd) *out_fd = v->fd;
|
||||
return v->redirection_mode;
|
||||
}
|
||||
@@ -395,7 +386,7 @@ int fd_redirected_by_pipe(const wcstring &str) {
|
||||
return STDOUT_FILENO;
|
||||
}
|
||||
auto v = read_redirection_or_fd_pipe(str.c_str());
|
||||
return (v && v->type == TOK_PIPE) ? v->fd : -1;
|
||||
return (v && v->type == token_type_t::pipe) ? v->fd : -1;
|
||||
}
|
||||
|
||||
int oflags_for_redirection_type(redirection_type_t type) {
|
||||
@@ -434,7 +425,7 @@ static bool iswspace_not_nl(wchar_t c) {
|
||||
}
|
||||
}
|
||||
|
||||
maybe_t<tok_t> tokenizer_t::tok_next() {
|
||||
maybe_t<tok_t> tokenizer_t::next() {
|
||||
if (!this->has_next) {
|
||||
return none();
|
||||
}
|
||||
@@ -464,8 +455,7 @@ maybe_t<tok_t> tokenizer_t::tok_next() {
|
||||
|
||||
// Maybe return the comment.
|
||||
if (this->show_comments) {
|
||||
tok_t result;
|
||||
result.type = TOK_COMMENT;
|
||||
tok_t result(token_type_t::comment);
|
||||
result.offset = comment_start - this->start;
|
||||
result.length = comment_len;
|
||||
result.preceding_escaped_nl = preceding_escaped_nl;
|
||||
@@ -476,10 +466,9 @@ maybe_t<tok_t> tokenizer_t::tok_next() {
|
||||
|
||||
// We made it past the comments and ate any trailing newlines we wanted to ignore.
|
||||
this->continue_line_after_comment = false;
|
||||
size_t start_pos = this->buff - this->start;
|
||||
const size_t start_pos = this->buff - this->start;
|
||||
|
||||
tok_t result;
|
||||
result.offset = start_pos;
|
||||
maybe_t<tok_t> result{};
|
||||
switch (*this->buff) {
|
||||
case L'\0': {
|
||||
this->has_next = false;
|
||||
@@ -488,8 +477,9 @@ maybe_t<tok_t> tokenizer_t::tok_next() {
|
||||
case L'\r': // carriage-return
|
||||
case L'\n': // newline
|
||||
case L';': {
|
||||
result.type = TOK_END;
|
||||
result.length = 1;
|
||||
result.emplace(token_type_t::end);
|
||||
result->offset = start_pos;
|
||||
result->length = 1;
|
||||
this->buff++;
|
||||
// Hack: when we get a newline, swallow as many as we can. This compresses multiple
|
||||
// subsequent newlines into a single one.
|
||||
@@ -503,25 +493,29 @@ maybe_t<tok_t> tokenizer_t::tok_next() {
|
||||
}
|
||||
case L'&': {
|
||||
if (this->buff[1] == L'&') {
|
||||
result.type = TOK_ANDAND;
|
||||
result.length = 2;
|
||||
result.emplace(token_type_t::andand);
|
||||
result->offset = start_pos;
|
||||
result->length = 2;
|
||||
this->buff += 2;
|
||||
} else {
|
||||
result.type = TOK_BACKGROUND;
|
||||
result.length = 1;
|
||||
result.emplace(token_type_t::background);
|
||||
result->offset = start_pos;
|
||||
result->length = 1;
|
||||
this->buff++;
|
||||
}
|
||||
break;
|
||||
}
|
||||
case L'|': {
|
||||
if (this->buff[1] == L'|') {
|
||||
result.type = TOK_OROR;
|
||||
result.length = 2;
|
||||
result.emplace(token_type_t::oror);
|
||||
result->offset = start_pos;
|
||||
result->length = 2;
|
||||
this->buff += 2;
|
||||
} else {
|
||||
result.type = TOK_PIPE;
|
||||
result.redirected_fd = 1;
|
||||
result.length = 1;
|
||||
result.emplace(token_type_t::pipe);
|
||||
result->redirected_fd = 1;
|
||||
result->offset = start_pos;
|
||||
result->length = 1;
|
||||
this->buff++;
|
||||
}
|
||||
break;
|
||||
@@ -535,9 +529,10 @@ maybe_t<tok_t> tokenizer_t::tok_next() {
|
||||
return this->call_error(tokenizer_error_t::invalid_redirect, this->buff,
|
||||
this->buff);
|
||||
}
|
||||
result.type = redir_or_pipe->type;
|
||||
result.redirected_fd = redir_or_pipe->fd;
|
||||
result.length = redir_or_pipe->consumed;
|
||||
result.emplace(redir_or_pipe->type);
|
||||
result->offset = start_pos;
|
||||
result->redirected_fd = redir_or_pipe->fd;
|
||||
result->length = redir_or_pipe->consumed;
|
||||
this->buff += redir_or_pipe->consumed;
|
||||
break;
|
||||
}
|
||||
@@ -553,13 +548,14 @@ maybe_t<tok_t> tokenizer_t::tok_next() {
|
||||
// It looks like a redirection or a pipe. But we don't support piping fd 0. Note
|
||||
// that fd 0 may be -1, indicating overflow; but we don't treat that as a tokenizer
|
||||
// error.
|
||||
if (redir_or_pipe->type == TOK_PIPE && redir_or_pipe->fd == 0) {
|
||||
if (redir_or_pipe->type == token_type_t::pipe && redir_or_pipe->fd == 0) {
|
||||
return this->call_error(tokenizer_error_t::invalid_pipe, error_location,
|
||||
error_location);
|
||||
}
|
||||
result.type = redir_or_pipe->type;
|
||||
result.redirected_fd = redir_or_pipe->fd;
|
||||
result.length = redir_or_pipe->consumed;
|
||||
result.emplace(redir_or_pipe->type);
|
||||
result->redirected_fd = redir_or_pipe->fd;
|
||||
result->offset = start_pos;
|
||||
result->length = redir_or_pipe->consumed;
|
||||
this->buff += redir_or_pipe->consumed;
|
||||
} else {
|
||||
// Not a redirection or pipe, so just a string.
|
||||
@@ -568,15 +564,17 @@ maybe_t<tok_t> tokenizer_t::tok_next() {
|
||||
break;
|
||||
}
|
||||
}
|
||||
result.preceding_escaped_nl = preceding_escaped_nl;
|
||||
assert(result.has_value() && "Should have a token");
|
||||
result->preceding_escaped_nl = preceding_escaped_nl;
|
||||
return result;
|
||||
}
|
||||
|
||||
wcstring tok_first(const wcstring &str) {
|
||||
tokenizer_t t(str.c_str(), 0);
|
||||
tok_t token;
|
||||
if (t.next(&token) && token.type == TOK_STRING) {
|
||||
return t.text_of(token);
|
||||
if (auto token = t.next()) {
|
||||
if (token->type == token_type_t::string) {
|
||||
return t.text_of(*token);
|
||||
}
|
||||
}
|
||||
return {};
|
||||
}
|
||||
|
||||
@@ -10,17 +10,16 @@
|
||||
#include "parse_constants.h"
|
||||
|
||||
/// Token types.
|
||||
enum token_type {
|
||||
TOK_NONE, /// Tokenizer not yet constructed
|
||||
TOK_ERROR, /// Error reading token
|
||||
TOK_STRING, /// String token
|
||||
TOK_PIPE, /// Pipe token
|
||||
TOK_ANDAND, /// && token
|
||||
TOK_OROR, /// || token
|
||||
TOK_END, /// End token (semicolon or newline, not literal end)
|
||||
TOK_REDIRECT, /// redirection token
|
||||
TOK_BACKGROUND, /// send job to bg token
|
||||
TOK_COMMENT /// comment token
|
||||
enum class token_type_t {
|
||||
error, ///< Error reading token
|
||||
string, ///< String token
|
||||
pipe, ///< Pipe token
|
||||
andand, ///< && token
|
||||
oror, ///< || token
|
||||
end, ///< End token (semicolon or newline, not literal end)
|
||||
redirect, ///< redirection token
|
||||
background, ///< send job to bg token
|
||||
comment, ///< comment token
|
||||
};
|
||||
|
||||
enum class redirection_type_t {
|
||||
@@ -65,7 +64,7 @@ const wchar_t *tokenizer_get_error_message(tokenizer_error_t err);
|
||||
|
||||
struct tok_t {
|
||||
// The type of the token.
|
||||
token_type type{TOK_NONE};
|
||||
token_type_t type;
|
||||
|
||||
// Offset of the token.
|
||||
size_t offset{0};
|
||||
@@ -85,7 +84,8 @@ struct tok_t {
|
||||
// at 'offset'.
|
||||
size_t error_offset{size_t(-1)};
|
||||
|
||||
tok_t() = default;
|
||||
// Construct from a token type.
|
||||
explicit tok_t(token_type_t type);
|
||||
};
|
||||
|
||||
/// The tokenizer struct.
|
||||
@@ -112,7 +112,6 @@ class tokenizer_t {
|
||||
tok_t call_error(tokenizer_error_t error_type, const wchar_t *token_start,
|
||||
const wchar_t *error_loc);
|
||||
tok_t read_string();
|
||||
maybe_t<tok_t> tok_next();
|
||||
|
||||
public:
|
||||
/// Constructor for a tokenizer. b is the string that is to be tokenized. It is not copied, and
|
||||
@@ -124,8 +123,8 @@ class tokenizer_t {
|
||||
/// token. Setting TOK_SHOW_COMMENTS will return comments as tokens
|
||||
tokenizer_t(const wchar_t *b, tok_flags_t flags);
|
||||
|
||||
/// Returns the next token by reference. Returns true if we got one, false if we're at the end.
|
||||
bool next(struct tok_t *result);
|
||||
/// Returns the next token, or none() if we are at the end.
|
||||
maybe_t<tok_t> next();
|
||||
|
||||
/// Returns the text of a token, as a string.
|
||||
wcstring text_of(const tok_t &tok) const { return wcstring(start + tok.offset, tok.length); }
|
||||
|
||||
Reference in New Issue
Block a user