Adopt the new re in builtin_string

This switches builtin_string from using PCRE2 directly, to using the new re
component. This simplifies some code and removes redundancy.

No user-visible behavior change expected here.
This commit is contained in:
ridiculousfish
2022-07-09 11:08:36 -07:00
parent 7ae1727359
commit d46f402cea

View File

@@ -1,10 +1,6 @@
// Implementation of the string builtin.
#include "config.h" // IWYU pragma: keep
#define PCRE2_CODE_UNIT_WIDTH WCHAR_T_BITS
#ifdef _WIN32
#define PCRE2_STATIC
#endif
#include <algorithm>
#include <cerrno>
#include <climits>
@@ -29,12 +25,12 @@
#include "../future_feature_flags.h"
#include "../parse_util.h"
#include "../parser.h"
#include "../re.h"
#include "../screen.h"
#include "../wcstringutil.h"
#include "../wgetopt.h"
#include "../wildcard.h"
#include "../wutil.h" // IWYU pragma: keep
#include "pcre2.h"
// How many bytes we read() at once.
// Bash uses 128 here, so we do too (see READ_CHUNK_SIZE).
@@ -852,18 +848,15 @@ namespace {
class string_matcher_t {
protected:
const options_t opts;
io_streams_t &streams;
int total_matched{0};
public:
string_matcher_t(options_t opts_, io_streams_t &streams_)
: opts(std::move(opts_)), streams(streams_) {}
explicit string_matcher_t(const options_t &opts_) : opts(opts_) {}
virtual ~string_matcher_t() = default;
virtual bool report_matches(const wcstring &arg) = 0;
virtual bool report_matches(const wcstring &arg, io_streams_t &streams) = 0;
int match_count() const { return total_matched; }
virtual bool is_valid() const = 0;
virtual void clear_capture_vars() {}
};
@@ -872,9 +865,8 @@ class wildcard_matcher_t final : public string_matcher_t {
wcstring wcpattern;
public:
wildcard_matcher_t(const wchar_t * /*argv0*/, const wcstring &pattern, const options_t &opts,
io_streams_t &streams)
: string_matcher_t(opts, streams), wcpattern(parse_util_unescape_wildcards(pattern)) {
wildcard_matcher_t(const wcstring &pattern, const options_t &opts)
: string_matcher_t(opts), wcpattern(parse_util_unescape_wildcards(pattern)) {
if (opts.ignore_case) {
wcpattern = wcstolower(std::move(wcpattern));
}
@@ -891,7 +883,7 @@ class wildcard_matcher_t final : public string_matcher_t {
~wildcard_matcher_t() override = default;
bool report_matches(const wcstring &arg) override {
bool report_matches(const wcstring &arg, io_streams_t &streams) override {
// Note: --all is a no-op for glob matching since the pattern is always matched
// against the entire argument.
bool match;
@@ -915,141 +907,53 @@ class wildcard_matcher_t final : public string_matcher_t {
}
return true;
}
bool is_valid() const override { return true; }
};
static wcstring pcre2_strerror(int err_code) {
wchar_t buf[128];
pcre2_get_error_message(err_code, reinterpret_cast<PCRE2_UCHAR *>(buf),
sizeof(buf) / sizeof(wchar_t));
return buf;
// Compile a regex, printing an error on failure.
static maybe_t<re::regex_t> try_compile_regex(const wcstring &pattern, const options_t &opts,
const wchar_t *cmd, io_streams_t &streams) {
re::re_error_t error{};
re::flags_t flags{};
flags.icase = opts.ignore_case;
auto re = re::regex_t::try_compile(pattern, flags, &error);
if (!re) {
string_error(streams, _(L"%ls: Regular expression compile error: %ls\n"), cmd,
error.message().c_str());
string_error(streams, L"%ls: %ls\n", cmd, pattern.c_str());
string_error(streams, L"%ls: %*ls\n", cmd, static_cast<int>(error.offset), L"^");
}
return re;
}
struct compiled_regex_t : noncopyable_t {
pcre2_code *code{nullptr};
pcre2_match_data *match{nullptr};
// The list of named capture groups.
wcstring_list_t capture_group_names;
compiled_regex_t(const wchar_t *argv0, const wcstring &pattern, bool ignore_case,
io_streams_t &streams) {
// Disable some sequences that can lead to security problems.
uint32_t options = PCRE2_NEVER_UTF;
#if PCRE2_CODE_UNIT_WIDTH < 32
options |= PCRE2_NEVER_BACKSLASH_C;
#endif
int err_code = 0;
PCRE2_SIZE err_offset = 0;
code = pcre2_compile(PCRE2_SPTR(pattern.c_str()), pattern.length(),
options | (ignore_case ? PCRE2_CASELESS : 0), &err_code, &err_offset,
nullptr);
if (code == nullptr) {
string_error(streams, _(L"%ls: Regular expression compile error: %ls\n"), argv0,
pcre2_strerror(err_code).c_str());
string_error(streams, L"%ls: %ls\n", argv0, pattern.c_str());
string_error(streams, L"%ls: %*ls\n", argv0, static_cast<int>(err_offset), L"^");
return;
/// Check if a list of capture group names is valid for variables. If any are invalid then report an
/// error to \p streams. \return true if all names are valid.
static bool validate_capture_group_names(const wcstring_list_t &capture_group_names,
io_streams_t &streams) {
for (const wcstring &name : capture_group_names) {
if (env_var_t::flags_for(name.c_str()) & env_var_t::flag_read_only) {
streams.err.append_format(
L"Modification of read-only variable \"%ls\" is not allowed\n", name.c_str());
return false;
}
this->capture_group_names = get_capture_group_names(code);
if (!validate_capture_group_names(streams)) {
return;
}
match = pcre2_match_data_create_from_pattern(code, nullptr);
assert(match);
this->valid_ = true;
}
/// \return the list of capture group names from \p code.
static wcstring_list_t get_capture_group_names(const pcre2_code *code) {
PCRE2_SPTR name_table;
uint32_t name_entry_size;
uint32_t name_count;
pcre2_pattern_info(code, PCRE2_INFO_NAMETABLE, &name_table);
pcre2_pattern_info(code, PCRE2_INFO_NAMEENTRYSIZE, &name_entry_size);
pcre2_pattern_info(code, PCRE2_INFO_NAMECOUNT, &name_count);
struct name_table_entry_t {
#if PCRE2_CODE_UNIT_WIDTH == 8
uint8_t match_index_msb;
uint8_t match_index_lsb;
#if CHAR_BIT == PCRE2_CODE_UNIT_WIDTH
char name[];
#else
char8_t name[];
#endif
#elif PCRE2_CODE_UNIT_WIDTH == 16
uint16_t match_index;
#if WCHAR_T_BITS == PCRE2_CODE_UNIT_WIDTH
wchar_t name[];
#else
char16_t name[];
#endif
#else
uint32_t match_index;
#if WCHAR_T_BITS == PCRE2_CODE_UNIT_WIDTH
wchar_t name[];
#else
char32_t name[];
#endif // WCHAR_T_BITS
#endif // PCRE2_CODE_UNIT_WIDTH
};
const auto *names = reinterpret_cast<const name_table_entry_t *>(name_table);
wcstring_list_t result;
result.reserve(name_count);
for (uint32_t i = 0; i < name_count; ++i) {
const auto &name_entry = names[i * name_entry_size];
result.emplace_back(name_entry.name);
}
return result;
}
/// Check if our capture group names are valid. If any are invalid then report an error to \p
/// streams. \return true if all names are valid.
bool validate_capture_group_names(io_streams_t &streams) {
for (const wcstring &name : this->capture_group_names) {
if (env_var_t::flags_for(name.c_str()) & env_var_t::flag_read_only) {
// Modification of read-only variables is not allowed
streams.err.append_format(
L"Modification of read-only variable \"%ls\" is not allowed\n", name.c_str());
return false;
}
}
return true;
}
~compiled_regex_t() {
pcre2_match_data_free(match);
pcre2_code_free(code);
}
bool is_valid() const { return this->valid_; }
private:
bool valid_{false};
};
return true;
}
class pcre2_matcher_t final : public string_matcher_t {
const wchar_t *argv0;
compiled_regex_t regex;
parser_t &parser;
using regex_t = re::regex_t;
using match_range_t = re::match_range_t;
regex_t regex;
env_stack_t &vars;
bool imported_vars = false;
enum class match_result_t {
pcre2_error = -1,
no_match = 0,
match = 1,
};
match_result_t report_match(const wcstring &arg, int pcre2_rc) {
if (pcre2_rc == PCRE2_ERROR_NOMATCH) {
match_result_t report_match(const wcstring &arg, const re::match_data_t &md,
maybe_t<match_range_t> mrange, io_streams_t &streams) const {
if (!mrange.has_value()) {
if (opts.invert_match && !opts.quiet) {
if (opts.index) {
streams.out.append_format(L"1 %lu\n", arg.length());
@@ -1060,14 +964,6 @@ class pcre2_matcher_t final : public string_matcher_t {
}
return opts.invert_match ? match_result_t::match : match_result_t::no_match;
} else if (pcre2_rc < 0) {
string_error(streams, _(L"%ls: Regular expression match error: %ls\n"), argv0,
pcre2_strerror(pcre2_rc).c_str());
return match_result_t::pcre2_error;
} else if (pcre2_rc == 0) {
// The output vector wasn't big enough. Should not happen.
string_error(streams, _(L"%ls: Regular expression internal error\n"), argv0);
return match_result_t::pcre2_error;
} else if (opts.invert_match) {
return match_result_t::no_match;
}
@@ -1077,18 +973,15 @@ class pcre2_matcher_t final : public string_matcher_t {
streams.out.push_back(L'\n');
}
PCRE2_SIZE *ovector = pcre2_get_ovector_pointer(regex.match);
// If we have groups-only, we skip the first match, which is the full one.
for (int j = (opts.entire || opts.groups_only ? 1 : 0); j < pcre2_rc; j++) {
PCRE2_SIZE begin = ovector[2 * j];
PCRE2_SIZE end = ovector[2 * j + 1];
if (begin != PCRE2_UNSET && end != PCRE2_UNSET && !opts.quiet) {
size_t group_count = md.matched_capture_group_count();
for (size_t j = (opts.entire || opts.groups_only ? 1 : 0); j < group_count; j++) {
maybe_t<match_range_t> cg = this->regex.group(md, j);
if (cg.has_value() && !opts.quiet) {
if (opts.index) {
streams.out.append_format(L"%lu %lu", (begin + 1), (end - begin));
} else if (end > begin) {
// May have end < begin if \K is used.
streams.out.append(arg.substr(begin, end - begin));
streams.out.append_format(L"%lu %lu", cg->begin + 1, cg->end - cg->begin);
} else {
streams.out.append(arg.substr(cg->begin, cg->end - cg->begin));
}
streams.out.push_back(L'\n');
}
@@ -1099,62 +992,37 @@ class pcre2_matcher_t final : public string_matcher_t {
class regex_importer_t {
private:
// map from group name to its matched substrings.
std::map<wcstring, wcstring_list_t> matches_;
env_stack_t &vars_;
const wcstring &haystack_;
const compiled_regex_t &regex_;
const re::regex_t &regex_;
const bool all_flag_;
bool do_import_{false};
public:
regex_importer_t(env_stack_t &vars, const wcstring &haystack, const compiled_regex_t &regex,
bool all_flag)
: vars_(vars), haystack_(haystack), regex_(regex), all_flag_(all_flag) {
for (const wcstring &name : regex_.capture_group_names) {
regex_importer_t(env_stack_t &vars, const re::regex_t &regex, bool all_flag)
: vars_(vars), regex_(regex), all_flag_(all_flag) {
for (const wcstring &name : regex_.capture_group_names()) {
matches_.emplace(name, wcstring_list_t{});
}
}
/// This member function should be called each time a match is found
void import_vars() {
void import_vars(const re::match_data_t &md, const wcstring &subject) {
do_import_ = true;
PCRE2_SIZE *ovector = pcre2_get_ovector_pointer(regex_.match);
for (auto &kv : matches_) {
const auto &name = kv.first;
const wcstring &name = kv.first;
wcstring_list_t &vals = kv.second;
// A named group may actually correspond to multiple group numbers, each of which
// might have to be enumerated.
PCRE2_SPTR first = nullptr;
PCRE2_SPTR last = nullptr;
int entry_size = pcre2_substring_nametable_scan(
regex_.code, (PCRE2_SPTR)(name.c_str()), &first, &last);
if (entry_size <= 0) {
FLOGF(warning, L"PCRE2 failure retrieving named matches");
continue;
}
bool value_found = false;
for (const auto *group_ptr = first; group_ptr <= last; group_ptr += entry_size) {
int group_num = group_ptr[0];
PCRE2_SIZE *capture = ovector + (2 * group_num);
PCRE2_SIZE begin = capture[0];
PCRE2_SIZE end = capture[1];
if (begin != PCRE2_UNSET && end != PCRE2_UNSET && end >= begin) {
vals.push_back(haystack_.substr(begin, end - begin));
value_found = true;
break;
}
}
// If there are multiple named groups and --all was used, we need to ensure that the
// indexes are always in sync between the variables. If an optional named group
// didn't match but its brethren did, we need to make sure to put *something* in the
// resulting array, and unfortunately fish doesn't support empty/null members so
// we're going to have to use an empty string as the sentinel value.
if (!value_found && all_flag_) {
// If there are multiple named groups and --all was used, we need to ensure that
// the indexes are always in sync between the variables. If an optional named
// group didn't match but its brethren did, we need to make sure to put
// *something* in the resulting array, and unfortunately fish doesn't support
// empty/null members so we're going to have to use an empty string as the
// sentinel value.
if (maybe_t<wcstring> capture = regex_.substring_for_group(md, name, subject)) {
vals.push_back(capture.acquire());
} else if (all_flag_) {
vals.emplace_back();
}
}
@@ -1171,37 +1039,26 @@ class pcre2_matcher_t final : public string_matcher_t {
};
public:
pcre2_matcher_t(const wchar_t *argv0_, const wcstring &pattern, const options_t &opts,
io_streams_t &streams, parser_t &parser_)
: string_matcher_t(opts, streams),
argv0(argv0_),
regex(argv0_, pattern, opts.ignore_case, streams),
parser(parser_) {}
pcre2_matcher_t(regex_t regex, const options_t &opts, env_stack_t &vars)
: string_matcher_t(opts), regex(std::move(regex)), vars(vars) {}
~pcre2_matcher_t() override = default;
bool report_matches(const wcstring &arg) override {
// A return value of true means all is well (even if no matches were found), false indicates
// an unrecoverable error.
assert(regex.code && "report_matches should only be called if the regex was valid");
bool report_matches(const wcstring &arg, io_streams_t &streams) override {
using namespace re;
regex_importer_t var_importer(vars, this->regex, opts.all);
regex_importer_t var_importer(this->parser.vars(), arg, this->regex, opts.all);
// See pcre2demo.c for an explanation of this logic.
PCRE2_SIZE arglen = arg.length();
auto rc = report_match(arg, pcre2_match(regex.code, PCRE2_SPTR(arg.c_str()), arglen, 0, 0,
regex.match, nullptr));
match_data_t md = this->regex.prepare();
auto rc = report_match(arg, md, this->regex.match(md, arg), streams);
// We only import variables for the *first matching argument*
bool do_var_import = (rc == match_result_t::match && !imported_vars);
if (do_var_import) {
var_importer.import_vars();
var_importer.import_vars(md, arg);
imported_vars = true;
}
switch (rc) {
case match_result_t::pcre2_error:
return false;
case match_result_t::no_match:
return true;
case match_result_t::match:
@@ -1211,31 +1068,12 @@ class pcre2_matcher_t final : public string_matcher_t {
if (opts.invert_match) return true;
// Report any additional matches.
for (auto *ovector = pcre2_get_ovector_pointer(regex.match); opts.all; total_matched++) {
uint32_t options = 0;
PCRE2_SIZE offset = ovector[1]; // start at end of previous match
if (ovector[0] == ovector[1]) {
if (ovector[0] == arglen) break;
options = PCRE2_NOTEMPTY_ATSTART | PCRE2_ANCHORED;
}
rc = report_match(arg, pcre2_match(regex.code, PCRE2_SPTR(arg.c_str()), arglen, offset,
options, regex.match, nullptr));
if (rc == match_result_t::pcre2_error) {
// This shouldn't happen as we've already validated the regex above
return false;
}
// Call import_vars() before modifying the ovector
if (rc == match_result_t::match && do_var_import) {
var_importer.import_vars();
}
if (rc == match_result_t::no_match) {
if (options == 0 /* all matches found now */) break;
ovector[1] = offset + 1;
if (opts.all) {
while (auto mr = this->regex.match(md, arg)) {
auto rc = this->report_match(arg, md, mr, streams);
if (rc == match_result_t::match && do_var_import) {
var_importer.import_vars(md, arg);
}
}
}
return true;
@@ -1244,12 +1082,10 @@ class pcre2_matcher_t final : public string_matcher_t {
/// Override to clear our capture variables if we had no match.
void clear_capture_vars() override {
assert(!imported_vars && "Should not already have imported variables");
for (const wcstring &name : regex.capture_group_names) {
parser.vars().set_empty(name, ENV_DEFAULT);
for (const wcstring &name : regex.capture_group_names()) {
vars.set_empty(name, ENV_DEFAULT);
}
}
bool is_valid() const override { return regex.is_valid(); }
};
} // namespace
@@ -1289,19 +1125,24 @@ static int string_match(parser_t &parser, io_streams_t &streams, int argc, const
}
std::unique_ptr<string_matcher_t> matcher;
if (opts.regex) {
matcher = make_unique<pcre2_matcher_t>(cmd, pattern, opts, streams, parser);
if (!opts.regex) {
// Globs cannot fail.
matcher = make_unique<wildcard_matcher_t>(pattern, opts);
} else {
matcher = make_unique<wildcard_matcher_t>(cmd, pattern, opts, streams);
}
if (!matcher->is_valid()) {
// An error will have been printed by the constructor.
return STATUS_INVALID_ARGS;
// Compile the pattern as regex and validate capture group names as variables; both may
// fail. Note both try_compile_regex and validate_capture_group_names print an error on
// failure.
auto re = try_compile_regex(pattern, opts, cmd, streams);
if (!re || !validate_capture_group_names(re->capture_group_names(), streams)) {
return STATUS_INVALID_ARGS;
}
matcher = make_unique<pcre2_matcher_t>(re.acquire(), opts, parser.vars());
}
assert(matcher && "Should have a matcher");
arg_iterator_t aiter(argv, optind, streams);
while (const wcstring *arg = aiter.nextstr()) {
if (!matcher->report_matches(*arg)) {
if (!matcher->report_matches(*arg, streams)) {
return STATUS_INVALID_ARGS;
}
if (opts.quiet && matcher->match_count() > 0) return STATUS_CMD_OK;
@@ -1428,14 +1269,13 @@ static maybe_t<wcstring> interpret_escapes(const wcstring &arg) {
}
class regex_replacer_t : public string_replacer_t {
compiled_regex_t regex;
re::regex_t regex;
maybe_t<wcstring> replacement;
public:
regex_replacer_t(const wchar_t *argv0, const wcstring &pattern, const wcstring &replacement_,
regex_replacer_t(const wchar_t *argv0, re::regex_t regex, const wcstring &replacement_,
const options_t &opts, io_streams_t &streams)
: string_replacer_t(argv0, opts, streams),
regex(argv0, pattern, opts.ignore_case, streams) {
: string_replacer_t(argv0, opts, streams), regex(std::move(regex)) {
if (feature_test(features_t::string_replace_backslash)) {
replacement = replacement_;
} else {
@@ -1486,62 +1326,32 @@ bool literal_replacer_t::replace_matches(const wcstring &arg, bool want_newline)
/// A return value of true means all is well (even if no replacements were performed), false
/// indicates an unrecoverable error.
bool regex_replacer_t::replace_matches(const wcstring &arg, bool want_newline) {
if (!regex.code) return false; // pcre2_compile() failed
using namespace re;
if (!replacement) return false; // replacement was an invalid string
// clang-format off
// SUBSTITUTE_OVERFLOW_LENGTH causes pcre to return the needed buffer length if the passed one is to small
// SUBSTITUTE_EXTENDED changes how substitution expressions are interpreted (`$` as the special character)
// SUBSTITUTE_UNSET_EMPTY treats unmatched capturing groups as empty instead of erroring.
// SUBSTITUTE_GLOBAL means more than one substitution happens.
// clang-format on
uint32_t options = PCRE2_SUBSTITUTE_OVERFLOW_LENGTH | PCRE2_SUBSTITUTE_EXTENDED |
PCRE2_SUBSTITUTE_UNSET_EMPTY | (opts.all ? PCRE2_SUBSTITUTE_GLOBAL : 0);
size_t arglen = arg.length();
PCRE2_SIZE bufsize = (arglen == 0) ? 16 : 2 * arglen;
auto output = static_cast<wchar_t *>(malloc(sizeof(wchar_t) * bufsize));
int pcre2_rc;
PCRE2_SIZE outlen = bufsize;
sub_flags_t sflags{};
sflags.global = opts.all;
sflags.extended = true;
bool done = false;
while (!done) {
assert(output);
re_error_t error{};
int repl_count{};
maybe_t<wcstring> result =
this->regex.substitute(arg, *replacement, sflags, 0, &error, &repl_count);
pcre2_rc = pcre2_substitute(regex.code, PCRE2_SPTR(arg.c_str()), arglen,
0, // start offset
options, regex.match,
nullptr, // match_data
PCRE2_SPTR(replacement->c_str()), replacement->length(),
reinterpret_cast<PCRE2_UCHAR *>(output), &outlen);
if (pcre2_rc != PCRE2_ERROR_NOMEMORY || bufsize >= outlen) {
done = true;
} else {
bufsize = outlen;
auto new_output = static_cast<wchar_t *>(realloc(output, sizeof(wchar_t) * bufsize));
if (new_output) output = new_output;
}
}
bool rc = true;
if (pcre2_rc < 0) {
if (!result) {
string_error(streams, _(L"%ls: Regular expression substitute error: %ls\n"), argv0,
pcre2_strerror(pcre2_rc).c_str());
rc = false;
error.message().c_str());
} else {
wcstring outstr(output, outlen);
bool replacement_occurred = pcre2_rc > 0;
bool replacement_occurred = repl_count > 0;
if (!opts.quiet && (!opts.filter || replacement_occurred)) {
streams.out.append(outstr);
streams.out.append(*result);
if (want_newline) {
streams.out.append(L'\n');
}
}
total_replaced += pcre2_rc;
total_replaced += repl_count;
}
free(output);
return rc;
return result.has_value();
}
static int string_replace(parser_t &parser, io_streams_t &streams, int argc, const wchar_t **argv) {
@@ -1560,7 +1370,13 @@ static int string_replace(parser_t &parser, io_streams_t &streams, int argc, con
std::unique_ptr<string_replacer_t> replacer;
if (opts.regex) {
replacer = make_unique<regex_replacer_t>(argv[0], pattern, replacement, opts, streams);
if (auto re = try_compile_regex(pattern, opts, argv[0], streams)) {
replacer =
make_unique<regex_replacer_t>(argv[0], re.acquire(), replacement, opts, streams);
} else {
// try_compile_regex prints an error.
return STATUS_INVALID_ARGS;
}
} else {
replacer = make_unique<literal_replacer_t>(argv[0], pattern, replacement, opts, streams);
}