diff --git a/CMakeLists.txt b/CMakeLists.txt index 2e90030bb..02ad17654 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -110,7 +110,7 @@ set(FISH_SRCS src/null_terminated_array.cpp src/operation_context.cpp src/output.cpp src/pager.cpp src/parse_execution.cpp src/parse_tree.cpp src/parse_util.cpp src/parser.cpp src/parser_keywords.cpp src/path.cpp src/postfork.cpp - src/proc.cpp src/reader.cpp src/redirection.cpp src/screen.cpp + src/proc.cpp src/re.cpp src/reader.cpp src/redirection.cpp src/screen.cpp src/signal.cpp src/termsize.cpp src/timer.cpp src/tinyexpr.cpp src/tokenizer.cpp src/topic_monitor.cpp src/trace.cpp src/utf8.cpp src/util.cpp src/wait_handle.cpp src/wcstringutil.cpp src/wgetopt.cpp src/wildcard.cpp diff --git a/src/fish_tests.cpp b/src/fish_tests.cpp index 96be9af0e..8cd0c69b4 100644 --- a/src/fish_tests.cpp +++ b/src/fish_tests.cpp @@ -71,6 +71,7 @@ #include "parser.h" #include "path.h" #include "proc.h" +#include "re.h" #include "reader.h" #include "redirection.h" #include "screen.h" @@ -6682,6 +6683,170 @@ static void test_killring() { do_test((kill_entries() == wcstring_list_t{L"a", L"c", L"b", L"d"})); } +namespace { +using namespace re; + +// Basic tests for re, which wraps PCRE2. +static void test_re_errs() { + say(L"Testing re"); + flags_t flags{}; + re_error_t error{}; + maybe_t re; + do_test(!regex_t::try_compile(L"abc[", flags, &error)); + do_test(error.code != 0); + do_test(!error.message().empty()); + + error = re_error_t{}; + do_test(!regex_t::try_compile(L"abc(", flags, &error).has_value()); + do_test(error.code != 0); + do_test(!error.message().empty()); +} + +static void test_re_basic() { + // Match a character twice. 
+ using namespace re; + wcstring subject = L"AAbCCd11e"; + auto substr_from_range = [&](maybe_t r) { + do_test(r.has_value()); + do_test(r->begin <= r->end); + do_test(r->end <= subject.size()); + return subject.substr(r->begin, r->end - r->begin); + }; + auto re = regex_t::try_compile(L"(.)\\1"); + do_test(re.has_value()); + auto md = re->prepare(); + wcstring_list_t matches; + wcstring_list_t captures; + while (auto r = re->match(md, subject)) { + matches.push_back(substr_from_range(r)); + captures.push_back(substr_from_range(re->group(md, 1))); + do_test(!re->group(md, 2)); + } + do_test(join_strings(matches, L',') == L"AA,CC,11"); + do_test(join_strings(captures, L',') == L"A,C,1"); +} + +static void test_re_reset() { + using namespace re; + auto re = regex_t::try_compile(L"([0-9])"); + wcstring s = L"012345"; + auto md = re->prepare(); + for (size_t idx = 0; idx < s.size(); idx++) { + md.reset(); + for (size_t j = 0; j <= idx; j++) { + auto m = re->match(md, s); + match_range_t expected{j, j + 1}; + do_test(m == expected); + do_test(re->group(md, 1) == expected); + } + } +} + +static void test_re_named() { + // Named capture groups. 
+ using namespace re; + auto re = regex_t::try_compile(L"A(?x+)?"); + do_test(re->capture_group_count() == 1); + + wcstring subject = L"AxxAAx"; + auto md = re->prepare(); + + auto r = re->match(md, subject); + do_test((r == match_range_t{0, 3})); + do_test(re->substring_for_group(md, L"QQQ", subject) == none()); + do_test(re->substring_for_group(md, L"FOO", subject) == L"xx"); + + r = re->match(md, subject); + do_test((r == match_range_t{3, 4})); + do_test(re->substring_for_group(md, L"QQQ", subject) == none()); + do_test(re->substring_for_group(md, L"FOO", subject) == none()); + + r = re->match(md, subject); + do_test((r == match_range_t{4, 6})); + do_test(re->substring_for_group(md, L"QQQ", subject) == none()); + do_test(re->substring_for_group(md, L"FOO", subject) == wcstring(L"x")); +} + +static void test_re_name_extraction() { + // Names of capture groups can be extracted. + using namespace re; + auto re = regex_t::try_compile(L"(?dd)ff(?cc)aaa(?)ff(?)"); + do_test(re.has_value()); + do_test(re->capture_group_count() == 4); + // PCRE2 returns these sorted. + do_test(join_strings(re->capture_group_names(), L',') == L"BAR,BETA,FOO,alpha"); + + // Mixed named and positional captures. 
+ re = regex_t::try_compile(L"(abc)(?def)(ghi)(?jkl)"); + do_test(re.has_value()); + do_test(re->capture_group_count() == 4); + do_test(join_strings(re->capture_group_names(), L',') == L"BAR,FOO"); + auto md = re->prepare(); + const wcstring subject = L"abcdefghijkl"; + auto m = re->match(md, subject); + do_test((m == match_range_t{0, 12})); + do_test((re->group(md, 1) == match_range_t{0, 3})); + do_test((re->group(md, 2) == match_range_t{3, 6})); + do_test((re->group(md, 3) == match_range_t{6, 9})); + do_test((re->group(md, 4) == match_range_t{9, 12})); + do_test(re->substring_for_group(md, L"FOO", subject) == wcstring(L"def")); + do_test(re->substring_for_group(md, L"BAR", subject) == wcstring(L"jkl")); +} + +static void test_re_substitute() { + // Substitution honors the start index and the global/literal/extended flags, and reports errors. + using namespace re; + auto re = regex_t::try_compile(L"[a-z]+(\\d+)"); + do_test(re.has_value()); + do_test(re->capture_group_count() == 1); + maybe_t res{}; + int repl_count{}; + sub_flags_t sflags{}; + const wcstring subj = L"AAabc123ZZ AAabc123ZZ"; + const wcstring repl = L"$1qqq"; + res = re->substitute(subj, repl, sflags, 0, nullptr, &repl_count); + do_test(res && *res == L"AA123qqqZZ AAabc123ZZ"); + do_test(repl_count == 1); + + res = re->substitute(subj, repl, sflags, 5, nullptr, &repl_count); + do_test(res && *res == L"AAabc123ZZ AA123qqqZZ"); + do_test(repl_count == 1); + + sflags.global = true; + res = re->substitute(subj, repl, sflags, 0, nullptr, &repl_count); + do_test(res && *res == L"AA123qqqZZ AA123qqqZZ"); + do_test(repl_count == 2); + + sflags.literal = true; + res = re->substitute(subj, repl, sflags, 0, nullptr, &repl_count); + do_test(res && *res == L"AA$1qqqZZ AA$1qqqZZ"); + do_test(repl_count == 2); + + sflags.literal = false; + sflags.extended = true; + res = re->substitute(subj, L"\\x21", sflags, 0, nullptr, &repl_count); // \x21 = ! + do_test(res && *res == L"AA!ZZ AA!ZZ"); + do_test(repl_count == 2); + + // Test with a bad escape; \b is unsupported. 
+ re_error_t error{}; + res = re->substitute(subj, L"AAA\\bZZZ", sflags, 0, &error); + do_test(!res.has_value()); + do_test(error.code == -57 /* PCRE2_ERROR_BADREPESCAPE */); + do_test(error.message() == L"bad escape sequence in replacement string"); + do_test(error.offset == 5 /* the b */); + + // Test a result much longer than the fixed-size stack buffer which substitute() tries first. + sflags = sub_flags_t{}; + sflags.global = true; + re = regex_t::try_compile(L"A"); + res = + re->substitute(wcstring(4096, L'A'), wcstring(4096, L'X'), sflags, 0, nullptr, &repl_count); + do_test(res && *res == wcstring(4096 * 4096, L'X')); + do_test(repl_count == 4096); +} +} // namespace + struct termsize_tester_t { static void test(); }; @@ -6860,6 +7025,12 @@ static const test_t s_tests[]{ {TEST_GROUP("timer_format"), test_timer_format}, {TEST_GROUP("termsize"), termsize_tester_t::test}, {TEST_GROUP("killring"), test_killring}, + {TEST_GROUP("re"), test_re_errs}, + {TEST_GROUP("re"), test_re_basic}, + {TEST_GROUP("re"), test_re_reset}, + {TEST_GROUP("re"), test_re_named}, + {TEST_GROUP("re"), test_re_name_extraction}, + {TEST_GROUP("re"), test_re_substitute}, }; void list_tests() { diff --git a/src/re.cpp b/src/re.cpp new file mode 100644 index 000000000..1064fb70f --- /dev/null +++ b/src/re.cpp @@ -0,0 +1,288 @@ +#include "config.h" // IWYU pragma: keep + +#include "re.h" + +#include "flog.h" + +#define PCRE2_CODE_UNIT_WIDTH WCHAR_T_BITS +#ifdef _WIN32 +#define PCRE2_STATIC +#endif + +#include "pcre2.h" + +using namespace re; +using namespace re::adapters; + +void bytecode_deleter_t::operator()(const void *ptr) { + if (ptr) { + pcre2_code_free(static_cast(const_cast(ptr))); + } +} + +void match_data_deleter_t::operator()(void *ptr) { + if (ptr) { + pcre2_match_data_free(static_cast(ptr)); + } +} + +// Get underlying pcre2_code from a bytecode_ptr_t. 
+const pcre2_code *get_code(const bytecode_ptr_t &ptr) { + assert(ptr && "Null pointer"); + return static_cast(ptr.get()); +} + +// Get underlying pcre2_match_data from a match_data_ptr_t. +pcre2_match_data *get_md(const match_data_ptr_t &ptr) { + assert(ptr && "Null pointer"); + return static_cast(ptr.get()); +} + +// Convert a wcstring to a PCRE2_SPTR. +PCRE2_SPTR to_sptr(const wcstring &str) { return reinterpret_cast(str.c_str()); } + +/// \return a message for an error code. +static wcstring message_for_code(error_code_t code) { + wchar_t buf[128] = {}; + pcre2_get_error_message(code, reinterpret_cast(buf), + sizeof(buf) / sizeof(wchar_t)); + return buf; +} + +maybe_t regex_t::try_compile(const wcstring &pattern, const flags_t &flags, + re_error_t *error) { + // Disable some sequences that can lead to security problems. + uint32_t options = PCRE2_NEVER_UTF; +#if PCRE2_CODE_UNIT_WIDTH < 32 + options |= PCRE2_NEVER_BACKSLASH_C; +#endif + if (flags.icase) options |= PCRE2_CASELESS; + + error_code_t err_code = 0; + PCRE2_SIZE err_offset = 0; + pcre2_code *code = + pcre2_compile(to_sptr(pattern), pattern.size(), options, &err_code, &err_offset, nullptr); + if (!code) { + if (error) { + error->code = err_code; + error->offset = err_offset; + } + return none(); + } + return regex_t{bytecode_ptr_t(code)}; +} + +match_data_t regex_t::prepare() const { + pcre2_match_data *md = pcre2_match_data_create_from_pattern(get_code(code_), nullptr); + // Bogus assertion for memory exhaustion. + if (unlikely(!md)) { + DIE("Out of memory"); + } + return match_data_t{match_data_ptr_t(static_cast(md))}; +} + +void match_data_t::reset() { + start_offset = 0; + max_capture = 0; + last_empty = false; +} + +maybe_t regex_t::match(match_data_t &md, const wcstring &subject) const { + pcre2_match_data *const match_data = get_md(md.data); + assert(match_data && "Invalid match data"); + + // Handle exhausted matches. 
+ if (md.start_offset > subject.size() || (md.last_empty && md.start_offset == subject.size())) { + md.max_capture = 0; + return none(); + } + PCRE2_SIZE start_offset = md.start_offset; + + // See pcre2demo.c for an explanation of this logic. + uint32_t options = md.last_empty ? PCRE2_NOTEMPTY_ATSTART | PCRE2_ANCHORED : 0; + error_code_t code = pcre2_match(get_code(code_), to_sptr(subject), subject.size(), start_offset, + options, match_data, nullptr); + if (code == PCRE2_ERROR_NOMATCH && !md.last_empty) { + // Failed to match. + md.start_offset = subject.size(); + md.max_capture = 0; + return none(); + } else if (code == PCRE2_ERROR_NOMATCH && md.last_empty) { + // Failed to find a non-empty-string match at a point where there was a previous + // empty-string match. Advance by one character and try again. + md.start_offset += 1; + md.last_empty = false; + return this->match(md, subject); + } else if (code < 0) { + FLOG(error, "pcre2_match unexpected error:", message_for_code(code)); + return none(); + } + + // Match succeeded. + // Start at end of previous match, marking if it was empty. + const auto *ovector = pcre2_get_ovector_pointer(match_data); + md.start_offset = ovector[1]; + md.max_capture = static_cast(code); + md.last_empty = ovector[0] == ovector[1]; + return match_range_t{ovector[0], ovector[1]}; +} + +maybe_t regex_t::group(const match_data_t &md, size_t group_idx) const { + if (group_idx >= md.max_capture || group_idx >= pcre2_get_ovector_count(get_md(md.data))) { + return none(); + } + + const PCRE2_SIZE *ovector = pcre2_get_ovector_pointer(get_md(md.data)); + PCRE2_SIZE start = ovector[2 * group_idx]; + PCRE2_SIZE end = ovector[2 * group_idx + 1]; + if (start == PCRE2_UNSET || end == PCRE2_UNSET) { + return none(); + } + // From PCRE2 docs: "Note that when a pattern such as (?=ab\K) matches, the reported start of + // the match can be greater than the end of the match." + // Saturate the end. 
+ end = std::max(start, end); + return match_range_t{start, end}; +} + +maybe_t regex_t::group(const match_data_t &match_data, const wcstring &name) const { + const auto *pcname = to_sptr(name); + // Beware, pcre2_substring_copy_byname and pcre2_substring_copy_bynumber both have a bug + // on at least one Ubuntu (running PCRE2) where it outputs garbage for the first character. + // Read out from the ovector directly. + int num = pcre2_substring_number_from_name(get_code(code_), pcname); + if (num <= 0) { + return none(); + } + return this->group(match_data, static_cast(num)); +} + +static maybe_t range_to_substr(const wcstring &subject, maybe_t range) { + if (!range) { + return none(); + } + assert(range->begin <= range->end && range->end <= subject.size() && "Invalid range"); + return subject.substr(range->begin, range->end - range->begin); +} + +maybe_t regex_t::substring_for_group(const match_data_t &md, size_t group_idx, + const wcstring &subject) const { + return range_to_substr(subject, this->group(md, group_idx)); +} + +maybe_t regex_t::substring_for_group(const match_data_t &md, const wcstring &name, + const wcstring &subject) const { + return range_to_substr(subject, this->group(md, name)); +} + +size_t regex_t::capture_group_count() const { + uint32_t count{}; + pcre2_pattern_info(get_code(code_), PCRE2_INFO_CAPTURECOUNT, &count); + return count; +} + +wcstring_list_t regex_t::capture_group_names() const { + PCRE2_SPTR name_table{}; + uint32_t name_entry_size{}; + uint32_t name_count{}; + + const auto *code = get_code(code_); + pcre2_pattern_info(code, PCRE2_INFO_NAMETABLE, &name_table); + pcre2_pattern_info(code, PCRE2_INFO_NAMEENTRYSIZE, &name_entry_size); + pcre2_pattern_info(code, PCRE2_INFO_NAMECOUNT, &name_count); + + struct name_table_entry_t { +#if PCRE2_CODE_UNIT_WIDTH == 8 + uint8_t match_index_msb; + uint8_t match_index_lsb; +#if CHAR_BIT == PCRE2_CODE_UNIT_WIDTH + char name[]; +#else + char8_t name[]; +#endif +#elif PCRE2_CODE_UNIT_WIDTH == 16 
+ uint16_t match_index; +#if WCHAR_T_BITS == PCRE2_CODE_UNIT_WIDTH + wchar_t name[]; +#else + char16_t name[]; +#endif +#else + uint32_t match_index; +#if WCHAR_T_BITS == PCRE2_CODE_UNIT_WIDTH + wchar_t name[]; +#else + char32_t name[]; +#endif // WCHAR_T_BITS +#endif // PCRE2_CODE_UNIT_WIDTH + }; + + const auto *names = reinterpret_cast(name_table); + wcstring_list_t result; + result.reserve(name_count); + for (uint32_t i = 0; i < name_count; ++i) { + const auto &name_entry = names[i * name_entry_size]; + result.emplace_back(name_entry.name); + } + return result; +} + +maybe_t regex_t::substitute(const wcstring &subject, const wcstring &replacement, + sub_flags_t flags, size_t start_idx, re_error_t *out_error, + int *out_repl_count) const { + constexpr size_t stack_bufflen = 256; + wchar_t buffer[stack_bufflen]; + + // SUBSTITUTE_GLOBAL means more than one substitution happens. + uint32_t options = PCRE2_SUBSTITUTE_UNSET_EMPTY // don't error on unmatched + | PCRE2_SUBSTITUTE_OVERFLOW_LENGTH // return required length on overflow + | (flags.global ? PCRE2_SUBSTITUTE_GLOBAL : 0) // replace multiple + | (flags.literal ? PCRE2_SUBSTITUTE_LITERAL : 0) // take $1, etc. literally + | (flags.extended ? PCRE2_SUBSTITUTE_EXTENDED : 0) // backslash escapes + ; + size_t bufflen = stack_bufflen; + error_code_t rc = + pcre2_substitute(get_code(code_), to_sptr(subject), subject.size(), start_idx, options, + nullptr /* match_data */, nullptr /* context */, to_sptr(replacement), + replacement.size(), reinterpret_cast(buffer), &bufflen); + + if (out_repl_count) { + *out_repl_count = std::max(rc, 0); + } + if (rc == 0) { + // No replacements. + return subject; + } else if (rc > 0) { + // Some replacement which fit in our buffer. + // Note we may have had embedded nuls. + assert(bufflen <= stack_bufflen && "bufflen should not exceed buffer size"); + return wcstring(buffer, bufflen); + } else if (rc == PCRE2_ERROR_NOMEMORY) { + // bufflen has been updated to required buffer size. 
+ // Try again with a real string. + wcstring res(bufflen, L'\0'); + rc = pcre2_substitute(get_code(code_), to_sptr(subject), subject.size(), start_idx, options, + nullptr /* match_data */, nullptr /* context */, to_sptr(replacement), + replacement.size(), reinterpret_cast(&res[0]), + &bufflen); + if (out_repl_count) { + *out_repl_count = std::max(rc, 0); + } + if (rc >= 0) { + res.resize(bufflen); + return res; + } + } + // Some error. The offset may be returned in the bufflen. + if (out_error) { + out_error->code = rc; + out_error->offset = (bufflen == PCRE2_UNSET ? 0 : bufflen); + } + return none(); +} + +regex_t::regex_t(adapters::bytecode_ptr_t &&code) : code_(std::move(code)) { + assert(code_ && "Null impl"); +} + +wcstring re_error_t::message() const { return message_for_code(this->code); } diff --git a/src/re.h b/src/re.h new file mode 100644 index 000000000..02848bcb6 --- /dev/null +++ b/src/re.h @@ -0,0 +1,146 @@ +// Wraps PCRE2. +#ifndef FISH_RE_H +#define FISH_RE_H + +#include "common.h" +#include "maybe.h" + +namespace re { + +namespace adapters { +// Adapter to store pcre2_code in unique_ptr. +struct bytecode_deleter_t { + void operator()(const void *); +}; +using bytecode_ptr_t = std::unique_ptr; + +// Adapter to store pcre2_match_data in unique_ptr. +struct match_data_deleter_t { + void operator()(void *); +}; +using match_data_ptr_t = std::unique_ptr; +} // namespace adapters + +/// Error code type alias. +using error_code_t = int; + +/// Flags for compiling a regex. +struct flags_t { + bool icase{}; // ignore case? +}; + +/// Flags for substituting a regex. +struct sub_flags_t { + bool global{}; // perform multiple substitutions? + bool literal{}; // $1 is literal, not a capture reference + bool extended{}; // apply PCRE2 extended backslash escapes? +}; + +/// A type wrapping up error information. +/// Beware, GNU defines error_t; hence we use an re_ prefix again. 
+struct re_error_t { + error_code_t code{}; // error code + size_t offset{}; // offset of the error in the pattern + + /// \return our error message. + wcstring message() const; +}; + +/// A half-open range of a subject which matched. +struct match_range_t { + size_t begin; + size_t end; + + bool operator==(match_range_t rhs) const { return begin == rhs.begin && end == rhs.end; } + bool operator!=(match_range_t rhs) const { return !(*this == rhs); } +}; + +/// A match data is the "stateful" object, storing string indices for where to start the next match, +/// capture results, etc. Create one via regex_t::prepare(). These are tied to the regex which +/// created them. +class match_data_t : noncopyable_t { + public: + match_data_t(match_data_t &&) = default; + match_data_t &operator=(match_data_t &&) = default; + ~match_data_t() = default; + + /// \return a "count" of the number of capture groups which matched. + /// This is really one more than the highest matching group. + /// 0 is considered a "group" for the entire match, so this will always return at least 1 for a + /// successful match. + size_t matched_capture_group_count() const { return max_capture; } + + /// Reset this data, as if this were freshly issued by a call to prepare(). + void reset(); + + private: + explicit match_data_t(adapters::match_data_ptr_t &&data) : data(std::move(data)) {} + + // Next start position. This may exceed the subject length, which indicates exhaustion. + size_t start_offset{0}; + + // One more than the highest numbered capturing pair that was set (e.g. 1 if no captures). + size_t max_capture{0}; + + // If set, the last match was empty. + bool last_empty{false}; + + // Underlying pcre2_match_data. + adapters::match_data_ptr_t data{}; + + friend class regex_t; +}; + +/// The compiled form of a PCRE2 regex. +/// This is thread safe. +class regex_t : noncopyable_t { + public: + /// Compile a pattern into a regex. \return the resulting regex, or none on error. 
+ /// If \p out_error is not null, populate it with the error information. + static maybe_t try_compile(const wcstring &pattern, const flags_t &flags = flags_t{}, + re_error_t *out_error = nullptr); + + /// Create a match data for this regex. + /// The result is tied to this regex; it should not be used for others. + match_data_t prepare() const; + + /// Match against a string \p subject, populating \p md. + /// \return a range on a successful match, none on no match. + maybe_t match(match_data_t &md, const wcstring &subject) const; + + /// \return the matched range for an indexed or named capture group. 0 means the entire match. + maybe_t group(const match_data_t &md, size_t group_idx) const; + maybe_t group(const match_data_t &md, const wcstring &name) const; + + /// \return the matched substring for a capture group. + maybe_t substring_for_group(const match_data_t &md, size_t group_idx, + const wcstring &subject) const; + maybe_t substring_for_group(const match_data_t &md, const wcstring &name, + const wcstring &subject) const; + + /// \return the number of indexed capture groups. + size_t capture_group_count() const; + + /// \return the list of capture group names. + /// Note PCRE provides these in sorted order, not specification order. + wcstring_list_t capture_group_names() const; + + /// Search \p subject for matches for this regex, starting at \p start_idx, and replacing them + /// with \p replacement. If \p out_repl_count is not null, populate it with the number of + /// replacements which occurred. This may fail for e.g. bad escapes in the replacement string. 
+ maybe_t substitute(const wcstring &subject, const wcstring &replacement, + sub_flags_t flags, size_t start_idx = 0, + re_error_t *out_error = nullptr, + int *out_repl_count = nullptr) const; + + regex_t(regex_t &&other) = default; + regex_t &operator=(regex_t &&) = default; + ~regex_t() = default; + + private: + regex_t(adapters::bytecode_ptr_t &&); + adapters::bytecode_ptr_t code_; +}; + +} // namespace re +#endif